From 8baf85e4e9355611532e361a5cd4d458bc8fe1fe Mon Sep 17 00:00:00 2001
From: Wallas Henrique <wallashss@users.noreply.github.com>
Date: Fri, 11 Oct 2024 15:18:50 -0300
Subject: [PATCH] [Doc] Compatibility matrix for mutual exclusive features
 (#8512)

Signed-off-by: Wallas Santos <wallashss@ibm.com>
---
 docs/source/index.rst                        |   1 +
 docs/source/models/performance.rst           |   2 +
 docs/source/serving/compatibility_matrix.rst | 427 +++++++++++++++++++
 vllm/attention/backends/rocm_flash_attn.py   |   2 +
 vllm/config.py                               |  10 +
 vllm/engine/arg_utils.py                     |   2 +
 vllm/engine/output_processor/multi_step.py   |   2 +
 vllm/executor/cpu_executor.py                |   8 +
 vllm/inputs/preprocess.py                    |   2 +
 vllm/spec_decode/spec_decode_worker.py       |   2 +
 vllm/utils.py                                |   3 +
 vllm/worker/multi_step_model_runner.py       |   3 +
 vllm/worker/utils.py                         |   3 +
 13 files changed, 467 insertions(+)
 create mode 100644 docs/source/serving/compatibility_matrix.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 961373eb..d20e46b4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -86,6 +86,7 @@ Documentation
    serving/usage_stats
    serving/integrations
    serving/tensorizer
+   serving/compatibility_matrix
    serving/faq
 
 .. toctree::
diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst
index d8750ddc..23b5ab79 100644
--- a/docs/source/models/performance.rst
+++ b/docs/source/models/performance.rst
@@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo
 
 You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
 
+.. _chunked-prefill:
+
 Chunked Prefill
 ---------------
 vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
new file mode 100644
index 00000000..cac0605c
--- /dev/null
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -0,0 +1,427 @@
+.. _compatibility_matrix:
+
+Compatibility Matrix
+====================
+
+The tables below show mutually exclusive features and the support on some hardware. 
+
+.. note::
+
+   Check the '✗' with links to see tracking issue for unsupported feature/hardware combination.
+
+Feature x Feature
+-----------------
+
+
+.. raw:: html
+
+    <style>
+      /* Make smaller to try to improve readability  */
+      td {
+        font-size: 0.8rem;
+        text-align: center;
+      }
+
+      th {
+        text-align: center;
+        font-size: 0.8rem;
+      }
+    </style>
+
+.. list-table::
+   :header-rows: 1
+   :widths: auto
+
+   * - Feature
+     - :ref:`CP <chunked-prefill>`
+     - :ref:`APC <apc>`
+     - :ref:`LoRA <lora>`
+     - :abbr:`prmpt adptr (Prompt Adapter)`
+     - :ref:`SD <spec_decode>`
+     - CUDA graph
+     - :abbr:`enc-dec (Encoder-Decoder Models)`
+     - :abbr:`logP (Logprobs)`
+     - :abbr:`prmpt logP (Prompt Logprobs)`
+     - :abbr:`async output (Async Output Processing)`
+     - multi-step
+     - :abbr:`MM (Multimodal)`
+     - best-of
+     - beam-search
+     - :abbr:`guided dec (Guided Decoding)`
+   * - :ref:`CP <chunked-prefill>`
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :ref:`APC <apc>`
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :ref:`LoRA <lora>`
+     - `✗ <https://github.com/vllm-project/vllm/pull/9057>`__ 
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :abbr:`prmpt adptr (Prompt Adapter)`
+     - ✅
+     - ✅
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :ref:`SD <spec_decode>`
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - CUDA graph
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :abbr:`enc-dec (Encoder-Decoder Models)`
+     - ✗
+     - `✗ <https://github.com/vllm-project/vllm/issues/7366>`__ 
+     - ✗ 
+     - ✗
+     - `✗ <https://github.com/vllm-project/vllm/issues/7366>`__ 
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :abbr:`logP (Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :abbr:`prmpt logP (Prompt Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/pull/8199>`__ 
+     - ✅
+     - ✅
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :abbr:`async output (Async Output Processing)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅ 
+     - ✗
+     - ✅
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - multi-step
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - ✗
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/issues/8198>`__ 
+     - ✅
+     - 
+     - 
+     - 
+     - 
+     - 
+   * - :abbr:`MM (Multimodal)`
+     -  `✗ <https://github.com/vllm-project/vllm/pull/8346>`__ 
+     -  `✗ <https://github.com/vllm-project/vllm/pull/8348>`__ 
+     -  `✗ <https://github.com/vllm-project/vllm/pull/7199>`__ 
+     - ?
+     - ?
+     - ✅
+     - ✗
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - 
+     - 
+     - 
+     - 
+   * - best-of
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/issues/6137>`__ 
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - `✗ <https://github.com/vllm-project/vllm/issues/7968>`__ 
+     - ✅
+     - 
+     - 
+     - 
+   * - beam-search
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/issues/6137>`__ 
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ?
+     - `✗ <https://github.com/vllm-project/vllm/issues/7968>`__ 
+     - ?
+     - ✅
+     - 
+     - 
+   * - :abbr:`guided dec (Guided Decoding)`
+     - ✅
+     - ✅
+     - ?
+     - ?
+     - ✅
+     - ✅
+     - ?
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ?
+     - ✅
+     - ✅
+     - 
+
+
+Feature x Hardware
+^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+   :header-rows: 1
+   :widths: auto
+
+   * - Feature
+     - Volta
+     - Turing
+     - Ampere
+     - Ada
+     - Hopper
+     - CPU
+     - AMD
+   * - :ref:`CP <chunked-prefill>`
+     - `✗ <https://github.com/vllm-project/vllm/issues/2729>`__ 
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗ 
+     - ✅
+   * - :ref:`APC <apc>`
+     - `✗ <https://github.com/vllm-project/vllm/issues/3687>`__ 
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+   * - :ref:`LoRA <lora>`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/pull/4830>`__ 
+     - ✅
+   * - :abbr:`prmpt adptr (Prompt Adapter)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/issues/8475>`__ 
+     - ✅
+   * - :ref:`SD <spec_decode>`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - CUDA graph
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✅
+   * - :abbr:`enc-dec (Encoder-Decoder Models)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/blob/a84e598e2125960d3b4f716b78863f24ac562947/vllm/worker/cpu_model_runner.py#L125>`__ 
+     - ✗
+   * - :abbr:`logP (Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - :abbr:`prmpt logP (Prompt Logprobs)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - :abbr:`async output (Async Output Processing)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✗
+     - ✗
+   * - multi-step
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - `✗ <https://github.com/vllm-project/vllm/issues/8477>`__ 
+     - ✅
+   * - :abbr:`MM (Multimodal)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - best-of
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - beam-search
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+   * - :abbr:`guided dec (Guided Decoding)`
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
+     - ✅
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 7456aab8..03fb9193 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -420,6 +420,8 @@ class ROCmFlashAttentionImpl(AttentionImpl):
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "
                                       "encoder/decoder cross-attention "
diff --git a/vllm/config.py b/vllm/config.py
index f964928a..b0761ae0 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -359,6 +359,8 @@ class ModelConfig:
             self.use_async_output_proc = False
             return
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if device_config.device_type not in ("cuda", "tpu"):
             logger.warning(
                 "Async output processing is only supported for CUDA or TPU. "
@@ -372,6 +374,8 @@ class ModelConfig:
             self.use_async_output_proc = False
             return
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if device_config.device_type == "cuda" and self.enforce_eager:
             logger.warning(
                 "To see benefits of async output processing, enable CUDA "
@@ -385,6 +389,8 @@ class ModelConfig:
         if self.embedding_mode:
             self.use_async_output_proc = False
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if speculative_config:
             logger.warning("Async output processing is not supported with"
                            " speculative decoding currently.")
@@ -1200,6 +1206,8 @@ class SpeculativeConfig:
                              "speculative decoding is > 1, but got "
                              f"{speculative_disable_by_batch_size=}")
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if enable_chunked_prefill:
             raise ValueError(
                 "Speculative decoding and chunked prefill are "
@@ -1561,6 +1569,8 @@ class LoRAConfig:
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if scheduler_config.chunked_prefill_enabled:
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bdfecabf..1b132cf7 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1000,6 +1000,8 @@ class EngineArgs:
             disable_logprobs=self.disable_logprobs_during_spec_decoding,
         )
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if self.num_scheduler_steps > 1:
             if speculative_config is not None:
                 raise ValueError("Speculative decoding is not supported with "
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 47de3656..74ddb250 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -62,6 +62,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
     @staticmethod
     @functools.lru_cache()
     def _log_prompt_logprob_unsupported_warning_once():
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "
             "(e.g., speculative decode uses multi step workers).")
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 9ad240ef..e32993e0 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase):
 
     def _init_executor(self) -> None:
         assert self.device_config.device_type == "cpu"
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         assert self.lora_config is None, "cpu backend doesn't support LoRA"
 
         #
@@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
     if config.dtype == torch.float16:
         logger.warning("float16 is not supported on CPU, casting to bfloat16.")
         config.dtype = torch.bfloat16
+    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # If the feature combo become valid
     if not config.enforce_eager:
         logger.warning(
             "CUDA graph is not supported on CPU, fallback to the eager "
@@ -334,6 +338,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
 
 def _verify_and_get_scheduler_config(
         config: SchedulerConfig) -> SchedulerConfig:
+    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # If the feature combo become valid
     if config.chunked_prefill_enabled:
         logger.warning("Chunked prefill is not supported on CPU, disable it.")
         config.chunked_prefill_enabled = False
@@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config(
 
 
 def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
+    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # If the feature combo become valid
     if config.enable_prefix_caching:
         logger.warning("Prefix caching is not supported on CPU, disable it.")
         config.enable_prefix_caching = False
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 22adb163..64387fd2 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -310,6 +310,8 @@ class InputPreprocessor:
         encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps
         decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps
 
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
         if decoder_mm_data is not None:
             raise ValueError(
                 "Multi-modality decoder inputs of encoder-decoder models are "
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index a6771529..13d39773 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     return spec_decode_worker
 
 
+# Reminder: Please update docs/source/serving/compatibility_matrix.rst
+# If the feature combo become valid
 class SpecDecodeWorker(LoraNotSupportedWorkerBase):
     """Worker which implements speculative decoding.
 
diff --git a/vllm/utils.py b/vllm/utils.py
index 314fec0a..8debae52 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -41,6 +41,9 @@ logger = init_logger(__name__)
 
 # Exception strings for non-implemented encoder/decoder scenarios
 
+# Reminder: Please update docs/source/serving/compatibility_matrix.rst
+# If the feature combo become valid
+
 STR_NOT_IMPL_ENC_DEC_SWA = \
     "Sliding window attention for encoder/decoder models " + \
                     "is not currently supported."
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 12aa4735..0cd0047b 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -816,6 +816,9 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
+        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+        # If the feature combo become valid
+        # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:
             assert len(seq_group.sampling_params.logits_processors) == 0, (
                 "Logits Processors are not supported in multi-step decoding")
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index a07395df..f4363546 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
+    # Reminder: Please update docs/source/serving/compatibility_matrix.rst
+    # If the feature combo become valid
+
     if enc_dec_mr.cache_config.enable_prefix_caching:
         raise NotImplementedError(
             STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])