From 8baf85e4e9355611532e361a5cd4d458bc8fe1fe Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: Fri, 11 Oct 2024 15:18:50 -0300 Subject: [PATCH] [Doc] Compatibility matrix for mutual exclusive features (#8512) Signed-off-by: Wallas Santos --- docs/source/index.rst | 1 + docs/source/models/performance.rst | 2 + docs/source/serving/compatibility_matrix.rst | 427 +++++++++++++++++++ vllm/attention/backends/rocm_flash_attn.py | 2 + vllm/config.py | 10 + vllm/engine/arg_utils.py | 2 + vllm/engine/output_processor/multi_step.py | 2 + vllm/executor/cpu_executor.py | 8 + vllm/inputs/preprocess.py | 2 + vllm/spec_decode/spec_decode_worker.py | 2 + vllm/utils.py | 3 + vllm/worker/multi_step_model_runner.py | 3 + vllm/worker/utils.py | 3 + 13 files changed, 467 insertions(+) create mode 100644 docs/source/serving/compatibility_matrix.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 961373eb..d20e46b4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -86,6 +86,7 @@ Documentation serving/usage_stats serving/integrations serving/tensorizer + serving/compatibility_matrix serving/faq .. toctree:: diff --git a/docs/source/models/performance.rst b/docs/source/models/performance.rst index d8750ddc..23b5ab79 100644 --- a/docs/source/models/performance.rst +++ b/docs/source/models/performance.rst @@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. +.. _chunked-prefill: + Chunked Prefill --------------- vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. 
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst new file mode 100644 index 00000000..cac0605c --- /dev/null +++ b/docs/source/serving/compatibility_matrix.rst @@ -0,0 +1,427 @@ +.. _compatibility_matrix: + +Compatibility Matrix +==================== + +The tables below show which features are mutually exclusive and which features are supported on which hardware. + +.. note:: + + Click a linked '✗' to see the tracking issue for that unsupported feature/hardware combination. + +Feature x Feature +----------------- + + +.. raw:: html + + + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - :ref:`CP <chunked-prefill>` + - :ref:`APC <apc>` + - :ref:`LoRA <lora>` + - :abbr:`prmpt adptr (Prompt Adapter)` + - :ref:`SD <spec_decode>` + - CUDA graph + - :abbr:`enc-dec (Encoder-Decoder Models)` + - :abbr:`logP (Logprobs)` + - :abbr:`prmpt logP (Prompt Logprobs)` + - :abbr:`async output (Async Output Processing)` + - multi-step + - :abbr:`MM (Multimodal)` + - best-of + - beam-search + - :abbr:`guided dec (Guided Decoding)` + * - :ref:`CP <chunked-prefill>` + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`APC <apc>` + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`LoRA <lora>` + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - :ref:`SD <spec_decode>` + - ✗ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✗ + - `✗ `__ + - ✗ + - ✗ + - `✗ `__ + - ✅ + - + - + - + - + - + - + - + - + - + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + 
- ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - `✗ `__ + - ✅ + - + - + - + - + - + * - :abbr:`MM (Multimodal)` + - `✗ `__ + - `✗ `__ + - `✗ `__ + - ? + - ? + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - `✗ `__ + - ? + - ✅ + - + - + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ? + - ✅ + - ✅ + - ✅ + - ✗ + - ? + - ✅ + - ✅ + - + + +Feature x Hardware +------------------ + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - :ref:`CP <chunked-prefill>` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`APC <apc>` + - `✗ `__ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :ref:`LoRA <lora>` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`prmpt adptr (Prompt Adapter)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :ref:`SD <spec_decode>` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - :abbr:`enc-dec (Encoder-Decoder Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✗ + * - :abbr:`logP (Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`prmpt logP (Prompt Logprobs)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`async output (Async Output Processing)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - `✗ `__ + - ✅ + * - :abbr:`MM (Multimodal)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - :abbr:`guided dec (Guided Decoding)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py 
index 7456aab8..03fb9193 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -420,6 +420,8 @@ class ROCmFlashAttentionImpl(AttentionImpl): Returns: shape = [num_tokens, num_heads * head_size] """ + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/config.py b/vllm/config.py index f964928a..b0761ae0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -359,6 +359,8 @@ class ModelConfig: self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if device_config.device_type not in ("cuda", "tpu"): logger.warning( "Async output processing is only supported for CUDA or TPU. " @@ -372,6 +374,8 @@ class ModelConfig: self.use_async_output_proc = False return + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if device_config.device_type == "cuda" and self.enforce_eager: logger.warning( "To see benefits of async output processing, enable CUDA " @@ -385,6 +389,8 @@ class ModelConfig: if self.embedding_mode: self.use_async_output_proc = False + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if speculative_config: logger.warning("Async output processing is not supported with" " speculative decoding currently.") @@ -1200,6 +1206,8 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " @@ -1561,6 +1569,8 @@ class LoRAConfig: model_config.quantization) def 
verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bdfecabf..1b132cf7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1000,6 +1000,8 @@ class EngineArgs: disable_logprobs=self.disable_logprobs_during_spec_decoding, ) + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if self.num_scheduler_steps > 1: if speculative_config is not None: raise ValueError("Speculative decoding is not supported with " diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 47de3656..74ddb250 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -62,6 +62,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): @staticmethod @functools.lru_cache() def _log_prompt_logprob_unsupported_warning_once(): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid logger.warning( "Prompt logprob is not supported by multi step workers. 
"(e.g., speculative decode uses multi step workers).") diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 9ad240ef..e32993e0 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid assert self.lora_config is None, "cpu backend doesn't support LoRA" # @@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: if config.dtype == torch.float16: logger.warning("float16 is not supported on CPU, casting to bfloat16.") config.dtype = torch.bfloat16 + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if not config.enforce_eager: logger.warning( "CUDA graph is not supported on CPU, fallback to the eager " @@ -334,6 +338,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: def _verify_and_get_scheduler_config( config: SchedulerConfig) -> SchedulerConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if config.chunked_prefill_enabled: logger.warning("Chunked prefill is not supported on CPU, disable it.") config.chunked_prefill_enabled = False @@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config( def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if config.enable_prefix_caching: logger.warning("Prefix caching is not supported on CPU, disable it.") config.enable_prefix_caching = False diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 22adb163..64387fd2 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -310,6 +310,8 @@ class 
InputPreprocessor: encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid if decoder_mm_data is not None: raise ValueError( "Multi-modality decoder inputs of encoder-decoder models are " diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a6771529..13d39773 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# if the feature combo becomes valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index 314fec0a..8debae52 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -41,6 +41,9 @@ logger = init_logger(__name__) # Exception strings for non-implemented encoder/decoder scenarios +# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# if the feature combo becomes valid + STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." 
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 12aa4735..0cd0047b 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -816,6 +816,9 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid + # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: assert len(seq_group.sampling_params.logits_processors) == 0, ( "Logits Processors are not supported in multi-step decoding") diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index a07395df..f4363546 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # if the feature combo becomes valid + if enc_dec_mr.cache_config.enable_prefix_caching: raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])