mirror of
https://github.com/wassname/vllm.git
synced 2026-06-27 17:32:55 +08:00
[VLM] Support multimodal inputs for Florence-2 models (#13320)
This commit is contained in:
+3
-3
@@ -600,8 +600,8 @@ class HfRunner:
|
||||
if images is not None and images[i] is not None:
|
||||
processor_kwargs["images"] = images[i]
|
||||
|
||||
encoder_input_ids = self.wrap_device(
|
||||
self.processor(**processor_kwargs).input_ids,
|
||||
encoder_inputs = self.wrap_device(
|
||||
self.processor(**processor_kwargs),
|
||||
device=self.model.device.type,
|
||||
)
|
||||
|
||||
@@ -615,13 +615,13 @@ class HfRunner:
|
||||
)
|
||||
|
||||
output = self.model.generate(
|
||||
encoder_input_ids,
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
use_cache=True,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_tokens,
|
||||
output_hidden_states=True,
|
||||
return_dict_in_generate=True,
|
||||
**encoder_inputs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from ....conftest import HfRunner, VllmRunner
|
||||
from ....utils import RemoteOpenAIServer
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_4"
|
||||
|
||||
AudioTuple = Tuple[np.ndarray, int]
|
||||
|
||||
@@ -187,7 +187,7 @@ def run_multi_audio_test(
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("vllm_kwargs", [
|
||||
|
||||
@@ -1,52 +1,59 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from functools import partial
|
||||
from typing import List, Optional, Tuple, Type
|
||||
from typing import Optional, Type
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.inputs.data import ExplicitEncoderDecoderPrompt
|
||||
from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
Florence2Prompt = partial(ExplicitEncoderDecoderPrompt,
|
||||
decoder_prompt=None,
|
||||
mm_processor_kwargs=None)
|
||||
|
||||
MODELS = ["microsoft/Florence-2-base"]
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
TOKENIZER = "facebook/bart-base"
|
||||
PROMPTS = [
|
||||
Florence2Prompt(encoder_prompt="<CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<DETAILED_CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<MORE_DETAILED_CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<CAPTION_TO_PHRASE_GROUNDING>"),
|
||||
Florence2Prompt(encoder_prompt="<DENSE_REGION_CAPTION>"),
|
||||
Florence2Prompt(encoder_prompt="<REGION_PROPOSAL>"),
|
||||
Florence2Prompt(encoder_prompt="<OCR_WITH_REGION>"),
|
||||
Florence2Prompt(encoder_prompt="<OCR>"),
|
||||
Florence2Prompt(encoder_prompt="<OD>"),
|
||||
]
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<CAPTION>", # special task token
|
||||
"cherry_blossom":
|
||||
"Describe in detail what is shown in the image.",
|
||||
})
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
Optional[SampleLogprobs]], ):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
def get_hf_images_prompts(
|
||||
prompts_: list[ExplicitEncoderDecoderPrompt[str, TextPrompt]],
|
||||
) -> tuple[list[ExplicitEncoderDecoderPrompt[str, str]], list[Image.Image]]:
|
||||
prompts, images = [], []
|
||||
for prompt in prompts_:
|
||||
encoder_prompt = prompt["encoder_prompt"]
|
||||
prompts.append(
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=encoder_prompt["prompt"],
|
||||
decoder_prompt=None,
|
||||
))
|
||||
images.append(encoder_prompt["multi_modal_data"]["image"])
|
||||
return prompts, images
|
||||
|
||||
hf_output_str = "</s><s>" + output_str + "</s>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
def hf_to_vllm_output(hf_output: tuple[list[int], str,
|
||||
Optional[SampleLogprobs]]):
|
||||
"""Sanitize hf output to be comparable with vllm output."""
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
|
||||
output_str = output_str.replace("</s>", "").replace("<s>", "")
|
||||
output_ids = [ids for ids in output_ids if ids not in [0, 2]]
|
||||
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
prompts: List[ExplicitEncoderDecoderPrompt],
|
||||
inputs: list[list[ExplicitEncoderDecoderPrompt]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
@@ -56,46 +63,76 @@ def run_test(
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
max_num_seqs=8,
|
||||
tokenizer_name=TOKENIZER,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs)
|
||||
for prompts in inputs
|
||||
]
|
||||
|
||||
hf_inputs = [get_hf_images_prompts(prompts) for prompts in inputs]
|
||||
|
||||
# Florence-2 processors require image inputs
|
||||
dummy_image = Image.new(mode="RGB", size=(2, 2))
|
||||
with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.lm_head
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
images=[dummy_image] * len(prompts),
|
||||
))
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, images=images)
|
||||
for prompts, images in hf_inputs
|
||||
]
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=[hf_to_vllm_output(output) for output in hf_outputs],
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
|
||||
num_logprobs) -> None:
|
||||
def test_models(hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets, model: str,
|
||||
size_factors: list[int], dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [[
|
||||
ExplicitEncoderDecoderPrompt(
|
||||
encoder_prompt=TextPrompt(
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": rescale_image_size(image, factor)}),
|
||||
decoder_prompt=None,
|
||||
) for factor in size_factors
|
||||
] for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
PROMPTS,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
|
||||
@@ -29,8 +29,8 @@ def _test_processing_correctness(
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
@@ -151,6 +151,7 @@ def _test_processing_correctness(
|
||||
"Salesforce/blip2-opt-2.7b",
|
||||
"facebook/chameleon-7b",
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
"microsoft/Florence-2-base",
|
||||
"adept/fuyu-8b",
|
||||
"THUDM/glm-4v-9b",
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
|
||||
@@ -193,11 +193,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
# [Encoder-decoder]
|
||||
"BartModel": _HfExamplesInfo("facebook/bart-base"),
|
||||
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
|
||||
tokenizer="facebook/bart-base",
|
||||
trust_remote_code=True), # noqa: E501
|
||||
}
|
||||
|
||||
_EMBEDDING_EXAMPLE_MODELS = {
|
||||
@@ -288,6 +283,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
extras={"v0.5": "fixie-ai/ultravox-v0_5-llama-3_2-1b"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
# [Encoder-decoder]
|
||||
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
|
||||
# Therefore, we borrow the BartTokenizer from the original Bart model
|
||||
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
|
||||
tokenizer="facebook/bart-base",
|
||||
trust_remote_code=True), # noqa: E501
|
||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user