From be1d6f9fbe24d9a02b2d01f5de2efcf9f63860fb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 09:09:16 +0000 Subject: [PATCH 1/8] Fix incompatibility between multimodal tensor stacking and multi-image support in LLaVA-NeXT --- vllm/model_executor/models/llava_next.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 3a87242954114..629ec11c1de2a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -361,13 +361,16 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of image sizes. " f"Got type: {type(image_sizes)}") - # Remove the N dimension until multiple images are supported. + # Flatten the B and N dimensions if isinstance(pixel_values, torch.Tensor): - pixel_values = pixel_values.squeeze(1) + pixel_values = pixel_values.flatten(0, 1) else: - pixel_values = [t.squeeze(0) for t in pixel_values] + pixel_values = [ + patch_item for batch_item in pixel_values + for patch_item in batch_item + ] - image_sizes = image_sizes.squeeze(1) + image_sizes = image_sizes.flatten(0, 1) return LlavaNextImagePixelInputs( type="pixel_values", @@ -380,8 +383,8 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of image embeds. " f"Got type: {type(image_embeds)}") - # Remove the N dimension until multiple images are supported. - image_embeds = image_embeds.squeeze(1) + # Flatten the B and N dimensions + image_embeds = image_embeds.flatten(0, 1) return LlavaNextImageEmbeddingInputs( type="image_embeds", From 21070710da66d0b5d48a86f9ebc886137052d9cd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 09:41:40 +0000 Subject: [PATCH 2/8] Fix handling of `image_sizes`; update docs --- vllm/model_executor/models/blip2.py | 4 +-- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/internvl.py | 17 ++++-------- vllm/model_executor/models/llava.py | 4 +-- vllm/model_executor/models/llava_next.py | 32 +++++++--------------- vllm/model_executor/models/paligemma.py | 4 +-- vllm/model_executor/models/phi3v.py | 7 +++-- vllm/model_executor/models/ultravox.py | 2 +- vllm/model_executor/models/utils.py | 35 ++++++++++++++++++++++-- 9 files changed, 61 insertions(+), 46 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 7c9123079c44f..addfec91b950e 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -40,13 +40,13 @@ class Blip2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: (batch_size, num_channels, height, width)""" + """Shape: `(batch_size, num_images, num_channels, height, width)`""" class Blip2ImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, image_feature_size, hidden_size)` + """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 2d4f172ce0be6..664d83c193481 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -53,7 +53,7 @@ class ChameleonImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: `(batch_size, num_channels, height, width)`""" + """Shape: `(batch_size, num_images, num_channels, height, width)`""" def get_max_chameleon_image_tokens(ctx: InputContext): diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 7f213287f33b4..6e8c57670a6f5 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -29,7 +29,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_num_patches) from .interfaces import SupportsMultiModal -from .utils import (filter_weights, init_vllm_registered_model, +from .utils import (filter_weights, flatten_bn, init_vllm_registered_model, merge_multimodal_embeddings) IMG_START = '' @@ -44,7 +44,8 @@ class InternVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: Union[torch.Tensor, List[torch.Tensor]] """ - Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + Shape: + `(batch_size, num_images, 1 + num_patches, num_channels, height, width)` Note that `num_patches` may be different for each batch, in which case the data is passed as a list instead of a batched tensor. @@ -54,7 +55,7 @@ class InternVLImagePixelInputs(TypedDict): class InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size, image_feature_size, hidden_size)` + """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ @@ -413,12 +414,9 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of image embeddings. " f"Got type: {type(image_embeds)}") - # Flatten the B and N dimensions - image_embeds = image_embeds.flatten(0, 2) - return InternVLImageEmbeddingInputs( type="image_embeds", - data=image_embeds, + data=flatten_bn(image_embeds), ) self.img_context_token_id = image_token_id[0] @@ -428,12 +426,9 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - # Flatten the B and N dimensions - pixel_values = pixel_values.flatten(0, 2) - return InternVLImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values(pixel_values), + data=self._validate_pixel_values(flatten_bn(pixel_values)), ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 03a0abf1db481..32226ae9a1108 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -30,13 +30,13 @@ class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: `(batch_size, num_channels, height, width)`""" + """Shape: `(batch_size, num_images, num_channels, height, width)`""" class LlavaImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, image_feature_size, hidden_size)` + """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 629ec11c1de2a..871d3b6995854 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -29,7 +29,7 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) -from .utils import (filter_weights, init_vllm_registered_model, +from .utils import (filter_weights, flatten_bn, init_vllm_registered_model, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -47,7 +47,8 @@ class LlavaNextImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: Union[torch.Tensor, List[torch.Tensor]] """ - Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + Shape: + `(batch_size, num_images, 1 + num_patches, num_channels, height, width)` Note that `num_patches` may be different for each batch, in which case the data is passed as a list instead of a batched tensor. @@ -55,7 +56,7 @@ class LlavaNextImagePixelInputs(TypedDict): image_sizes: NotRequired[torch.Tensor] """ - Shape: `(batch_size, 2)` + Shape: `(batch_size, num_images, 2)` This should be in `(height, width)` format. """ @@ -64,7 +65,7 @@ class LlavaNextImagePixelInputs(TypedDict): class LlavaNextImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, image_feature_size, hidden_size)` + """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ @@ -357,25 +358,15 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - if not isinstance(image_sizes, torch.Tensor): + if not isinstance(image_sizes, (torch.Tensor, list)): raise ValueError("Incorrect type of image sizes. " f"Got type: {type(image_sizes)}") - # Flatten the B and N dimensions - if isinstance(pixel_values, torch.Tensor): - pixel_values = pixel_values.flatten(0, 1) - else: - pixel_values = [ - patch_item for batch_item in pixel_values - for patch_item in batch_item - ] - - image_sizes = image_sizes.flatten(0, 1) - return LlavaNextImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values(pixel_values), - image_sizes=self._validate_image_sizes(image_sizes), + data=self._validate_pixel_values(flatten_bn(pixel_values)), + image_sizes=self._validate_image_sizes( + flatten_bn(image_sizes)), ) if image_embeds is not None: @@ -383,12 +374,9 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of image embeds. " f"Got type: {type(image_embeds)}") - # Flatten the B and N dimensions - image_embeds = image_embeds.flatten(0, 1) - return LlavaNextImageEmbeddingInputs( type="image_embeds", - data=image_embeds, + data=flatten_bn(image_embeds), ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 0700f0c29d708..33b0224147a0a 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -34,13 +34,13 @@ class PaliGemmaImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: (batch_size, num_channels, height, width)""" + """Shape: `(batch_size, num_images, num_channels, height, width)`""" class PaliGemmaImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, image_feature_size, hidden_size)` + """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 61f1d73976379..da0796388922f 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -75,7 +75,8 @@ class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: Union[torch.Tensor, List[torch.Tensor]] """ - Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + Shape: + `(batch_size, num_images, 1 + num_patches, num_channels, height, width)` Note that `num_patches` may be different for each batch, in which case the data is passed as a list instead of a batched tensor. @@ -83,7 +84,7 @@ class Phi3VImagePixelInputs(TypedDict): image_sizes: torch.Tensor """ - Shape: `(batch_size, 2)` + Shape: `(batch_size, num_images, 2)` This should be in `(height, width)` format. """ @@ -92,7 +93,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size, image_feature_size, hidden_size)` + """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index c81c2fd114eb8..1a9cf9ad0df62 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -49,7 +49,7 @@ class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size, 80, M)""" + """Shape: `(batch_size, num_audios, 80, M)""" class UltravoxAudioEmbeddingInputs(TypedDict): diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 00026b7ebe2e1..ba6d1e2312412 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,4 +1,5 @@ -from typing import Dict, Iterable, List, Optional, Protocol, Tuple +from typing import (Dict, Iterable, List, Optional, Protocol, Tuple, Union, + overload) import numpy as np import torch @@ -55,6 +56,35 @@ def init_vllm_registered_model( ) +@overload +def flatten_bn(x: torch.Tensor) -> torch.Tensor: + ... + + +@overload +def flatten_bn(x: List[torch.Tensor]) -> torch.Tensor: + ... + + +@overload +def flatten_bn(x: List[List[torch.Tensor]]) -> List[torch.Tensor]: + ... + + +def flatten_bn( + x: Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor] +) -> Union[List[torch.Tensor], torch.Tensor]: + """ + Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. + + The input tensor should have shape ``(B, N, ...)```. + """ + if isinstance(x, torch.Tensor): + return x.flatten(0, 1) + + return [x_n for x_b in x for x_n in x_b] + + def _flatten_embeddings(embeddings: NestedTensors) -> torch.Tensor: """ Recursively concatenates NestedTensors along any heterogeneously sized @@ -93,7 +123,8 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor, This updates ``inputs_embeds`` in place. """ mask = (input_ids == placeholder_token_id) - num_expected_tokens = mask.sum() + num_expected_tokens = mask.sum().item() + assert isinstance(num_expected_tokens, int) flattened = _flatten_embeddings(multimodal_embeddings) *dims, embed_dim = flattened.shape From 0334d34fb90dad8dffefb9d76d3b3ecd4d875da0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 09:54:21 +0000 Subject: [PATCH 3/8] Correction --- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/internvl.py | 18 +++----- vllm/model_executor/models/llava.py | 4 +- vllm/model_executor/models/llava_next.py | 35 ++++++++++------ vllm/model_executor/models/paligemma.py | 4 +- vllm/model_executor/models/phi3v.py | 52 +++++++++++++----------- vllm/model_executor/models/utils.py | 9 +--- 8 files changed, 65 insertions(+), 63 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index addfec91b950e..8be786fd3f6f5 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -40,13 +40,13 @@ class Blip2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: `(batch_size, num_images, num_channels, height, width)`""" + """Shape: `(batch_size * num_images, num_channels, height, width)`""" class Blip2ImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 664d83c193481..b25f5d521a9bf 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -53,7 +53,7 @@ class ChameleonImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: `(batch_size, num_images, num_channels, height, width)`""" + """Shape: `(batch_size * num_images, num_channels, height, width)`""" def get_max_chameleon_image_tokens(ctx: InputContext): diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 6e8c57670a6f5..d68148c67e777 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -45,17 +45,17 @@ class InternVLImagePixelInputs(TypedDict): data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: - `(batch_size, num_images, 1 + num_patches, num_channels, height, width)` + `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - Note that `num_patches` may be different for each batch, in which case - the data is passed as a list instead of a batched tensor. + Note that `num_patches` may be different for per batch and image, + in which case the data is passed as a list instead of a batched tensor. """ class InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ @@ -371,14 +371,6 @@ def extract_feature(self, pixel_values): vit_embeds = self.mlp1(vit_embeds) return vit_embeds - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: - if list(data.shape[1:]) != [2]: - raise ValueError( - f"The expected image sizes shape is batch dimension plus " - f"{[2]}. You supplied {data.shape}.") - - return data - def _validate_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: @@ -392,7 +384,7 @@ def _validate_shape(d: torch.Tensor): if actual_dims != expected_dims: expected_expr = ("num_patches", *map(str, expected_dims)) raise ValueError( - "The expected shape of pixel values in each batch element " + "The expected shape of pixel values per image per batch " f"is {expected_expr}. You supplied {tuple(d.shape)}.") for d in data: diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 32226ae9a1108..490c93294d50f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -30,13 +30,13 @@ class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: `(batch_size, num_images, num_channels, height, width)`""" + """Shape: `(batch_size * num_images, num_channels, height, width)`""" class LlavaImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 871d3b6995854..511e4844a9645 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -48,15 +48,15 @@ class LlavaNextImagePixelInputs(TypedDict): data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: - `(batch_size, num_images, 1 + num_patches, num_channels, height, width)` + `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - Note that `num_patches` may be different for each batch, in which case - the data is passed as a list instead of a batched tensor. + Note that `num_patches` may be different for per batch and image, + in which case the data is passed as a list instead of a batched tensor. """ - image_sizes: NotRequired[torch.Tensor] + image_sizes: NotRequired[Union[torch.Tensor, List[torch.Tensor]]] """ - Shape: `(batch_size, num_images, 2)` + Shape: `(batch_size * num_images, 2)` This should be in `(height, width)` format. """ @@ -65,7 +65,7 @@ class LlavaNextImagePixelInputs(TypedDict): class LlavaNextImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ @@ -315,11 +315,22 @@ def __init__(self, self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: - if list(data.shape[1:]) != [2]: - raise ValueError( - f"The expected image sizes shape is batch dimension plus " - f"{[2]}. You supplied {data.shape}.") + def _validate_image_sizes( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + expected_dims = (2,) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + f"The expected shape of image sizes per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) return data @@ -336,7 +347,7 @@ def _validate_shape(d: torch.Tensor): if actual_dims != expected_dims: expected_expr = ("num_patches", *map(str, expected_dims)) raise ValueError( - "The expected shape of pixel values in each batch element " + "The expected shape of pixel values per image per batch " f"is {expected_expr}. You supplied {tuple(d.shape)}.") for d in data: diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 33b0224147a0a..46ee4c3208b7a 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -34,13 +34,13 @@ class PaliGemmaImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: `(batch_size, num_images, num_channels, height, width)`""" + """Shape: `(batch_size * num_images, num_channels, height, width)`""" class PaliGemmaImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor - """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index da0796388922f..a46e5b21b4fd8 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -44,7 +44,7 @@ from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal -from .utils import merge_multimodal_embeddings +from .utils import flatten_bn, merge_multimodal_embeddings logger = init_logger(__name__) @@ -76,15 +76,15 @@ class Phi3VImagePixelInputs(TypedDict): data: Union[torch.Tensor, List[torch.Tensor]] """ Shape: - `(batch_size, num_images, 1 + num_patches, num_channels, height, width)` + `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - Note that `num_patches` may be different for each batch, in which case - the data is passed as a list instead of a batched tensor. + Note that `num_patches` may be different for per batch and image, + in which case the data is passed as a list instead of a batched tensor. """ - image_sizes: torch.Tensor + image_sizes: Union[torch.Tensor, List[torch.Tensor]] """ - Shape: `(batch_size, num_images, 2)` + Shape: `(batch_size * num_images, 2)` This should be in `(height, width)` format. """ @@ -93,7 +93,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size, num_images, image_feature_size, hidden_size)` + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. """ @@ -511,11 +511,22 @@ def __init__(self, self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: - if list(data.shape[1:]) != [2]: - raise ValueError( - f"The expected shape of image sizes is batch dimension plus " - f"{[2]}. You supplied {tuple(data.shape)}.") + def _validate_image_sizes( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + expected_dims = (2,) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + f"The expected shape of image sizes per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) return data @@ -532,7 +543,7 @@ def _validate_shape(d: torch.Tensor): if actual_dims != expected_dims: expected_expr = ("num_patches", *map(str, expected_dims)) raise ValueError( - "The expected shape of pixel values in each batch element " + "The expected shape of pixel values per image per batch " f"is {expected_expr}. You supplied {tuple(d.shape)}.") for d in data: @@ -561,26 +572,19 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of image sizes. " f"Got type: {type(image_sizes)}") - # Merge the B and N dimensions. - if isinstance(pixel_values, torch.Tensor): - pixel_values = pixel_values.flatten(0, 1) - else: - pixel_values = torch.cat(pixel_values) - - image_sizes = image_sizes.flatten(0, 1) - return Phi3VImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values(pixel_values), - image_sizes=self._validate_image_sizes(image_sizes)) + data=self._validate_pixel_values(flatten_bn(pixel_values)), + image_sizes=self._validate_image_sizes(flatten_bn(image_sizes))) if image_embeds is not None: if not isinstance(image_embeds, torch.Tensor): raise ValueError("Incorrect type of image embeddings. " f"Got type: {type(image_embeds)}") + return Phi3VImageEmbeddingInputs( type="image_embeds", - data=image_embeds, + data=flatten_bn(image_embeds), ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ba6d1e2312412..78fd6abd08ff5 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -62,17 +62,12 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor: @overload -def flatten_bn(x: List[torch.Tensor]) -> torch.Tensor: - ... - - -@overload -def flatten_bn(x: List[List[torch.Tensor]]) -> List[torch.Tensor]: +def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: ... def flatten_bn( - x: Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor] + x: Union[List[torch.Tensor], torch.Tensor] ) -> Union[List[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. From 89e606738c4ba3a5392347b58266f37363468e58 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 09:55:09 +0000 Subject: [PATCH 4/8] Fix typo --- vllm/model_executor/models/internvl.py | 2 +- vllm/model_executor/models/llava_next.py | 6 +++--- vllm/model_executor/models/phi3v.py | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index d68148c67e777..b02d54389a54d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -47,7 +47,7 @@ class InternVLImagePixelInputs(TypedDict): Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - Note that `num_patches` may be different for per batch and image, + Note that `num_patches` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 511e4844a9645..b9636f769fe62 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -50,7 +50,7 @@ class LlavaNextImagePixelInputs(TypedDict): Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - Note that `num_patches` may be different for per batch and image, + Note that `num_patches` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ @@ -318,11 +318,11 @@ def __init__(self, def _validate_image_sizes( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: - expected_dims = (2,) + expected_dims = (2, ) def _validate_shape(d: torch.Tensor): actual_dims = tuple(d.shape) - + if actual_dims != expected_dims: expected_expr = str(expected_dims) raise ValueError( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a46e5b21b4fd8..26ba4f8645164 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -78,7 +78,7 @@ class Phi3VImagePixelInputs(TypedDict): Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - Note that `num_patches` may be different for per batch and image, + Note that `num_patches` may be different per batch and image, in which case the data is passed as a list instead of a batched tensor. """ @@ -514,11 +514,11 @@ def __init__(self, def _validate_image_sizes( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: - expected_dims = (2,) + expected_dims = (2, ) def _validate_shape(d: torch.Tensor): actual_dims = tuple(d.shape) - + if actual_dims != expected_dims: expected_expr = str(expected_dims) raise ValueError( @@ -575,7 +575,8 @@ def _parse_and_validate_image_input( return Phi3VImagePixelInputs( type="pixel_values", data=self._validate_pixel_values(flatten_bn(pixel_values)), - image_sizes=self._validate_image_sizes(flatten_bn(image_sizes))) + image_sizes=self._validate_image_sizes( + flatten_bn(image_sizes))) if image_embeds is not None: if not isinstance(image_embeds, torch.Tensor): From f98887fa04698d1a8b76edbe88f456d8360db7e5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 10:01:27 +0000 Subject: [PATCH 5/8] Simplify --- vllm/model_executor/models/llava_next.py | 8 +++----- vllm/model_executor/models/phi3v.py | 8 +++----- vllm/model_executor/models/utils.py | 20 +++++++++++++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index b9636f769fe62..048ca16974e3c 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -54,7 +54,7 @@ class LlavaNextImagePixelInputs(TypedDict): in which case the data is passed as a list instead of a batched tensor. """ - image_sizes: NotRequired[Union[torch.Tensor, List[torch.Tensor]]] + image_sizes: NotRequired[torch.Tensor] """ Shape: `(batch_size * num_images, 2)` @@ -315,9 +315,7 @@ def __init__(self, self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) - def _validate_image_sizes( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) def _validate_shape(d: torch.Tensor): @@ -377,7 +375,7 @@ def _parse_and_validate_image_input( type="pixel_values", data=self._validate_pixel_values(flatten_bn(pixel_values)), image_sizes=self._validate_image_sizes( - flatten_bn(image_sizes)), + flatten_bn(image_sizes, concat=True)), ) if image_embeds is not None: diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 26ba4f8645164..370c34a9fbb69 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -511,9 +511,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() - def _validate_image_sizes( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) def _validate_shape(d: torch.Tensor): @@ -568,7 +566,7 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - if not isinstance(image_sizes, torch.Tensor): + if not isinstance(image_sizes, (torch.Tensor, list)): raise ValueError("Incorrect type of image sizes. " f"Got type: {type(image_sizes)}") @@ -576,7 +574,7 @@ def _parse_and_validate_image_input( type="pixel_values", data=self._validate_pixel_values(flatten_bn(pixel_values)), image_sizes=self._validate_image_sizes( - flatten_bn(image_sizes))) + flatten_bn(image_sizes, concat=True))) if image_embeds is not None: if not isinstance(image_embeds, torch.Tensor): diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 78fd6abd08ff5..6e7ee511bf27f 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,5 +1,5 @@ -from typing import (Dict, Iterable, List, Optional, Protocol, Tuple, Union, - overload) +from typing import (Dict, Iterable, List, Literal, Optional, Protocol, Tuple, + Union, overload) import numpy as np import torch @@ -66,8 +66,19 @@ def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: ... +@overload +def flatten_bn( + x: Union[List[torch.Tensor], torch.Tensor], + *, + concat: Literal[True], +) -> torch.Tensor: + ... + + def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor] + x: Union[List[torch.Tensor], torch.Tensor], + *, + concat: bool = False, ) -> Union[List[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. @@ -77,6 +88,9 @@ def flatten_bn( if isinstance(x, torch.Tensor): return x.flatten(0, 1) + if concat: + return torch.cat(x) + return [x_n for x_b in x for x_n in x_b] From 058f8ecabf0ddfd912f5e00e90dc8f5533161088 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 10:05:01 +0000 Subject: [PATCH 6/8] Fix docstring --- vllm/model_executor/models/ultravox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 1a9cf9ad0df62..03d6223225511 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -49,7 +49,7 @@ class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size, num_audios, 80, M)""" + """Shape: `(batch_size * num_audios, 80, M)""" class UltravoxAudioEmbeddingInputs(TypedDict): From 88748cb6c1b268e0b137bae3442d074a3c67422d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 10:06:37 +0000 Subject: [PATCH 7/8] Fix type annotation --- vllm/model_executor/models/phi3v.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 370c34a9fbb69..bec1d35388506 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -82,7 +82,7 @@ class Phi3VImagePixelInputs(TypedDict): in which case the data is passed as a list instead of a batched tensor. """ - image_sizes: Union[torch.Tensor, List[torch.Tensor]] + image_sizes: torch.Tensor """ Shape: `(batch_size * num_images, 2)` From f99d4cc8c966a4f913d04aefd3b4f8db0f09d55d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 28 Aug 2024 10:27:35 +0000 Subject: [PATCH 8/8] Fix InternVL --- vllm/model_executor/models/internvl.py | 23 ++++++++++------------- vllm/multimodal/base.py | 4 ++-- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index b02d54389a54d..ca4d773190e0f 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -42,19 +42,16 @@ class InternVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: torch.Tensor """ Shape: - `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` - - Note that `num_patches` may be different per batch and image, - in which case the data is passed as a list instead of a batched tensor. + `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` """ class InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: torch.Tensor """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. @@ -358,7 +355,7 @@ def pixel_shuffle(self, x, scale_factor=0.5): x = x.permute(0, 2, 1, 3).contiguous() return x - def extract_feature(self, pixel_values): + def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor: vit_embeds = self.vision_model(pixel_values=pixel_values) vit_embeds = vit_embeds[:, 1:, :] @@ -371,9 +368,7 @@ def extract_feature(self, pixel_values): vit_embeds = self.mlp1(vit_embeds) return vit_embeds - def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -382,10 +377,11 @@ def _validate_shape(d: torch.Tensor): actual_dims = tuple(d.shape) if actual_dims != expected_dims: - expected_expr = ("num_patches", *map(str, expected_dims)) + expected_expr = str(expected_dims) raise ValueError( "The expected shape of pixel values per image per batch " - f"is {expected_expr}. You supplied {tuple(d.shape)}.") + f" per patch is {expected_expr}. " + f"You supplied {tuple(d.shape)}.") for d in data: _validate_shape(d) @@ -420,7 +416,8 @@ def _parse_and_validate_image_input( return InternVLImagePixelInputs( type="pixel_values", - data=self._validate_pixel_values(flatten_bn(pixel_values)), + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True).flatten(0, 1)), ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index f26e3292c264d..c02e61596927a 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -18,7 +18,7 @@ logger = init_logger(__name__) -NestedTensors = Union[List["NestedTensors"], torch.Tensor] +NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] """ Uses a list instead of a tensor if the dimensions of each element do not match. """ @@ -61,7 +61,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: tensors_ = cast(List[torch.Tensor], stacked) if any(t.shape != tensors_[0].shape for t in tensors_): # The tensors have incompatible shapes and can't be stacked. - return stacked + return tensors_ return torch.stack(tensors_)