[Bugfix][VLM] Fix incompatibility between #7902 and #7230 (#7948)

Merged: 8 commits, Aug 28, 2024
4 changes: 2 additions & 2 deletions vllm/model_executor/models/blip2.py
@@ -40,13 +40,13 @@
class Blip2ImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: (batch_size, num_channels, height, width)"""
"""Shape: `(batch_size * num_images, num_channels, height, width)`"""


class Blip2ImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: torch.Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

`hidden_size` must match the hidden size of language model backbone.
"""
2 changes: 1 addition & 1 deletion vllm/model_executor/models/chameleon.py
@@ -53,7 +53,7 @@
class ChameleonImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
"""Shape: `(batch_size * num_images, num_channels, height, width)`"""


def get_max_chameleon_image_tokens(ctx: InputContext):
46 changes: 15 additions & 31 deletions vllm/model_executor/models/internvl.py
@@ -29,7 +29,7 @@
from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
get_clip_num_patches)
from .interfaces import SupportsMultiModal
from .utils import (filter_weights, init_vllm_registered_model,
from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
merge_multimodal_embeddings)

IMG_START = '<img>'
@@ -42,19 +42,17 @@

class InternVLImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: Union[torch.Tensor, List[torch.Tensor]]
data: torch.Tensor
"""
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`

Note that `num_patches` may be different for each batch, in which case
the data is passed as a list instead of a batched tensor.
Shape:
`(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
"""


class InternVLImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: Union[torch.Tensor, List[torch.Tensor]]
"""Shape: `(batch_size, image_feature_size, hidden_size)`
data: torch.Tensor
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

`hidden_size` must match the hidden size of language model backbone.
"""
@@ -357,7 +355,7 @@ def pixel_shuffle(self, x, scale_factor=0.5):
x = x.permute(0, 2, 1, 3).contiguous()
return x

def extract_feature(self, pixel_values):
def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
vit_embeds = self.vision_model(pixel_values=pixel_values)
vit_embeds = vit_embeds[:, 1:, :]

@@ -370,17 +368,7 @@ def extract_feature(self, pixel_values):
vit_embeds = self.mlp1(vit_embeds)
return vit_embeds

def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
if list(data.shape[1:]) != [2]:
raise ValueError(
f"The expected image sizes shape is batch dimension plus "
f"{[2]}. You supplied {data.shape}.")

return data

def _validate_pixel_values(
self, data: Union[torch.Tensor, List[torch.Tensor]]
) -> Union[torch.Tensor, List[torch.Tensor]]:
def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:

h = w = self.config.vision_config.image_size
expected_dims = (3, h, w)
@@ -389,10 +377,11 @@ def _validate_shape(d: torch.Tensor):
actual_dims = tuple(d.shape)

if actual_dims != expected_dims:
expected_expr = ("num_patches", *map(str, expected_dims))
expected_expr = str(expected_dims)
raise ValueError(
"The expected shape of pixel values in each batch element "
f"is {expected_expr}. You supplied {tuple(d.shape)}.")
"The expected shape of pixel values per image per batch "
f" per patch is {expected_expr}. "
f"You supplied {tuple(d.shape)}.")

for d in data:
_validate_shape(d)
@@ -413,12 +402,9 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")

# Flatten the B and N dimensions
image_embeds = image_embeds.flatten(0, 2)

return InternVLImageEmbeddingInputs(
type="image_embeds",
data=image_embeds,
data=flatten_bn(image_embeds),
)

self.img_context_token_id = image_token_id[0]
@@ -428,12 +414,10 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")

# Flatten the B and N dimensions
pixel_values = pixel_values.flatten(0, 2)

return InternVLImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(pixel_values),
data=self._validate_pixel_values(
flatten_bn(pixel_values, concat=True).flatten(0, 1)),
)

raise AssertionError("This line should be unreachable.")
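The InternVL changes above rely on the new `flatten_bn` helper imported from `.utils`, whose implementation is not part of this diff. The sketch below shows what it plausibly does, inferred only from its call sites here: it accepts either a batched tensor or a list of per-request tensors and collapses the batch and image dimensions, and `concat=True` concatenates list inputs into a single tensor.

from typing import List, Union

import torch


def flatten_bn(
    x: Union[torch.Tensor, List[torch.Tensor]],
    *,
    concat: bool = False,
) -> Union[torch.Tensor, List[torch.Tensor]]:
    # Sketch only: the real helper lives in vllm/model_executor/models/utils.py.
    if isinstance(x, torch.Tensor):
        # (batch_size, num_images, ...) -> (batch_size * num_images, ...)
        return x.flatten(0, 1)

    if concat:
        # List of (num_images_i, ...) tensors -> one (sum(num_images_i), ...) tensor
        return torch.cat(x)

    # Otherwise keep a flat list with one entry per image.
    return [x_n for x_b in x for x_n in x_b]

Under this reading, `flatten_bn(pixel_values, concat=True).flatten(0, 1)` produces the `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` layout that the updated `InternVLImagePixelInputs` docstring describes.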
4 changes: 2 additions & 2 deletions vllm/model_executor/models/llava.py
@@ -30,13 +30,13 @@
class LlavaImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
"""Shape: `(batch_size * num_images, num_channels, height, width)`"""


class LlavaImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: torch.Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

`hidden_size` must match the hidden size of language model backbone.
"""
52 changes: 26 additions & 26 deletions vllm/model_executor/models/llava_next.py
@@ -29,7 +29,7 @@
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
dummy_seq_data_for_siglip, get_siglip_image_feature_size,
get_siglip_patch_grid_length, input_processor_for_siglip)
from .utils import (filter_weights, init_vllm_registered_model,
from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
merge_multimodal_embeddings)

logger = init_logger(__name__)
@@ -47,15 +47,16 @@ class LlavaNextImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: Union[torch.Tensor, List[torch.Tensor]]
"""
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
Shape:
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`

Note that `num_patches` may be different for each batch, in which case
the data is passed as a list instead of a batched tensor.
Note that `num_patches` may be different per batch and image,
in which case the data is passed as a list instead of a batched tensor.
"""

image_sizes: NotRequired[torch.Tensor]
"""
Shape: `(batch_size, 2)`
Shape: `(batch_size * num_images, 2)`

This should be in `(height, width)` format.
"""
@@ -64,7 +65,7 @@ class LlavaNextImagePixelInputs(TypedDict):
class LlavaNextImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: torch.Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

`hidden_size` must match the hidden size of language model backbone.
"""
@@ -315,10 +316,19 @@ def __init__(self,
torch.empty(config.text_config.hidden_size))

def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
if list(data.shape[1:]) != [2]:
raise ValueError(
f"The expected image sizes shape is batch dimension plus "
f"{[2]}. You supplied {data.shape}.")
expected_dims = (2, )

def _validate_shape(d: torch.Tensor):
actual_dims = tuple(d.shape)

if actual_dims != expected_dims:
expected_expr = str(expected_dims)
raise ValueError(
f"The expected shape of image sizes per image per batch "
f"is {expected_expr}. You supplied {tuple(d.shape)}.")

for d in data:
_validate_shape(d)

return data

@@ -335,7 +345,7 @@ def _validate_shape(d: torch.Tensor):
if actual_dims != expected_dims:
expected_expr = ("num_patches", *map(str, expected_dims))
raise ValueError(
"The expected shape of pixel values in each batch element "
"The expected shape of pixel values per image per batch "
f"is {expected_expr}. You supplied {tuple(d.shape)}.")

for d in data:
@@ -357,35 +367,25 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")

if not isinstance(image_sizes, torch.Tensor):
if not isinstance(image_sizes, (torch.Tensor, list)):
raise ValueError("Incorrect type of image sizes. "
f"Got type: {type(image_sizes)}")

# Remove the N dimension until multiple images are supported.
if isinstance(pixel_values, torch.Tensor):
pixel_values = pixel_values.squeeze(1)
else:
pixel_values = [t.squeeze(0) for t in pixel_values]

image_sizes = image_sizes.squeeze(1)

return LlavaNextImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(pixel_values),
image_sizes=self._validate_image_sizes(image_sizes),
data=self._validate_pixel_values(flatten_bn(pixel_values)),
image_sizes=self._validate_image_sizes(
flatten_bn(image_sizes, concat=True)),
)

if image_embeds is not None:
if not isinstance(image_embeds, torch.Tensor):
raise ValueError("Incorrect type of image embeds. "
f"Got type: {type(image_embeds)}")

# Remove the N dimension until multiple images are supported.
image_embeds = image_embeds.squeeze(1)

return LlavaNextImageEmbeddingInputs(
type="image_embeds",
data=image_embeds,
data=flatten_bn(image_embeds),
)

raise AssertionError("This line should be unreachable.")
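Unlike InternVL, LLaVA-NeXT keeps `Union[torch.Tensor, List[torch.Tensor]]` for pixel values because `num_patches` can differ per image, so ragged inputs arrive as a list rather than a stacked tensor. A hypothetical example of how such input flattens, assuming `flatten_bn` behaves as sketched earlier:

import torch

from vllm.model_executor.models.utils import flatten_bn

# Hypothetical ragged input: two requests with one image each, but different
# patch counts per image, so the tensors cannot be stacked along a batch axis.
req0 = torch.rand(1, 5, 3, 336, 336)  # (num_images, 1 + num_patches, C, H, W)
req1 = torch.rand(1, 3, 3, 336, 336)

# Without concat=True the result stays a list with one entry per image,
# so each (1 + num_patches, C, H, W) stack keeps its own patch count.
flat = flatten_bn([req0, req1])
print([tuple(t.shape) for t in flat])
# Expected (under the behaviour sketched above):
# [(5, 3, 336, 336), (3, 3, 336, 336)]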
4 changes: 2 additions & 2 deletions vllm/model_executor/models/paligemma.py
@@ -34,13 +34,13 @@
class PaliGemmaImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: (batch_size, num_channels, height, width)"""
"""Shape: `(batch_size * num_images, num_channels, height, width)`"""


class PaliGemmaImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: torch.Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

`hidden_size` must match the hidden size of language model backbone.
"""
50 changes: 27 additions & 23 deletions vllm/model_executor/models/phi3v.py
@@ -44,7 +44,7 @@

from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
from .interfaces import SupportsMultiModal
from .utils import merge_multimodal_embeddings
from .utils import flatten_bn, merge_multimodal_embeddings

logger = init_logger(__name__)

@@ -75,15 +75,16 @@ class Phi3VImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: Union[torch.Tensor, List[torch.Tensor]]
"""
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
Shape:
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`

Note that `num_patches` may be different for each batch, in which case
the data is passed as a list instead of a batched tensor.
Note that `num_patches` may be different per batch and image,
in which case the data is passed as a list instead of a batched tensor.
"""

image_sizes: torch.Tensor
"""
Shape: `(batch_size, 2)`
Shape: `(batch_size * num_images, 2)`

This should be in `(height, width)` format.
"""
@@ -92,7 +93,7 @@ class Phi3VImagePixelInputs(TypedDict):
class Phi3VImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: Union[torch.Tensor, List[torch.Tensor]]
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`

`hidden_size` must match the hidden size of language model backbone.
"""
@@ -511,10 +512,19 @@ def __init__(self,
self.sampler = Sampler()

def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
if list(data.shape[1:]) != [2]:
raise ValueError(
f"The expected shape of image sizes is batch dimension plus "
f"{[2]}. You supplied {tuple(data.shape)}.")
expected_dims = (2, )

def _validate_shape(d: torch.Tensor):
actual_dims = tuple(d.shape)

if actual_dims != expected_dims:
expected_expr = str(expected_dims)
raise ValueError(
f"The expected shape of image sizes per image per batch "
f"is {expected_expr}. You supplied {tuple(d.shape)}.")

for d in data:
_validate_shape(d)
Comment on lines +515 to +527 (Member Author):
While this is more complex, the error message becomes consistent with the one for pixel_values.


return data

@@ -531,7 +541,7 @@ def _validate_shape(d: torch.Tensor):
if actual_dims != expected_dims:
expected_expr = ("num_patches", *map(str, expected_dims))
raise ValueError(
"The expected shape of pixel values in each batch element "
"The expected shape of pixel values per image per batch "
f"is {expected_expr}. You supplied {tuple(d.shape)}.")

for d in data:
@@ -556,30 +566,24 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")

if not isinstance(image_sizes, torch.Tensor):
if not isinstance(image_sizes, (torch.Tensor, list)):
raise ValueError("Incorrect type of image sizes. "
f"Got type: {type(image_sizes)}")

# Merge the B and N dimensions.
if isinstance(pixel_values, torch.Tensor):
pixel_values = pixel_values.flatten(0, 1)
else:
pixel_values = torch.cat(pixel_values)

image_sizes = image_sizes.flatten(0, 1)

return Phi3VImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(pixel_values),
image_sizes=self._validate_image_sizes(image_sizes))
data=self._validate_pixel_values(flatten_bn(pixel_values)),
image_sizes=self._validate_image_sizes(
flatten_bn(image_sizes, concat=True)))

if image_embeds is not None:
if not isinstance(image_embeds, torch.Tensor):
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")

return Phi3VImageEmbeddingInputs(
type="image_embeds",
data=image_embeds,
data=flatten_bn(image_embeds),
)

raise AssertionError("This line should be unreachable.")
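As the review comment above notes, `_validate_image_sizes` now mirrors `_validate_pixel_values`: both walk the flattened batch and report a per-image expected shape. Below is a standalone illustration of that validation pattern; the function copies the pattern from this diff rather than importing vLLM code.

import torch


def validate_image_sizes(data: torch.Tensor) -> torch.Tensor:
    # Each entry along the flattened (batch_size * num_images) axis must be
    # a (height, width) pair, i.e. have shape (2,).
    expected_dims = (2, )

    for d in data:
        actual_dims = tuple(d.shape)
        if actual_dims != expected_dims:
            raise ValueError(
                "The expected shape of image sizes per image per batch "
                f"is {expected_dims}. You supplied {actual_dims}.")

    return data


validate_image_sizes(torch.tensor([[336, 336], [672, 336]]))  # passes

try:
    validate_image_sizes(torch.zeros(2, 3))  # wrong trailing dimension
except ValueError as err:
    print(err)  # "... is (2,). You supplied (3,)."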
2 changes: 1 addition & 1 deletion vllm/model_executor/models/ultravox.py
@@ -49,7 +49,7 @@
class UltravoxAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
data: Union[torch.Tensor, List[torch.Tensor]]
"""Shape: `(batch_size, 80, M)"""
"""Shape: `(batch_size * num_audios, 80, M)"""


class UltravoxAudioEmbeddingInputs(TypedDict):