From be1d6f9fbe24d9a02b2d01f5de2efcf9f63860fb Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 09:09:16 +0000
Subject: [PATCH 1/8] Fix incompatibility between multimodal tensor stacking
 and multi-image support in LLaVA-NeXT

---
 vllm/model_executor/models/llava_next.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 3a87242954114..629ec11c1de2a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -361,13 +361,16 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of image sizes. "
                                  f"Got type: {type(image_sizes)}")
 
-            # Remove the N dimension until multiple images are supported.
+            # Flatten the B and N dimensions
             if isinstance(pixel_values, torch.Tensor):
-                pixel_values = pixel_values.squeeze(1)
+                pixel_values = pixel_values.flatten(0, 1)
             else:
-                pixel_values = [t.squeeze(0) for t in pixel_values]
+                pixel_values = [
+                    patch_item for batch_item in pixel_values
+                    for patch_item in batch_item
+                ]
 
-            image_sizes = image_sizes.squeeze(1)
+            image_sizes = image_sizes.flatten(0, 1)
 
             return LlavaNextImagePixelInputs(
                 type="pixel_values",
@@ -380,8 +383,8 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of image embeds. "
                                  f"Got type: {type(image_embeds)}")
 
-            # Remove the N dimension until multiple images are supported.
-            image_embeds = image_embeds.squeeze(1)
+            # Flatten the B and N dimensions
+            image_embeds = image_embeds.flatten(0, 1)
 
             return LlavaNextImageEmbeddingInputs(
                 type="image_embeds",

From 21070710da66d0b5d48a86f9ebc886137052d9cd Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 09:41:40 +0000
Subject: [PATCH 2/8] Fix handling of `image_sizes`; update docs

---
 vllm/model_executor/models/blip2.py      |  4 +--
 vllm/model_executor/models/chameleon.py  |  2 +-
 vllm/model_executor/models/internvl.py   | 17 ++++--------
 vllm/model_executor/models/llava.py      |  4 +--
 vllm/model_executor/models/llava_next.py | 32 +++++++---------------
 vllm/model_executor/models/paligemma.py  |  4 +--
 vllm/model_executor/models/phi3v.py      |  7 +++--
 vllm/model_executor/models/ultravox.py   |  2 +-
 vllm/model_executor/models/utils.py      | 35 ++++++++++++++++++++++--
 9 files changed, 61 insertions(+), 46 deletions(-)

diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 7c9123079c44f..addfec91b950e 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -40,13 +40,13 @@
 class Blip2ImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: (batch_size, num_channels, height, width)"""
+    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
 
 
 class Blip2ImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 2d4f172ce0be6..664d83c193481 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -53,7 +53,7 @@
 class ChameleonImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_channels, height, width)`"""
+    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
 
 
 def get_max_chameleon_image_tokens(ctx: InputContext):
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 7f213287f33b4..6e8c57670a6f5 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -29,7 +29,7 @@
 from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
                    get_clip_num_patches)
 from .interfaces import SupportsMultiModal
-from .utils import (filter_weights, init_vllm_registered_model,
+from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
                     merge_multimodal_embeddings)
 
 IMG_START = '<img>'
@@ -44,7 +44,8 @@ class InternVLImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
-    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+    Shape:
+    `(batch_size, num_images, 1 + num_patches, num_channels, height, width)`
 
     Note that `num_patches` may be different for each batch, in which case
     the data is passed as a list instead of a batched tensor.
@@ -54,7 +55,7 @@ class InternVLImagePixelInputs(TypedDict):
 class InternVLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -413,12 +414,9 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of image embeddings. "
                                  f"Got type: {type(image_embeds)}")
 
-            # Flatten the B and N dimensions
-            image_embeds = image_embeds.flatten(0, 2)
-
             return InternVLImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds),
             )
 
         self.img_context_token_id = image_token_id[0]
@@ -428,12 +426,9 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-            # Flatten the B and N dimensions
-            pixel_values = pixel_values.flatten(0, 2)
-
             return InternVLImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
             )
 
         raise AssertionError("This line should be unreachable.")
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 03a0abf1db481..32226ae9a1108 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -30,13 +30,13 @@
 class LlavaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_channels, height, width)`"""
+    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
 
 
 class LlavaImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 629ec11c1de2a..871d3b6995854 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -29,7 +29,7 @@
 from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
                      dummy_seq_data_for_siglip, get_siglip_image_feature_size,
                      get_siglip_patch_grid_length, input_processor_for_siglip)
-from .utils import (filter_weights, init_vllm_registered_model,
+from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
                     merge_multimodal_embeddings)
 
 logger = init_logger(__name__)
@@ -47,7 +47,8 @@ class LlavaNextImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
-    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+    Shape:
+    `(batch_size, num_images, 1 + num_patches, num_channels, height, width)`
 
     Note that `num_patches` may be different for each batch, in which case
     the data is passed as a list instead of a batched tensor.
@@ -55,7 +56,7 @@ class LlavaNextImagePixelInputs(TypedDict):
 
     image_sizes: NotRequired[torch.Tensor]
     """
-    Shape: `(batch_size, 2)`
+    Shape: `(batch_size, num_images, 2)`
 
     This should be in `(height, width)` format.
     """
@@ -64,7 +65,7 @@ class LlavaNextImagePixelInputs(TypedDict):
 class LlavaNextImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -357,25 +358,15 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-            if not isinstance(image_sizes, torch.Tensor):
+            if not isinstance(image_sizes, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image sizes. "
                                  f"Got type: {type(image_sizes)}")
 
-            # Flatten the B and N dimensions
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = pixel_values.flatten(0, 1)
-            else:
-                pixel_values = [
-                    patch_item for batch_item in pixel_values
-                    for patch_item in batch_item
-                ]
-
-            image_sizes = image_sizes.flatten(0, 1)
-
             return LlavaNextImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
-                image_sizes=self._validate_image_sizes(image_sizes),
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                image_sizes=self._validate_image_sizes(
+                    flatten_bn(image_sizes)),
             )
 
         if image_embeds is not None:
@@ -383,12 +374,9 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of image embeds. "
                                  f"Got type: {type(image_embeds)}")
 
-            # Flatten the B and N dimensions
-            image_embeds = image_embeds.flatten(0, 1)
-
             return LlavaNextImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds),
             )
 
         raise AssertionError("This line should be unreachable.")
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 0700f0c29d708..33b0224147a0a 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -34,13 +34,13 @@
 class PaliGemmaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: (batch_size, num_channels, height, width)"""
+    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
 
 
 class PaliGemmaImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 61f1d73976379..da0796388922f 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -75,7 +75,8 @@ class Phi3VImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
-    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+    Shape:
+    `(batch_size, num_images, 1 + num_patches, num_channels, height, width)`
 
     Note that `num_patches` may be different for each batch, in which case
     the data is passed as a list instead of a batched tensor.
@@ -83,7 +84,7 @@ class Phi3VImagePixelInputs(TypedDict):
 
     image_sizes: torch.Tensor
     """
-    Shape: `(batch_size, 2)`
+    Shape: `(batch_size, num_images, 2)`
 
     This should be in `(height, width)` format.
     """
@@ -92,7 +93,7 @@ class Phi3VImagePixelInputs(TypedDict):
 class Phi3VImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, image_feature_size, hidden_size)`
+    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index c81c2fd114eb8..1a9cf9ad0df62 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -49,7 +49,7 @@
 class UltravoxAudioFeatureInputs(TypedDict):
     type: Literal["audio_features"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, 80, M)"""
+    """Shape: `(batch_size, num_audios, 80, M)`"""
 
 
 class UltravoxAudioEmbeddingInputs(TypedDict):
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 00026b7ebe2e1..ba6d1e2312412 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,4 +1,5 @@
-from typing import Dict, Iterable, List, Optional, Protocol, Tuple
+from typing import (Dict, Iterable, List, Optional, Protocol, Tuple, Union,
+                    overload)
 
 import numpy as np
 import torch
@@ -55,6 +56,35 @@ def init_vllm_registered_model(
     )
 
 
+@overload
+def flatten_bn(x: torch.Tensor) -> torch.Tensor:
+    ...
+
+
+@overload
+def flatten_bn(x: List[torch.Tensor]) -> torch.Tensor:
+    ...
+
+
+@overload
+def flatten_bn(x: List[List[torch.Tensor]]) -> List[torch.Tensor]:
+    ...
+
+
+def flatten_bn(
+    x: Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]
+) -> Union[List[torch.Tensor], torch.Tensor]:
+    """
+    Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.
+
+    The input tensor should have shape ``(B, N, ...)``.
+    """
+    if isinstance(x, torch.Tensor):
+        return x.flatten(0, 1)
+
+    return [x_n for x_b in x for x_n in x_b]
+
+
 def _flatten_embeddings(embeddings: NestedTensors) -> torch.Tensor:
     """
     Recursively concatenates NestedTensors along any heterogeneously sized
@@ -93,7 +123,8 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor,
         This updates ``inputs_embeds`` in place.
     """
     mask = (input_ids == placeholder_token_id)
-    num_expected_tokens = mask.sum()
+    num_expected_tokens = mask.sum().item()
+    assert isinstance(num_expected_tokens, int)
 
     flattened = _flatten_embeddings(multimodal_embeddings)
     *dims, embed_dim = flattened.shape

From 0334d34fb90dad8dffefb9d76d3b3ecd4d875da0 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 09:54:21 +0000
Subject: [PATCH 3/8] Document flattened shapes; validate image sizes per image

---
 vllm/model_executor/models/blip2.py      |  4 +-
 vllm/model_executor/models/chameleon.py  |  2 +-
 vllm/model_executor/models/internvl.py   | 18 +++-----
 vllm/model_executor/models/llava.py      |  4 +-
 vllm/model_executor/models/llava_next.py | 35 ++++++++++------
 vllm/model_executor/models/paligemma.py  |  4 +-
 vllm/model_executor/models/phi3v.py      | 52 +++++++++++++-----------
 vllm/model_executor/models/utils.py      |  9 +---
 8 files changed, 65 insertions(+), 63 deletions(-)

diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index addfec91b950e..8be786fd3f6f5 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -40,13 +40,13 @@
 class Blip2ImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
 class Blip2ImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 664d83c193481..b25f5d521a9bf 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -53,7 +53,7 @@
 class ChameleonImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
 def get_max_chameleon_image_tokens(ctx: InputContext):
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 6e8c57670a6f5..d68148c67e777 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -45,17 +45,17 @@ class InternVLImagePixelInputs(TypedDict):
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
     Shape:
-    `(batch_size, num_images, 1 + num_patches, num_channels, height, width)`
+    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for each batch, in which case
-    the data is passed as a list instead of a batched tensor.
+    Note that `num_patches` may be different for per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
     """
 
 
 class InternVLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -371,14 +371,6 @@ def extract_feature(self, pixel_values):
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
-    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != [2]:
-            raise ValueError(
-                f"The expected image sizes shape is batch dimension plus "
-                f"{[2]}. You supplied {data.shape}.")
-
-        return data
-
     def _validate_pixel_values(
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
@@ -392,7 +384,7 @@ def _validate_shape(d: torch.Tensor):
             if actual_dims != expected_dims:
                 expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    "The expected shape of pixel values in each batch element "
+                    "The expected shape of pixel values per image per batch "
                     f"is {expected_expr}. You supplied {tuple(d.shape)}.")
 
         for d in data:
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 32226ae9a1108..490c93294d50f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -30,13 +30,13 @@
 class LlavaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
 class LlavaImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 871d3b6995854..511e4844a9645 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -48,15 +48,15 @@ class LlavaNextImagePixelInputs(TypedDict):
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
     Shape:
-    `(batch_size, num_images, 1 + num_patches, num_channels, height, width)`
+    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for each batch, in which case
-    the data is passed as a list instead of a batched tensor.
+    Note that `num_patches` may be different for per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
     """
 
-    image_sizes: NotRequired[torch.Tensor]
+    image_sizes: NotRequired[Union[torch.Tensor, List[torch.Tensor]]]
     """
-    Shape: `(batch_size, num_images, 2)`
+    Shape: `(batch_size * num_images, 2)`
 
     This should be in `(height, width)` format.
     """
@@ -65,7 +65,7 @@ class LlavaNextImagePixelInputs(TypedDict):
 class LlavaNextImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -315,11 +315,22 @@ def __init__(self,
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
 
-    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != [2]:
-            raise ValueError(
-                f"The expected image sizes shape is batch dimension plus "
-                f"{[2]}. You supplied {data.shape}.")
+    def _validate_image_sizes(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        expected_dims = (2,)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape)
+    
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    f"The expected shape of image sizes per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
 
         return data
 
@@ -336,7 +347,7 @@ def _validate_shape(d: torch.Tensor):
             if actual_dims != expected_dims:
                 expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    "The expected shape of pixel values in each batch element "
+                    "The expected shape of pixel values per image per batch "
                     f"is {expected_expr}. You supplied {tuple(d.shape)}.")
 
         for d in data:
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 33b0224147a0a..46ee4c3208b7a 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -34,13 +34,13 @@
 class PaliGemmaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, num_channels, height, width)`"""
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
 
 
 class PaliGemmaImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: torch.Tensor
-    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index da0796388922f..a46e5b21b4fd8 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -44,7 +44,7 @@
 
 from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsMultiModal
-from .utils import merge_multimodal_embeddings
+from .utils import flatten_bn, merge_multimodal_embeddings
 
 logger = init_logger(__name__)
 
@@ -76,15 +76,15 @@ class Phi3VImagePixelInputs(TypedDict):
     data: Union[torch.Tensor, List[torch.Tensor]]
     """
     Shape:
-    `(batch_size, num_images, 1 + num_patches, num_channels, height, width)`
+    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for each batch, in which case
-    the data is passed as a list instead of a batched tensor.
+    Note that `num_patches` may be different for per batch and image,
+    in which case the data is passed as a list instead of a batched tensor.
     """
 
-    image_sizes: torch.Tensor
+    image_sizes: Union[torch.Tensor, List[torch.Tensor]]
     """
-    Shape: `(batch_size, num_images, 2)`
+    Shape: `(batch_size * num_images, 2)`
 
     This should be in `(height, width)` format.
     """
@@ -93,7 +93,7 @@ class Phi3VImagePixelInputs(TypedDict):
 class Phi3VImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, num_images, image_feature_size, hidden_size)`
+    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
     """
@@ -511,11 +511,22 @@ def __init__(self,
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
-    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
-        if list(data.shape[1:]) != [2]:
-            raise ValueError(
-                f"The expected shape of image sizes is batch dimension plus "
-                f"{[2]}. You supplied {tuple(data.shape)}.")
+    def _validate_image_sizes(
+        self, data: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        expected_dims = (2,)
+
+        def _validate_shape(d: torch.Tensor):
+            actual_dims = tuple(d.shape)
+    
+            if actual_dims != expected_dims:
+                expected_expr = str(expected_dims)
+                raise ValueError(
+                    f"The expected shape of image sizes per image per batch "
+                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+
+        for d in data:
+            _validate_shape(d)
 
         return data
 
@@ -532,7 +543,7 @@ def _validate_shape(d: torch.Tensor):
             if actual_dims != expected_dims:
                 expected_expr = ("num_patches", *map(str, expected_dims))
                 raise ValueError(
-                    "The expected shape of pixel values in each batch element "
+                    "The expected shape of pixel values per image per batch "
                     f"is {expected_expr}. You supplied {tuple(d.shape)}.")
 
         for d in data:
@@ -561,26 +572,19 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of image sizes. "
                                  f"Got type: {type(image_sizes)}")
 
-            # Merge the B and N dimensions.
-            if isinstance(pixel_values, torch.Tensor):
-                pixel_values = pixel_values.flatten(0, 1)
-            else:
-                pixel_values = torch.cat(pixel_values)
-
-            image_sizes = image_sizes.flatten(0, 1)
-
             return Phi3VImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(pixel_values),
-                image_sizes=self._validate_image_sizes(image_sizes))
+                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                image_sizes=self._validate_image_sizes(flatten_bn(image_sizes)))
 
         if image_embeds is not None:
             if not isinstance(image_embeds, torch.Tensor):
                 raise ValueError("Incorrect type of image embeddings. "
                                  f"Got type: {type(image_embeds)}")
+
             return Phi3VImageEmbeddingInputs(
                 type="image_embeds",
-                data=image_embeds,
+                data=flatten_bn(image_embeds),
             )
 
         raise AssertionError("This line should be unreachable.")
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index ba6d1e2312412..78fd6abd08ff5 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -62,17 +62,12 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor:
 
 
 @overload
-def flatten_bn(x: List[torch.Tensor]) -> torch.Tensor:
-    ...
-
-
-@overload
-def flatten_bn(x: List[List[torch.Tensor]]) -> List[torch.Tensor]:
+def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]:
     ...
 
 
 def flatten_bn(
-    x: Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]
+    x: Union[List[torch.Tensor], torch.Tensor]
 ) -> Union[List[torch.Tensor], torch.Tensor]:
     """
     Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.

From 89e606738c4ba3a5392347b58266f37363468e58 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 09:55:09 +0000
Subject: [PATCH 4/8] Fix typo and formatting

---
 vllm/model_executor/models/internvl.py   | 2 +-
 vllm/model_executor/models/llava_next.py | 6 +++---
 vllm/model_executor/models/phi3v.py      | 9 +++++----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index d68148c67e777..b02d54389a54d 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -47,7 +47,7 @@ class InternVLImagePixelInputs(TypedDict):
     Shape:
     `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for per batch and image,
+    Note that `num_patches` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 511e4844a9645..b9636f769fe62 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -50,7 +50,7 @@ class LlavaNextImagePixelInputs(TypedDict):
     Shape:
     `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for per batch and image,
+    Note that `num_patches` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
 
@@ -318,11 +318,11 @@ def __init__(self,
     def _validate_image_sizes(
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        expected_dims = (2,)
+        expected_dims = (2, )
 
         def _validate_shape(d: torch.Tensor):
             actual_dims = tuple(d.shape)
-    
+
             if actual_dims != expected_dims:
                 expected_expr = str(expected_dims)
                 raise ValueError(
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index a46e5b21b4fd8..26ba4f8645164 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -78,7 +78,7 @@ class Phi3VImagePixelInputs(TypedDict):
     Shape:
     `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
 
-    Note that `num_patches` may be different for per batch and image,
+    Note that `num_patches` may be different per batch and image,
     in which case the data is passed as a list instead of a batched tensor.
     """
 
@@ -514,11 +514,11 @@ def __init__(self,
     def _validate_image_sizes(
         self, data: Union[torch.Tensor, List[torch.Tensor]]
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        expected_dims = (2,)
+        expected_dims = (2, )
 
         def _validate_shape(d: torch.Tensor):
             actual_dims = tuple(d.shape)
-    
+
             if actual_dims != expected_dims:
                 expected_expr = str(expected_dims)
                 raise ValueError(
@@ -575,7 +575,8 @@ def _parse_and_validate_image_input(
             return Phi3VImagePixelInputs(
                 type="pixel_values",
                 data=self._validate_pixel_values(flatten_bn(pixel_values)),
-                image_sizes=self._validate_image_sizes(flatten_bn(image_sizes)))
+                image_sizes=self._validate_image_sizes(
+                    flatten_bn(image_sizes)))
 
         if image_embeds is not None:
             if not isinstance(image_embeds, torch.Tensor):

From f98887fa04698d1a8b76edbe88f456d8360db7e5 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 10:01:27 +0000
Subject: [PATCH 5/8] Simplify

---
 vllm/model_executor/models/llava_next.py |  8 +++-----
 vllm/model_executor/models/phi3v.py      |  8 +++-----
 vllm/model_executor/models/utils.py      | 20 +++++++++++++++++---
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index b9636f769fe62..048ca16974e3c 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -54,7 +54,7 @@ class LlavaNextImagePixelInputs(TypedDict):
     in which case the data is passed as a list instead of a batched tensor.
     """
 
-    image_sizes: NotRequired[Union[torch.Tensor, List[torch.Tensor]]]
+    image_sizes: NotRequired[torch.Tensor]
     """
     Shape: `(batch_size * num_images, 2)`
 
@@ -315,9 +315,7 @@ def __init__(self,
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
 
-    def _validate_image_sizes(
-        self, data: Union[torch.Tensor, List[torch.Tensor]]
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         expected_dims = (2, )
 
         def _validate_shape(d: torch.Tensor):
@@ -377,7 +375,7 @@ def _parse_and_validate_image_input(
                 type="pixel_values",
                 data=self._validate_pixel_values(flatten_bn(pixel_values)),
                 image_sizes=self._validate_image_sizes(
-                    flatten_bn(image_sizes)),
+                    flatten_bn(image_sizes, concat=True)),
             )
 
         if image_embeds is not None:
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 26ba4f8645164..370c34a9fbb69 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -511,9 +511,7 @@ def __init__(self,
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
 
-    def _validate_image_sizes(
-        self, data: Union[torch.Tensor, List[torch.Tensor]]
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         expected_dims = (2, )
 
         def _validate_shape(d: torch.Tensor):
@@ -568,7 +566,7 @@ def _parse_and_validate_image_input(
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
-            if not isinstance(image_sizes, torch.Tensor):
+            if not isinstance(image_sizes, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of image sizes. "
                                  f"Got type: {type(image_sizes)}")
 
@@ -576,7 +574,7 @@ def _parse_and_validate_image_input(
                 type="pixel_values",
                 data=self._validate_pixel_values(flatten_bn(pixel_values)),
                 image_sizes=self._validate_image_sizes(
-                    flatten_bn(image_sizes)))
+                    flatten_bn(image_sizes, concat=True)))
 
         if image_embeds is not None:
             if not isinstance(image_embeds, torch.Tensor):
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 78fd6abd08ff5..6e7ee511bf27f 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,5 +1,5 @@
-from typing import (Dict, Iterable, List, Optional, Protocol, Tuple, Union,
-                    overload)
+from typing import (Dict, Iterable, List, Literal, Optional, Protocol, Tuple,
+                    Union, overload)
 
 import numpy as np
 import torch
@@ -66,8 +66,19 @@ def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]:
     ...
 
 
+@overload
+def flatten_bn(
+    x: Union[List[torch.Tensor], torch.Tensor],
+    *,
+    concat: Literal[True],
+) -> torch.Tensor:
+    ...
+
+
 def flatten_bn(
-    x: Union[List[torch.Tensor], torch.Tensor]
+    x: Union[List[torch.Tensor], torch.Tensor],
+    *,
+    concat: bool = False,
 ) -> Union[List[torch.Tensor], torch.Tensor]:
     """
     Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.
@@ -77,6 +88,9 @@ def flatten_bn(
     if isinstance(x, torch.Tensor):
         return x.flatten(0, 1)
 
+    if concat:
+        return torch.cat(x)
+
     return [x_n for x_b in x for x_n in x_b]
 
 

From 058f8ecabf0ddfd912f5e00e90dc8f5533161088 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 10:05:01 +0000
Subject: [PATCH 6/8] Fix docstring

---
 vllm/model_executor/models/ultravox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 1a9cf9ad0df62..03d6223225511 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -49,7 +49,7 @@
 class UltravoxAudioFeatureInputs(TypedDict):
     type: Literal["audio_features"]
     data: Union[torch.Tensor, List[torch.Tensor]]
-    """Shape: `(batch_size, num_audios, 80, M)"""
+    """Shape: `(batch_size * num_audios, 80, M)`"""
 
 
 class UltravoxAudioEmbeddingInputs(TypedDict):

From 88748cb6c1b268e0b137bae3442d074a3c67422d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 10:06:37 +0000
Subject: [PATCH 7/8] Fix type annotation

---
 vllm/model_executor/models/phi3v.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 370c34a9fbb69..bec1d35388506 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -82,7 +82,7 @@ class Phi3VImagePixelInputs(TypedDict):
     in which case the data is passed as a list instead of a batched tensor.
     """
 
-    image_sizes: Union[torch.Tensor, List[torch.Tensor]]
+    image_sizes: torch.Tensor
     """
     Shape: `(batch_size * num_images, 2)`
 

From f99d4cc8c966a4f913d04aefd3b4f8db0f09d55d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 28 Aug 2024 10:27:35 +0000
Subject: [PATCH 8/8] Fix InternVL

---
 vllm/model_executor/models/internvl.py | 23 ++++++++++-------------
 vllm/multimodal/base.py                |  4 ++--
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index b02d54389a54d..ca4d773190e0f 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -42,19 +42,16 @@
 
 class InternVLImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: Union[torch.Tensor, List[torch.Tensor]]
+    data: torch.Tensor
     """
     Shape:
-    `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
-
-    Note that `num_patches` may be different per batch and image,
-    in which case the data is passed as a list instead of a batched tensor.
+    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
     """
 
 
 class InternVLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
-    data: Union[torch.Tensor, List[torch.Tensor]]
+    data: torch.Tensor
     """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
 
     `hidden_size` must match the hidden size of language model backbone.
@@ -358,7 +355,7 @@ def pixel_shuffle(self, x, scale_factor=0.5):
             x = x.permute(0, 2, 1, 3).contiguous()
         return x
 
-    def extract_feature(self, pixel_values):
+    def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
         vit_embeds = self.vision_model(pixel_values=pixel_values)
         vit_embeds = vit_embeds[:, 1:, :]
 
@@ -371,9 +368,7 @@ def extract_feature(self, pixel_values):
         vit_embeds = self.mlp1(vit_embeds)
         return vit_embeds
 
-    def _validate_pixel_values(
-        self, data: Union[torch.Tensor, List[torch.Tensor]]
-    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
 
         h = w = self.config.vision_config.image_size
         expected_dims = (3, h, w)
@@ -382,10 +377,11 @@ def _validate_shape(d: torch.Tensor):
             actual_dims = tuple(d.shape)
 
             if actual_dims != expected_dims:
-                expected_expr = ("num_patches", *map(str, expected_dims))
+                expected_expr = str(expected_dims)
                 raise ValueError(
                     "The expected shape of pixel values per image per batch "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
+                    f" per patch is {expected_expr}. "
+                    f"You supplied {tuple(d.shape)}.")
 
         for d in data:
             _validate_shape(d)
@@ -420,7 +416,8 @@ def _parse_and_validate_image_input(
 
             return InternVLImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_pixel_values(flatten_bn(pixel_values)),
+                data=self._validate_pixel_values(
+                    flatten_bn(pixel_values, concat=True).flatten(0, 1)),
             )
 
         raise AssertionError("This line should be unreachable.")
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index f26e3292c264d..c02e61596927a 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -18,7 +18,7 @@
 
 logger = init_logger(__name__)
 
-NestedTensors = Union[List["NestedTensors"], torch.Tensor]
+NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor]
 """
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
@@ -61,7 +61,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors:
         tensors_ = cast(List[torch.Tensor], stacked)
         if any(t.shape != tensors_[0].shape for t in tensors_):
             # The tensors have incompatible shapes and can't be stacked.
-            return stacked
+            return tensors_
 
         return torch.stack(tensors_)