huggingface · amyeroberts · Jun 13, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 11, 2024
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
@@ -15,6 +15,7 @@
 
 import base64
 import os
+import warnings
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
 
@@ -202,7 +203,12 @@ def infer_channel_dimension_format(
     else:
         raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
 
-    if image.shape[first_dim] in num_channels:
+    if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
+        warnings.warn(
+            f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension. Please specify with input_data_format if this is incorrect."
+        )
+        return ChannelDimension.FIRST
+    elif image.shape[first_dim] in num_channels:
         return ChannelDimension.FIRST
     elif image.shape[last_dim] in num_channels:
         return ChannelDimension.LAST

diff --git a/src/transformers/models/siglip/image_processing_siglip.py b/src/transformers/models/siglip/image_processing_siglip.py
@@ -216,6 +216,7 @@ def preprocess(
             size=size,
             resample=resample,
         )
+
         # All transformations expect numpy arrays.
         images = [to_numpy_array(image) for image in images]