add initial design for uniform processors + align model #31197

Merged
merged 49 commits from uniform_processors_1 into main on Jun 13, 2024

Changes from 1 commit

Commits (49)
b85036f  add initial design for uniform processors + align model (molbap, Jun 3, 2024)
bb8ac70  fix mutable default :eyes: (molbap, Jun 3, 2024)
cd8c601  add configuration test (molbap, Jun 3, 2024)
f00c852  handle structured kwargs w defaults + add test (molbap, Jun 3, 2024)
693036f  protect torch-specific test (molbap, Jun 3, 2024)
766da3a  fix style (molbap, Jun 3, 2024)
844394d  fix (molbap, Jun 3, 2024)
c19bbc6  fix assertEqual (molbap, Jun 4, 2024)
3c38119  move kwargs merging to processing common (molbap, Jun 4, 2024)
81ae819  rework kwargs for type hinting (molbap, Jun 5, 2024)
ce4abcd  just get Unpack from extensions (molbap, Jun 7, 2024)
3acdf28  run-slow[align] (molbap, Jun 7, 2024)
404239f  handle kwargs passed as nested dict (molbap, Jun 7, 2024)
603be40  add from_pretrained test for nested kwargs handling (molbap, Jun 7, 2024)
71c9d6c  [run-slow]align (molbap, Jun 7, 2024)
26383c5  update documentation + imports (molbap, Jun 7, 2024)
4521f4f  update audio inputs (molbap, Jun 7, 2024)
b96eb64  protect audio types, silly (molbap, Jun 7, 2024)
9c5c01c  try removing imports (molbap, Jun 7, 2024)
3ccb505  make things simpler (molbap, Jun 7, 2024)
142acf3  simplerer (molbap, Jun 7, 2024)
60a5730  move out kwargs test to common mixin (molbap, Jun 10, 2024)
be6c141  [run-slow]align (molbap, Jun 10, 2024)
84135d7  skip tests for old processors (molbap, Jun 10, 2024)
ce967ac  [run-slow]align, clip (molbap, Jun 10, 2024)
f78ec52  !$#@!! protect imports, darn it (molbap, Jun 10, 2024)
52fd5ad  [run-slow]align, clip (molbap, Jun 10, 2024)
8f21abe  Merge branch 'main' into uniform_processors_1 (molbap, Jun 10, 2024)
d510030  [run-slow]align, clip (molbap, Jun 10, 2024)
fd43bcd  update doc (molbap, Jun 11, 2024)
b2cd7c9  improve documentation for default values (molbap, Jun 11, 2024)
bcbd646  add model_max_length testing (molbap, Jun 11, 2024)
39c1587  Raise if kwargs are specified in two places (molbap, Jun 11, 2024)
1f73bdf  fix (molbap, Jun 11, 2024)
b3f98ba  Merge branch 'main' into uniform_processors_1 (molbap, Jun 11, 2024)
e4d6d12  expand VideoInput (molbap, Jun 12, 2024)
1e09e4a  fix (molbap, Jun 12, 2024)
d4232f0  fix style (molbap, Jun 12, 2024)
162b1a7  remove defaults values (molbap, Jun 12, 2024)
0da1dc3  add comment to indicate documentation on adding kwargs (molbap, Jun 12, 2024)
f955510  Merge branch 'main' into uniform_processors_1 (molbap, Jun 12, 2024)
f6f1dac  protect imports (molbap, Jun 12, 2024)
c4b7e84  [run-slow]align (molbap, Jun 12, 2024)
3ce3608  fix (molbap, Jun 12, 2024)
6b83e39  remove set() that breaks ordering (molbap, Jun 13, 2024)
3818b86  test more (molbap, Jun 13, 2024)
31b7a60  removed unused func (molbap, Jun 13, 2024)
4072336  [run-slow]align (molbap, Jun 13, 2024)
bcce007  Merge branch 'main' into uniform_processors_1 (molbap, Jun 13, 2024)
48 changes: 15 additions & 33 deletions src/transformers/models/align/processing_align.py
@@ -16,7 +16,7 @@
Image/Text processor class for ALIGN
"""

from typing import List, Union
from typing import List, Union, Unpack

from ...image_utils import ImageInput
from ...processing_utils import (
@@ -37,39 +37,26 @@
import torch # noqa: F401


class AlignProcessorKwargs(ProcessingKwargs, total=False):
class AlignProcessorKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False):
molbap marked this conversation as resolved.
"""
Inherits from `ProcessingKwargs` to provide:
1) Additional keys that this model requires to process inputs.
2) Default values for extra keys.
New keys have to be defined as follows to ensure type hinting is done correctly.

```python
common_kwargs: CommonKwargs = {
**CommonKwargs.__annotations__,
}
text_kwargs: TextKwargs = {
**TextKwargs.__annotations__,
"a_new_text_boolean_key": Optional[bool],
}
images_kwargs: ImagesKwargs = {
**ImagesKwargs.__annotations__,
"a_new_image_processing_key": Optional[int]
}
```
images_kwargs: ImagesKwargs = {"new_image_kwarg": Optional[bool]}

"""

common_kwargs: CommonKwargs = {
**CommonKwargs.__annotations__,
}
text_kwargs: TextKwargs = {
**TextKwargs.__annotations__,
}
images_kwargs: ImagesKwargs = {
**ImagesKwargs.__annotations__,
_defaults = {
"text_kwargs": {
"padding": "max_length",
"max_length": 64,
},
}

```
"""

_defaults = {
Collaborator:

Suggested change:

    _defaults = {
        padding: "max_length",
        max_length: 64,
    }

should work no? Or does it not update the default for type-hints?

Contributor Author (molbap):

yes, it works for sure, this was to have a structured dict for defaults. Can change :)

Contributor Author (molbap):

ah, now I remember: it actually can't work like that, since TypedDicts don't support default values. They are made to hold the layout. They can have arbitrary attributes, but those won't be passed along as defaults the way a dataclass field would be, and with a dataclass we'd lose the typing, hence the manual operation.
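For illustration, a minimal standalone sketch of the distinction being made here (the TextKwargs below is a simplified stand-in, not the transformers class):

```python
from dataclasses import dataclass

from typing_extensions import TypedDict


class TextKwargs(TypedDict, total=False):
    padding: str
    max_length: int


# A TypedDict only declares the shape of a dict: instances are plain
# dicts, and nothing fills in default values for missing keys.
kwargs: TextKwargs = {}
print(kwargs.get("padding"))  # None, no default was applied


# A dataclass does carry defaults, but its instances are objects rather
# than dicts, so the dict-shaped typing of **kwargs would be lost.
@dataclass
class TextKwargsWithDefaults:
    padding: str = "max_length"
    max_length: int = 64


print(TextKwargsWithDefaults().padding)  # "max_length"
```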

Collaborator:

ok got it, thanks! Let's maybe add a comment about this!

Collaborator:

Do we have a comment for future code inspectors? I'm assuming here isn't the best place (we don't want it for all models), but I didn't find a corresponding one elsewhere on a quick skim.

Contributor Author (molbap):

On that: there's documentation in processing_utils.ProcessingKwargs, and I added a comment nudging users to check there!

"text_kwargs": {
"padding": "max_length",
@@ -106,9 +93,10 @@ class AlignProcessor(ProcessorMixin):

processor(images=your_pil_image, text=["What is that?"], **all_kwargs)

# passing directly any number of kwargs is also supported, but not recommended
# passing directly any number of kwargs flattened is also supported

processor(images=your_pil_image, text=["What is that?"], padding="do_not_pad")
all_kwargs = {"return_tensors": "pt", "crop_size": {"height": 214, "width": 214}, "padding": "max_length", "max_length": 76}
processor(images=your_pil_image, text=["What is that?"], **all_kwargs)
```

Args:
@@ -132,10 +120,7 @@ def __call__(
images: ImageInput = None,
audio=None,
videos=None,
text_kwargs: AlignProcessorKwargs.text_kwargs = None,
images_kwargs: AlignProcessorKwargs.images_kwargs = None,
common_kwargs: AlignProcessorKwargs.common_kwargs = None,
**kwargs: AlignProcessorKwargs,
**kwargs: Unpack[AlignProcessorKwargs],
) -> BatchEncoding:
"""
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
@@ -171,9 +156,6 @@
raise ValueError("You must specify either text or images.")
output_kwargs = self._merge_kwargs(
AlignProcessorKwargs,
text_kwargs=text_kwargs,
images_kwargs=images_kwargs,
common_kwargs=common_kwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
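Taken together, the pattern this file now follows can be sketched as below. MyModelProcessorKwargs and MyModelProcessor are hypothetical names, and the sketch tracks this commit's design rather than any final API:

```python
from typing_extensions import Unpack

from transformers.processing_utils import ProcessingKwargs, ProcessorMixin


class MyModelProcessorKwargs(ProcessingKwargs, total=False):
    # TypedDict subclasses cannot hold runtime defaults, so per-model
    # defaults live in _defaults and are merged in at call time.
    _defaults = {
        "text_kwargs": {
            "padding": "max_length",
            "max_length": 64,
        },
    }


class MyModelProcessor(ProcessorMixin):
    # tokenizer / image processor wiring omitted for brevity

    def __call__(self, text=None, images=None, **kwargs: Unpack[MyModelProcessorKwargs]):
        output_kwargs = self._merge_kwargs(
            MyModelProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        # each sub-processor then receives its own slice, e.g.
        # self.tokenizer(text, **output_kwargs["text_kwargs"])
        return output_kwargs
```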
75 changes: 34 additions & 41 deletions src/transformers/processing_utils.py
@@ -244,12 +244,22 @@ class CommonKwargs(TypedDict, total=False):
return_tensors: Optional[Union[str, TensorType]]


class ProcessingKwargs(TypedDict, total=False):
common_kwargs: CommonKwargs
text_kwargs: TextKwargs
images_kwargs: ImagesKwargs
audio_kwargs: AudioKwargs
videos_kwargs: VideosKwargs
class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False):
common_kwargs: CommonKwargs = {
**CommonKwargs.__annotations__,
}
text_kwargs: TextKwargs = {
**TextKwargs.__annotations__,
}
images_kwargs: ImagesKwargs = {
**ImagesKwargs.__annotations__,
}
videos_kwargs: VideosKwargs = {
**VideosKwargs.__annotations__,
}
audio_kwargs: AudioKwargs = {
**AudioKwargs.__annotations__,
}


class ProcessorMixin(PushToHubMixin):
@@ -610,11 +620,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
def _merge_kwargs(
self,
ModelProcessorKwargs: ProcessingKwargs,
text_kwargs: Optional[TextKwargs] = None,
images_kwargs: Optional[ImagesKwargs] = None,
common_kwargs: Optional[CommonKwargs] = None,
videos_kwargs: Optional[VideosKwargs] = None,
audio_kwargs: Optional[AudioKwargs] = None,
tokenizer_init_kwargs: Optional[Dict] = None,
**kwargs,
) -> Dict[str, Dict]:
@@ -648,30 +653,21 @@ def _merge_kwargs(
Args:
ModelProcessorKwargs (`ProcessingKwargs`):
Typed dictionary of kwargs specifically required by the model passed.
text_kwargs (`TextKwargs`, *optional*):
Typed dictionary of kwargs inputs applied to the text modality processor, i.e. the tokenizer.
images_kwargs (`ImagesKwargs`, *optional*):
Typed dictionary of kwargs inputs applied to the images modality processor.
videos_kwargs (`VideosKwargs`, *optional*):
Typed dictionary of kwargs inputs applied to the videos modality processor.
audio_kwargs (`AudioKwargs`, *optional*):
Typed dictionary of kwargs inputs applied to the audio modality processor.
tokenizer_init_kwargs (`Dict`, *optional*):
Dictionary of kwargs the tokenizer was instantiated with and that need to take precedence over other kwargs.
Dictionary of kwargs the tokenizer was instantiated with and that need to take precedence over defaults.

Returns:
output_kwargs (`Dict`):
Dictionary of per-modality kwargs to be passed to each modality-specific processor.

"""

# Initialize dictionaries
output_kwargs = {
"text_kwargs": text_kwargs or {},
"images_kwargs": images_kwargs or {},
"audio_kwargs": audio_kwargs or {},
"videos_kwargs": videos_kwargs or {},
"common_kwargs": common_kwargs or {},
"text_kwargs": {},
"images_kwargs": {},
"audio_kwargs": {},
"videos_kwargs": {},
"common_kwargs": {},
}

default_kwargs = {
@@ -685,31 +681,28 @@
# get defaults from set model processor kwargs if they exist
for modality in default_kwargs:
default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
# then override with tokenizer-level arguments passed
if tokenizer_init_kwargs:
default_kwargs["text_kwargs"].update(
{k: v for k, v in tokenizer_init_kwargs.items() if k in ModelProcessorKwargs.text_kwargs}
)

# then get passed per-modality dictionaries if they exist
# update modality kwargs with passed kwargs
for modality in output_kwargs:
output_kwargs[modality] = {
**default_kwargs[modality],
**output_kwargs[modality],
**kwargs.pop(modality, {}),
}
# then merge kwargs by name
for modality_key in ModelProcessorKwargs[modality].__annotations__.keys():
modality_kwarg_value = kwargs.pop(modality_key, None)
if modality_kwarg_value is not None:
output_kwargs[modality] = modality_kwarg_value
for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
# init with tokenizer init kwargs if necessary
if modality_key in tokenizer_init_kwargs:
output_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key]
# check if we received a structured kwarg dict or not to handle it correctly
if modality in kwargs:
kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
else:
kwarg_value = kwargs.pop(modality_key, "__empty__")
Member:

Might there be a situation where we have the same key both in **kwargs and in a modality kwargs dict? I am getting a strange error if the same key is provided in both:

    result = processor(
        images=image,
        text=["What is that?"],
        crop_size={"height": 256, "width": 256},
        images_kwargs={"crop_size": {"height": 224, "width": 224}},
    )
    TypeError: PreTrainedTokenizerFast._batch_encode_plus() got an unexpected keyword argument 'height'

I guess the priority should work here, or a proper error should be raised.

Contributor Author (molbap):

ah yes, good catch! I think raising an error if both keys are defined is easier; the responsibility of choosing should be on the user side imo.

Contributor Author (molbap), Jun 11, 2024:

Should be fixed :) The example you included now raises a ValueError:

    ValueError: Keyword argument crop_size was passed two times: in a dictionary for images_kwargs and as a **kwarg.
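As a standalone illustration of that fix (ensure_not_duplicated is a hypothetical helper; the real check lives inside ProcessorMixin._merge_kwargs):

```python
def ensure_not_duplicated(key: str, nested_kwargs: dict, flat_kwargs: dict) -> None:
    # Reject a kwarg that arrives both nested in a modality dict and
    # flattened as a top-level **kwarg, instead of silently picking one.
    if key in nested_kwargs and key in flat_kwargs:
        raise ValueError(
            f"Keyword argument {key} was passed two times: in a dictionary "
            f"for images_kwargs and as a **kwarg."
        )


try:
    ensure_not_duplicated(
        "crop_size",
        nested_kwargs={"crop_size": {"height": 224, "width": 224}},
        flat_kwargs={"crop_size": {"height": 256, "width": 256}},
    )
except ValueError as err:
    print(err)  # the ValueError quoted above
```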

if kwarg_value != "__empty__":
output_kwargs[modality][modality_key] = kwarg_value

# if something remains in kwargs, it belongs to common
output_kwargs["common_kwargs"].update(kwargs)
# all modality-specific kwargs are updated with common kwargs
for modality in output_kwargs:
output_kwargs[modality].update(output_kwargs["common_kwargs"])

return output_kwargs

@classmethod
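To make the resulting precedence concrete, here is a simplified sketch of the merge order implemented above, using plain dicts (illustrative only): model-level _defaults are overridden by tokenizer init kwargs, which are overridden by call-time kwargs, and common kwargs are broadcast to every modality at the end:

```python
# Precedence: model _defaults < tokenizer init kwargs < call-time kwargs.
defaults = {"padding": "max_length", "max_length": 64}  # from _defaults["text_kwargs"]
tokenizer_init_kwargs = {"max_length": 117}             # from tokenizer instantiation
call_kwargs = {"padding": "longest", "return_tensors": "pt"}

text_kwargs = dict(defaults)                 # start from the model defaults
text_kwargs.update(tokenizer_init_kwargs)    # tokenizer init overrides defaults
text_kwargs.update(                          # call-time text kwargs win over both
    {k: v for k, v in call_kwargs.items() if k in ("padding", "max_length")}
)
text_kwargs.update({"return_tensors": call_kwargs["return_tensors"]})  # common kwargs last

print(text_kwargs)  # {'padding': 'longest', 'max_length': 117, 'return_tensors': 'pt'}
```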
48 changes: 42 additions & 6 deletions tests/models/align/test_processor_align.py
@@ -205,7 +205,8 @@ def test_model_input_names(self):

self.assertListEqual(list(inputs.keys()), processor.model_input_names)

def test_defaults_preserved(self):
# TODO move these tests to a common Mixin
def test_defaults_preserved_kwargs(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer(max_length=117)

@@ -218,6 +219,19 @@

self.assertEqual(len(inputs["input_ids"]), 117)

@require_torch
def test_defaults_preserved_image_kwargs(self):
image_processor = self.get_image_processor(crop_size=(234, 234))
tokenizer = self.get_tokenizer(max_length=117)

processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor)

input_str = "lower newer"
image_input = self.prepare_image_inputs()

inputs = processor(text=input_str, images=image_input)
self.assertEqual(len(inputs["pixel_values"][0][0]), 234)

@require_torch
def test_structured_kwargs(self):
image_processor = self.get_image_processor()
@@ -229,12 +243,34 @@
image_input = self.prepare_image_inputs()

# Define the kwargs for each modality
common_kwargs = {"return_tensors": "pt"}
images_kwargs = {"crop_size": {"height": 214, "width": 214}}
text_kwargs = {"padding": "max_length", "max_length": 76}
all_kwargs = {
"return_tensors": "pt",
"crop_size": {"height": 214, "width": 214},
"padding": "max_length",
"max_length": 76,
}

inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[2], 214)

# Combine them into a single dictionary
all_kwargs = {"images_kwargs": images_kwargs, "text_kwargs": text_kwargs, "common_kwargs": common_kwargs}
self.assertEqual(len(inputs["input_ids"][0]), 76)

@require_torch
def test_structured_kwargs_nested(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()

processor = AlignProcessor(tokenizer=tokenizer, image_processor=image_processor)

input_str = "lower newer"
image_input = self.prepare_image_inputs()

# Define the kwargs for each modality
all_kwargs = {
"common_kwargs": {"return_tensors": "pt"},
"images_kwargs": {"crop_size": {"height": 214, "width": 214}},
"text_kwargs": {"padding": "max_length", "max_length": 76},
}

inputs = processor(text=input_str, images=image_input, **all_kwargs)
self.assertEqual(inputs["pixel_values"].shape[2], 214)