From 41a39da78c58496afc1bb9c2a950c1029760db50 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 10:46:39 +0100 Subject: [PATCH 01/11] first draft --- haystack/nodes/image_to_text/__init__.py | 2 + haystack/nodes/image_to_text/base.py | 55 ++++++++ haystack/nodes/image_to_text/transformers.py | 140 +++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 haystack/nodes/image_to_text/__init__.py create mode 100644 haystack/nodes/image_to_text/base.py create mode 100644 haystack/nodes/image_to_text/transformers.py diff --git a/haystack/nodes/image_to_text/__init__.py b/haystack/nodes/image_to_text/__init__.py new file mode 100644 index 0000000000..44a8bb419c --- /dev/null +++ b/haystack/nodes/image_to_text/__init__.py @@ -0,0 +1,2 @@ +from haystack.nodes.image_to_text.base import BaseImageToText +from haystack.nodes.image_to_text.transformers import TransformersImageToText diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py new file mode 100644 index 0000000000..3c0542d94f --- /dev/null +++ b/haystack/nodes/image_to_text/base.py @@ -0,0 +1,55 @@ +from typing import List, Optional + +from abc import abstractmethod + +from haystack.schema import Document +from haystack.nodes.base import BaseComponent + + +class BaseImageToText(BaseComponent): + """ + Abstract class for ImageToText + """ + + outgoing_edges = 1 + + @abstractmethod + def generate_captions( + self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + ) -> List[Document]: + """ + Abstract method for generating captions. + + :param image_file_paths: Paths of the images + :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + :param batch_size: Number of images to process at a time. + :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. + """ + pass + + def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None): # type: ignore + + if file_paths is None and documents is None: + raise ValueError("You must either specify documents or image file_paths to process.") + if file_paths is not None and documents is not None: + raise ValueError( + "You specified both documents and image_file_paths. You need to specify only one of the two parameters." 
+ ) + if file_paths is not None: + image_file_paths = file_paths + if documents is not None: + if any((doc.content_type != "image" for doc in documents)): + raise ValueError("The ImageToText node only supports image documents.") + image_file_paths = [doc.content for doc in documents] + + results: dict = {} + results["documents"] = self.generate_captions(image_file_paths=image_file_paths) + + return results, "output_1" + + def run_batch( + self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None + ): # type: ignore + + return self.run(file_paths=file_paths, documents=documents) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py new file mode 100644 index 0000000000..189be55e9b --- /dev/null +++ b/haystack/nodes/image_to_text/transformers.py @@ -0,0 +1,140 @@ +from typing import List, Optional, Union + +import logging + +import torch +from tqdm.auto import tqdm +from transformers import pipeline + +from haystack.schema import Document +from haystack.nodes.image_to_text.base import BaseImageToText +from haystack.modeling.utils import initialize_device_settings +from haystack.utils.torch_utils import ListDataset + +logger = logging.getLogger(__name__) + + +class TransformersImageToText(BaseImageToText): + """ + Transformer based model to generate captions for images using the HuggingFace's transformers framework + + See the up-to-date list of available models on + `huggingface.co/models `__ + + **Example** + + ```python + image_file_paths = ["/path/to/images/apple.jpg", + "/path/to/images/cat.jpg", ] + + # Generate captions + documents = image_to_text.generate_captions(image_file_paths=image_file_paths) + + # Show results (List of Documents, containing caption and image file_path) + print(documents) + + [ + { + "content": "a red apple is sitting on a pile of hay", + ... + "meta": { + "image_file_path": "/path/to/images/apple.jpg", + ... + }, + ... + }, + ... + ] + ``` + """ + + def __init__( + self, + model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning", + model_version: Optional[str] = None, + generate_kwargs: Optional[dict] = None, + use_gpu: bool = True, + batch_size: int = 16, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, + ): + """ + Load an Image To Text model from Transformers. + See the up-to-date list of available models at + https://huggingface.co/models?pipeline_tag=image-to-text + + :param model_name_or_path: Directory of a saved model or the name of a public model. + See https://huggingface.co/models?pipeline_tag=image-to-text for full list of available models. + :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. + :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + :param use_gpu: Whether to use GPU (if available). + :param batch_size: Number of documents to process at a time. + :param progress_bar: Whether to show a progress bar. + :param use_auth_token: The API token used to download private models from Huggingface. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. 
+ Additional information can be found here + https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + """ + super().__init__() + + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) + + self.model = pipeline( + task="image-to-text", + model=model_name_or_path, + revision=model_version, + device=self.devices[0], + use_auth_token=use_auth_token, + ) + self.generate_kwargs = generate_kwargs + self.batch_size = batch_size + self.progress_bar = progress_bar + + def generate_captions( + self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + ) -> List[Document]: + """ + Generate captions for provided image files + + :param image_file_paths: Paths of the images + :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + :param batch_size: Number of images to process at a time. + :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. + """ + generate_kwargs = generate_kwargs or self.generate_kwargs + batch_size = batch_size or self.batch_size + + if len(image_file_paths) == 0: + raise AttributeError("ImageToText needs at least one filepath to produce a caption.") + + images_dataset = ListDataset(image_file_paths) + + captions: List[str] = [] + + for captions_batch in tqdm( + self.model(images_dataset, generate_kwargs=generate_kwargs, batch_size=batch_size), + disable=not self.progress_bar, + total=len(images_dataset), + desc="Generating captions", + ): + captions.append("".join([el["generated_text"] for el in captions_batch]).strip()) + + result: List[Document] = [] + for caption, image_file_path in zip(captions, image_file_paths): + document = Document(content=caption, content_type="text", meta={"image_file_path": image_file_path}) + result.append(document) + + return result From bfac94f4f32b16843d1e6899bc9479fae0439c8e Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:20:51 +0100 Subject: [PATCH 02/11] fix pylint and mypy --- haystack/nodes/image_to_text/transformers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 189be55e9b..90f1a98410 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -87,8 +87,9 @@ def __init__( self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) if len(self.devices) > 1: logger.warning( - f"Multiple devices are not supported in {self.__class__.__name__} inference, " - f"using the first device {self.devices[0]}." 
+ "Multiple devices are not supported in %s inference, using the first device %s.", + self.__class__.__name__, + self.devices[0], ) self.model = pipeline( From ae414dab400510919822ef5f73f80ab631d10d84 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:28:51 +0100 Subject: [PATCH 03/11] retry w mypy --- haystack/nodes/image_to_text/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index 3c0542d94f..f66507bb6c 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Union from abc import abstractmethod @@ -49,7 +49,7 @@ def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[D return results, "output_1" def run_batch( - self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None + self, file_paths: Optional[List[str]] = None, documents: Union[List[Document], None] = None ): # type: ignore return self.run(file_paths=file_paths, documents=documents) From 79b720e408bb3f71bdb25b5054da2f0601f7665d Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:33:40 +0100 Subject: [PATCH 04/11] mypy :-) --- haystack/nodes/image_to_text/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index f66507bb6c..dc9d1d627b 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -48,8 +48,8 @@ def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[D return results, "output_1" - def run_batch( - self, file_paths: Optional[List[str]] = None, documents: Union[List[Document], None] = None - ): # type: ignore + def run_batch( # type: ignore + self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None + ): return self.run(file_paths=file_paths, documents=documents) From 6b14f2e81e244d990645935946ea7e90cb4726a3 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:37:06 +0100 Subject: [PATCH 05/11] rem unused import --- haystack/nodes/image_to_text/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index dc9d1d627b..abd33e6769 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union +from typing import List, Optional from abc import abstractmethod From 191660432332fbbcc2d41ae748f56893ca7187c3 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 18:49:03 +0100 Subject: [PATCH 06/11] incorporate feedback and initial tests --- haystack/nodes/image_to_text/base.py | 14 ++-- haystack/nodes/image_to_text/transformers.py | 16 ++-- test/nodes/test_image_to_text.py | 87 ++++++++++++++++++++ 3 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 test/nodes/test_image_to_text.py diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index abd33e6769..841c162c7a 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -15,13 +15,13 @@ class 
BaseImageToText(BaseComponent): @abstractmethod def generate_captions( - self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + self, image_file_paths: List[str], generation_kwargs: Optional[dict] = None, batch_size: Optional[int] = None ) -> List[Document]: """ Abstract method for generating captions. :param image_file_paths: Paths of the images - :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. @@ -32,16 +32,14 @@ def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[D if file_paths is None and documents is None: raise ValueError("You must either specify documents or image file_paths to process.") - if file_paths is not None and documents is not None: - raise ValueError( - "You specified both documents and image_file_paths. You need to specify only one of the two parameters." - ) + + image_file_paths = [] if file_paths is not None: - image_file_paths = file_paths + image_file_paths.extend(file_paths) if documents is not None: if any((doc.content_type != "image" for doc in documents)): raise ValueError("The ImageToText node only supports image documents.") - image_file_paths = [doc.content for doc in documents] + image_file_paths.extend([doc.content for doc in documents]) results: dict = {} results["documents"] = self.generate_captions(image_file_paths=image_file_paths) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 90f1a98410..9e7c18d0f3 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -52,7 +52,7 @@ def __init__( self, model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning", model_version: Optional[str] = None, - generate_kwargs: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, use_gpu: bool = True, batch_size: int = 16, progress_bar: bool = True, @@ -67,7 +67,7 @@ def __init__( :param model_name_or_path: Directory of a saved model or the name of a public model. See https://huggingface.co/models?pipeline_tag=image-to-text for full list of available models. :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param use_gpu: Whether to use GPU (if available). :param batch_size: Number of documents to process at a time. 
@@ -99,23 +99,23 @@ def __init__( device=self.devices[0], use_auth_token=use_auth_token, ) - self.generate_kwargs = generate_kwargs + self.generation_kwargs = generation_kwargs self.batch_size = batch_size self.progress_bar = progress_bar def generate_captions( - self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + self, image_file_paths: List[str], generation_kwargs: Optional[dict] = None, batch_size: Optional[int] = None ) -> List[Document]: """ Generate captions for provided image files :param image_file_paths: Paths of the images - :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. """ - generate_kwargs = generate_kwargs or self.generate_kwargs + generation_kwargs = generation_kwargs or self.generation_kwargs batch_size = batch_size or self.batch_size if len(image_file_paths) == 0: @@ -126,7 +126,7 @@ def generate_captions( captions: List[str] = [] for captions_batch in tqdm( - self.model(images_dataset, generate_kwargs=generate_kwargs, batch_size=batch_size), + self.model(images_dataset, generate_kwargs=generation_kwargs, batch_size=batch_size), disable=not self.progress_bar, total=len(images_dataset), desc="Generating captions", @@ -135,7 +135,7 @@ def generate_captions( result: List[Document] = [] for caption, image_file_path in zip(captions, image_file_paths): - document = Document(content=caption, content_type="text", meta={"image_file_path": image_file_path}) + document = Document(content=caption, content_type="text", meta={"image_path": image_file_path}) result.append(document) return result diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py new file mode 100644 index 0000000000..5e0d6d47cc --- /dev/null +++ b/test/nodes/test_image_to_text.py @@ -0,0 +1,87 @@ +import pytest +import os +from pathlib import Path + +from haystack import Document +from haystack.nodes.image_to_text.transformers import TransformersImageToText +from haystack.nodes.image_to_text.base import BaseImageToText + +from ..conftest import SAMPLES_PATH + +IMAGE_FILE_PATHS = sorted([str(image_path) for image_path in Path(SAMPLES_PATH / "images").glob("*.jpg")]) + +IMAGE_DOCS = [Document(content=image_path, content_type="image") for image_path in IMAGE_FILE_PATHS] + +EXPECTED_CAPTIONS = [ + "a red apple is sitting on a pile of hay", + "a white car parked in a parking lot", + "a cat laying in the grass", + "a blurry photo of a blurry shot of a black object", + "a city with a large building and a clock tower", +] + + +@pytest.fixture +def image_to_text(): + return TransformersImageToText( + model_name_or_path="nlpconnect/vit-gpt2-image-captioning", + devices=["cpu"], + generation_kwargs={"max_new_tokens": 50}, + ) + + +@pytest.mark.integration +def test_image_to_text(image_to_text): + assert isinstance(image_to_text, BaseImageToText) + + results = image_to_text.run(file_paths=IMAGE_FILE_PATHS) + generated_captions = [doc.content for doc in results[0]["documents"]] + + assert generated_captions == EXPECTED_CAPTIONS + + +# improve!!!! + +# no image! 
+ +# docs = [ +# Document( +# content="""That's good. I like it.""" * 700, # extra long text to check truncation +# meta={"name": "0"}, +# id="1", +# ), +# Document(content="""That's bad. I don't like it.""", meta={"name": "1"}, id="2"), +# ] +# results = document_classifier.predict(documents=docs) +# expected_labels = ["joy", "sadness"] +# for i, doc in enumerate(results): +# assert doc.to_dict()["meta"]["classification"]["label"] == expected_labels[i] + + +# # test node +# ti2t = TransformersImageToText(model_name_or_path="nlpconnect/vit-gpt2-image-captioning", batch_size=1, generation_kwargs={'max_new_tokens':50}) +# # print(ti2t.generate_captions(image_file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg'))) + +# # # test in a pipeline, passing file_paths +# # from haystack.pipelines import Pipeline + +# # p = Pipeline() +# # p.add_node(component=ti2t, name="ti2t", inputs=["File"]) + + +# # print(p.run(file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg')[:2])) + +# # test in a pipeline, passing documents +# from haystack.pipelines import Pipeline +# # from haystack.document_stores import InMemoryDocumentStore +# from haystack import Document + +# # ds = InMemoryDocumentStore() +# file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg') + +# docs= [] +# for path in file_paths: +# doc = Document(content=path, content_type="image") +# docs.append(doc) + +# print(ti2t.run(documents=docs)) From dddf026b8e990a675c6b3070dd2570eec3a80dc8 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:10:51 +0100 Subject: [PATCH 07/11] better tests --- test/nodes/test_image_to_text.py | 73 +++++++++++--------------------- 1 file changed, 24 insertions(+), 49 deletions(-) diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py index 5e0d6d47cc..9ed33b4310 100644 --- a/test/nodes/test_image_to_text.py +++ b/test/nodes/test_image_to_text.py @@ -1,6 +1,7 @@ import pytest + import os -from pathlib import Path +from PIL import UnidentifiedImageError from haystack import Document from haystack.nodes.image_to_text.transformers import TransformersImageToText @@ -8,9 +9,11 @@ from ..conftest import SAMPLES_PATH -IMAGE_FILE_PATHS = sorted([str(image_path) for image_path in Path(SAMPLES_PATH / "images").glob("*.jpg")]) +IMAGE_FILE_NAMES = ["apple.jpg", "car.jpg", "cat.jpg", "galaxy.jpg", "paris.jpg"] +IMAGE_FILE_PATHS = [os.path.join(SAMPLES_PATH, "images", file_name) for file_name in IMAGE_FILE_NAMES] IMAGE_DOCS = [Document(content=image_path, content_type="image") for image_path in IMAGE_FILE_PATHS] +INVALID_IMAGE_FILE_PATH = str(SAMPLES_PATH / "markdown" / "sample.md") EXPECTED_CAPTIONS = [ "a red apple is sitting on a pile of hay", @@ -34,54 +37,26 @@ def image_to_text(): def test_image_to_text(image_to_text): assert isinstance(image_to_text, BaseImageToText) - results = image_to_text.run(file_paths=IMAGE_FILE_PATHS) - generated_captions = [doc.content for doc in results[0]["documents"]] - - assert generated_captions == EXPECTED_CAPTIONS - - -# improve!!!! - -# no image! - -# docs = [ -# Document( -# content="""That's good. I like it.""" * 700, # extra long text to check truncation -# meta={"name": "0"}, -# id="1", -# ), -# Document(content="""That's bad. 
I don't like it.""", meta={"name": "1"}, id="2"), -# ] -# results = document_classifier.predict(documents=docs) -# expected_labels = ["joy", "sadness"] -# for i, doc in enumerate(results): -# assert doc.to_dict()["meta"]["classification"]["label"] == expected_labels[i] - + results_0 = image_to_text.run(file_paths=IMAGE_FILE_PATHS) + image_paths_0 = [doc.meta["image_path"] for doc in results_0[0]["documents"]] + assert image_paths_0 == IMAGE_FILE_PATHS + generated_captions_0 = [doc.content for doc in results_0[0]["documents"]] + assert generated_captions_0 == EXPECTED_CAPTIONS -# # test node -# ti2t = TransformersImageToText(model_name_or_path="nlpconnect/vit-gpt2-image-captioning", batch_size=1, generation_kwargs={'max_new_tokens':50}) -# # print(ti2t.generate_captions(image_file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg'))) + results_1 = image_to_text.run(documents=IMAGE_DOCS) + image_paths_1 = [doc.meta["image_path"] for doc in results_1[0]["documents"]] + assert image_paths_1 == IMAGE_FILE_PATHS + generated_captions_1 = [doc.content for doc in results_1[0]["documents"]] + assert generated_captions_1 == EXPECTED_CAPTIONS -# # # test in a pipeline, passing file_paths -# # from haystack.pipelines import Pipeline + results_2 = image_to_text.run(file_paths=IMAGE_FILE_PATHS[:3], documents=IMAGE_DOCS[3:]) + image_paths_2 = [doc.meta["image_path"] for doc in results_2[0]["documents"]] + assert image_paths_2 == IMAGE_FILE_PATHS + generated_captions_2 = [doc.content for doc in results_2[0]["documents"]] + assert generated_captions_2 == EXPECTED_CAPTIONS -# # p = Pipeline() -# # p.add_node(component=ti2t, name="ti2t", inputs=["File"]) - -# # print(p.run(file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg')[:2])) - -# # test in a pipeline, passing documents -# from haystack.pipelines import Pipeline -# # from haystack.document_stores import InMemoryDocumentStore -# from haystack import Document - -# # ds = InMemoryDocumentStore() -# file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg') - -# docs= [] -# for path in file_paths: -# doc = Document(content=path, content_type="image") -# docs.append(doc) - -# print(ti2t.run(documents=docs)) +@pytest.mark.integration +def test_image_to_text_invalid_image(image_to_text): + with pytest.raises(UnidentifiedImageError, match="cannot identify image file"): + image_to_text.run(file_paths=[INVALID_IMAGE_FILE_PATH]) From 3b7fedf4464f193475d9884401343215c801d9ab Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:11:49 +0100 Subject: [PATCH 08/11] fix import order --- test/nodes/test_image_to_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py index 9ed33b4310..fa752caf13 100644 --- a/test/nodes/test_image_to_text.py +++ b/test/nodes/test_image_to_text.py @@ -1,6 +1,6 @@ +import os import pytest -import os from PIL import UnidentifiedImageError from haystack import Document From 29504ee5a241e90d26a9421a25b0f88d7ea8f437 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:16:10 +0100 Subject: [PATCH 09/11] fix docstring --- haystack/nodes/image_to_text/base.py | 2 +- haystack/nodes/image_to_text/transformers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index 841c162c7a..e8e41d8af8 
100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -22,7 +22,7 @@ def generate_captions( :param image_file_paths: Paths of the images :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. - See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. """ diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 9e7c18d0f3..233e640cd2 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -111,7 +111,7 @@ def generate_captions( :param image_file_paths: Paths of the images :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. - See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. """ From bac8393b7689104b84d85d98f0a2a78dcfa3d98f Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:22:07 +0100 Subject: [PATCH 10/11] other fix docstring --- haystack/nodes/image_to_text/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 233e640cd2..8a5cc471ef 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -38,7 +38,7 @@ class TransformersImageToText(BaseImageToText): "content": "a red apple is sitting on a pile of hay", ... "meta": { - "image_file_path": "/path/to/images/apple.jpg", + "image_path": "/path/to/images/apple.jpg", ... }, ... 
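
With the docstring example now matching the `image_path` meta key, the node's behavior up to this point is settled: the `generation_kwargs` passed to the constructor act as a default, and `generate_captions()` can override them per call (`generation_kwargs = generation_kwargs or self.generation_kwargs`). The snippet below is a minimal sketch of that interplay, not part of the patches; the image path is a placeholder borrowed from the test fixtures, and the printed caption is simply the value the tests expect.

```python
from haystack.nodes.image_to_text import TransformersImageToText

# Constructor-level generation_kwargs act as the default for every call.
image_to_text = TransformersImageToText(
    model_name_or_path="nlpconnect/vit-gpt2-image-captioning",
    generation_kwargs={"max_new_tokens": 50},
)

# Uses the default generation_kwargs set in the constructor.
docs = image_to_text.generate_captions(image_file_paths=["test/samples/images/apple.jpg"])
print(docs[0].content)             # e.g. "a red apple is sitting on a pile of hay"
print(docs[0].meta["image_path"])  # "test/samples/images/apple.jpg"

# Per-call generation_kwargs take precedence over the constructor default.
short_docs = image_to_text.generate_captions(
    image_file_paths=["test/samples/images/apple.jpg"],
    generation_kwargs={"max_new_tokens": 10},
)
```
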
From af7fbc25a442ab3b1389fad18963b3d2c5a9572e Mon Sep 17 00:00:00 2001
From: anakin87 <44616784+anakin87@users.noreply.github.com>
Date: Wed, 18 Jan 2023 20:14:19 +0100
Subject: [PATCH 11/11] more and better tests

---
 haystack/errors.py                           |  7 ++
 haystack/nodes/image_to_text/transformers.py | 35 +++++++---
 test/nodes/test_image_to_text.py             | 69 ++++++++++++++------
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/haystack/errors.py b/haystack/errors.py
index f4ca3acdba..59d134888a 100644
--- a/haystack/errors.py
+++ b/haystack/errors.py
@@ -147,3 +147,10 @@ def __init__(
     ):
         super().__init__(message=message, send_message_in_event=send_message_in_event)
         self.status_code = status_code
+
+
+class ImageToTextError(NodeError):
+    """Exception for issues that occur in the ImageToText node"""
+
+    def __init__(self, message: Optional[str] = None):
+        super().__init__(message=message)
diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 8a5cc471ef..9786a58812 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -10,10 +10,16 @@
 from haystack.nodes.image_to_text.base import BaseImageToText
 from haystack.modeling.utils import initialize_device_settings
 from haystack.utils.torch_utils import ListDataset
+from haystack.errors import ImageToTextError
 
 logger = logging.getLogger(__name__)
 
 
+# supported model classes should be extended when the HF image-to-text pipeline will support more classes
+# see https://github.com/huggingface/transformers/issues/21110
+SUPPORTED_MODELS_CLASSES = ["VisionEncoderDecoderModel"]
+
+
 class TransformersImageToText(BaseImageToText):
     """
     Transformer based model to generate captions for images using the HuggingFace's transformers framework
@@ -99,6 +105,15 @@ def __init__(
             device=self.devices[0],
             use_auth_token=use_auth_token,
         )
+
+        model_class_name = self.model.model.__class__.__name__
+        if model_class_name not in SUPPORTED_MODELS_CLASSES:
+            raise ValueError(
+                f"The model of class '{model_class_name}' is not supported for ImageToText. "
+                f"The supported classes are: {SUPPORTED_MODELS_CLASSES}. "
+                f"You can find the available models here: https://huggingface.co/models?pipeline_tag=image-to-text."
+ ) + self.generation_kwargs = generation_kwargs self.batch_size = batch_size self.progress_bar = progress_bar @@ -119,19 +134,23 @@ def generate_captions( batch_size = batch_size or self.batch_size if len(image_file_paths) == 0: - raise AttributeError("ImageToText needs at least one filepath to produce a caption.") + raise ImageToTextError("ImageToText needs at least one filepath to produce a caption.") images_dataset = ListDataset(image_file_paths) captions: List[str] = [] - for captions_batch in tqdm( - self.model(images_dataset, generate_kwargs=generation_kwargs, batch_size=batch_size), - disable=not self.progress_bar, - total=len(images_dataset), - desc="Generating captions", - ): - captions.append("".join([el["generated_text"] for el in captions_batch]).strip()) + try: + for captions_batch in tqdm( + self.model(images_dataset, generate_kwargs=generation_kwargs, batch_size=batch_size), + disable=not self.progress_bar, + total=len(images_dataset), + desc="Generating captions", + ): + captions.append("".join([el["generated_text"] for el in captions_batch]).strip()) + + except Exception as exc: + raise ImageToTextError(str(exc)) from exc result: List[Document] = [] for caption, image_file_path in zip(captions, image_file_paths): diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py index fa752caf13..2494b08c94 100644 --- a/test/nodes/test_image_to_text.py +++ b/test/nodes/test_image_to_text.py @@ -1,11 +1,11 @@ import os import pytest -from PIL import UnidentifiedImageError - from haystack import Document from haystack.nodes.image_to_text.transformers import TransformersImageToText from haystack.nodes.image_to_text.base import BaseImageToText +from haystack.errors import ImageToTextError + from ..conftest import SAMPLES_PATH @@ -13,7 +13,6 @@ IMAGE_FILE_NAMES = ["apple.jpg", "car.jpg", "cat.jpg", "galaxy.jpg", "paris.jpg"] IMAGE_FILE_PATHS = [os.path.join(SAMPLES_PATH, "images", file_name) for file_name in IMAGE_FILE_NAMES] IMAGE_DOCS = [Document(content=image_path, content_type="image") for image_path in IMAGE_FILE_PATHS] -INVALID_IMAGE_FILE_PATH = str(SAMPLES_PATH / "markdown" / "sample.md") EXPECTED_CAPTIONS = [ "a red apple is sitting on a pile of hay", @@ -34,29 +33,57 @@ def image_to_text(): @pytest.mark.integration -def test_image_to_text(image_to_text): +def test_image_to_text_from_files(image_to_text): assert isinstance(image_to_text, BaseImageToText) - results_0 = image_to_text.run(file_paths=IMAGE_FILE_PATHS) - image_paths_0 = [doc.meta["image_path"] for doc in results_0[0]["documents"]] - assert image_paths_0 == IMAGE_FILE_PATHS - generated_captions_0 = [doc.content for doc in results_0[0]["documents"]] - assert generated_captions_0 == EXPECTED_CAPTIONS + results = image_to_text.run(file_paths=IMAGE_FILE_PATHS) + image_paths = [doc.meta["image_path"] for doc in results[0]["documents"]] + assert image_paths == IMAGE_FILE_PATHS + generated_captions = [doc.content for doc in results[0]["documents"]] + assert generated_captions == EXPECTED_CAPTIONS + + +@pytest.mark.integration +def test_image_to_text_from_documents(image_to_text): + results = image_to_text.run(documents=IMAGE_DOCS) + image_paths = [doc.meta["image_path"] for doc in results[0]["documents"]] + assert image_paths == IMAGE_FILE_PATHS + generated_captions = [doc.content for doc in results[0]["documents"]] + assert generated_captions == EXPECTED_CAPTIONS - results_1 = image_to_text.run(documents=IMAGE_DOCS) - image_paths_1 = [doc.meta["image_path"] for doc in results_1[0]["documents"]] - 
assert image_paths_1 == IMAGE_FILE_PATHS - generated_captions_1 = [doc.content for doc in results_1[0]["documents"]] - assert generated_captions_1 == EXPECTED_CAPTIONS - results_2 = image_to_text.run(file_paths=IMAGE_FILE_PATHS[:3], documents=IMAGE_DOCS[3:]) - image_paths_2 = [doc.meta["image_path"] for doc in results_2[0]["documents"]] - assert image_paths_2 == IMAGE_FILE_PATHS - generated_captions_2 = [doc.content for doc in results_2[0]["documents"]] - assert generated_captions_2 == EXPECTED_CAPTIONS +@pytest.mark.integration +def test_image_to_text_from_files_and_documents(image_to_text): + results = image_to_text.run(file_paths=IMAGE_FILE_PATHS[:3], documents=IMAGE_DOCS[3:]) + image_paths = [doc.meta["image_path"] for doc in results[0]["documents"]] + assert image_paths == IMAGE_FILE_PATHS + generated_captions = [doc.content for doc in results[0]["documents"]] + assert generated_captions == EXPECTED_CAPTIONS @pytest.mark.integration def test_image_to_text_invalid_image(image_to_text): - with pytest.raises(UnidentifiedImageError, match="cannot identify image file"): - image_to_text.run(file_paths=[INVALID_IMAGE_FILE_PATH]) + markdown_path = str(SAMPLES_PATH / "markdown" / "sample.md") + with pytest.raises(ImageToTextError, match="cannot identify image file"): + image_to_text.run(file_paths=[markdown_path]) + + +@pytest.mark.integration +def test_image_to_text_incorrect_path(image_to_text): + with pytest.raises(ImageToTextError, match="Incorrect path"): + image_to_text.run(file_paths=["wrong_path.jpg"]) + + +@pytest.mark.integration +def test_image_to_text_not_image_document(image_to_text): + textual_document = Document(content="this document is textual", content_type="text") + with pytest.raises(ValueError, match="The ImageToText node only supports image documents."): + image_to_text.run(documents=[textual_document]) + + +@pytest.mark.integration +def test_image_to_text_unsupported_model(): + with pytest.raises( + ValueError, match="The model of class 'BertForQuestionAnswering' is not supported for ImageToText" + ): + _ = TransformersImageToText(model_name_or_path="deepset/minilm-uncased-squad2")
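
Taken together, the series leaves the node usable standalone or inside a pipeline. The sketch below walks through the resulting API end to end; it is only an illustration of the code added above, the image paths are placeholders based on the test fixtures, and the expected captions come from the tests.

```python
from haystack import Document
from haystack.pipelines import Pipeline
from haystack.nodes.image_to_text import TransformersImageToText

image_to_text = TransformersImageToText(
    model_name_or_path="nlpconnect/vit-gpt2-image-captioning",
    generation_kwargs={"max_new_tokens": 50},
)

# Standalone: run() accepts raw image file paths, image Documents, or both at once.
image_docs = [Document(content="test/samples/images/cat.jpg", content_type="image")]
results, _ = image_to_text.run(file_paths=["test/samples/images/apple.jpg"], documents=image_docs)
for doc in results["documents"]:
    print(doc.meta["image_path"], "->", doc.content)

# In a pipeline, the node is fed from the File input, as sketched in the scratch code added in patch 06.
pipeline = Pipeline()
pipeline.add_node(component=image_to_text, name="image_to_text", inputs=["File"])
output = pipeline.run(file_paths=["test/samples/images/paris.jpg"])
print(output["documents"][0].content)  # e.g. "a city with a large building and a clock tower"

# Invalid or missing image files raise ImageToTextError (patch 11);
# non-image Documents and unsupported model classes raise ValueError.
```
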