From 41a39da78c58496afc1bb9c2a950c1029760db50 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 10:46:39 +0100 Subject: [PATCH 01/11] first draft --- haystack/nodes/image_to_text/__init__.py | 2 + haystack/nodes/image_to_text/base.py | 55 ++++++++ haystack/nodes/image_to_text/transformers.py | 140 +++++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 haystack/nodes/image_to_text/__init__.py create mode 100644 haystack/nodes/image_to_text/base.py create mode 100644 haystack/nodes/image_to_text/transformers.py diff --git a/haystack/nodes/image_to_text/__init__.py b/haystack/nodes/image_to_text/__init__.py new file mode 100644 index 0000000000..44a8bb419c --- /dev/null +++ b/haystack/nodes/image_to_text/__init__.py @@ -0,0 +1,2 @@ +from haystack.nodes.image_to_text.base import BaseImageToText +from haystack.nodes.image_to_text.transformers import TransformersImageToText diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py new file mode 100644 index 0000000000..3c0542d94f --- /dev/null +++ b/haystack/nodes/image_to_text/base.py @@ -0,0 +1,55 @@ +from typing import List, Optional + +from abc import abstractmethod + +from haystack.schema import Document +from haystack.nodes.base import BaseComponent + + +class BaseImageToText(BaseComponent): + """ + Abstract class for ImageToText + """ + + outgoing_edges = 1 + + @abstractmethod + def generate_captions( + self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + ) -> List[Document]: + """ + Abstract method for generating captions. + + :param image_file_paths: Paths of the images + :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + :param batch_size: Number of images to process at a time. + :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. + """ + pass + + def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None): # type: ignore + + if file_paths is None and documents is None: + raise ValueError("You must either specify documents or image file_paths to process.") + if file_paths is not None and documents is not None: + raise ValueError( + "You specified both documents and image_file_paths. You need to specify only one of the two parameters." 
+ ) + if file_paths is not None: + image_file_paths = file_paths + if documents is not None: + if any((doc.content_type != "image" for doc in documents)): + raise ValueError("The ImageToText node only supports image documents.") + image_file_paths = [doc.content for doc in documents] + + results: dict = {} + results["documents"] = self.generate_captions(image_file_paths=image_file_paths) + + return results, "output_1" + + def run_batch( + self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None + ): # type: ignore + + return self.run(file_paths=file_paths, documents=documents) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py new file mode 100644 index 0000000000..189be55e9b --- /dev/null +++ b/haystack/nodes/image_to_text/transformers.py @@ -0,0 +1,140 @@ +from typing import List, Optional, Union + +import logging + +import torch +from tqdm.auto import tqdm +from transformers import pipeline + +from haystack.schema import Document +from haystack.nodes.image_to_text.base import BaseImageToText +from haystack.modeling.utils import initialize_device_settings +from haystack.utils.torch_utils import ListDataset + +logger = logging.getLogger(__name__) + + +class TransformersImageToText(BaseImageToText): + """ + Transformer based model to generate captions for images using the HuggingFace's transformers framework + + See the up-to-date list of available models on + `huggingface.co/models `__ + + **Example** + + ```python + image_file_paths = ["/path/to/images/apple.jpg", + "/path/to/images/cat.jpg", ] + + # Generate captions + documents = image_to_text.generate_captions(image_file_paths=image_file_paths) + + # Show results (List of Documents, containing caption and image file_path) + print(documents) + + [ + { + "content": "a red apple is sitting on a pile of hay", + ... + "meta": { + "image_file_path": "/path/to/images/apple.jpg", + ... + }, + ... + }, + ... + ] + ``` + """ + + def __init__( + self, + model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning", + model_version: Optional[str] = None, + generate_kwargs: Optional[dict] = None, + use_gpu: bool = True, + batch_size: int = 16, + progress_bar: bool = True, + use_auth_token: Optional[Union[str, bool]] = None, + devices: Optional[List[Union[str, torch.device]]] = None, + ): + """ + Load an Image To Text model from Transformers. + See the up-to-date list of available models at + https://huggingface.co/models?pipeline_tag=image-to-text + + :param model_name_or_path: Directory of a saved model or the name of a public model. + See https://huggingface.co/models?pipeline_tag=image-to-text for full list of available models. + :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. + :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + :param use_gpu: Whether to use GPU (if available). + :param batch_size: Number of documents to process at a time. + :param progress_bar: Whether to show a progress bar. + :param use_auth_token: The API token used to download private models from Huggingface. + If this parameter is set to `True`, then the token generated when running + `transformers-cli login` (stored in ~/.huggingface) will be used. 
+ Additional information can be found here + https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + :param devices: List of torch devices (e.g. cuda, cpu, mps) to limit inference to specific devices. + A list containing torch device objects and/or strings is supported (For example + [torch.device('cuda:0'), "mps", "cuda:1"]). When specifying `use_gpu=False` the devices + parameter is not used and a single cpu device is used for inference. + """ + super().__init__() + + self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) + if len(self.devices) > 1: + logger.warning( + f"Multiple devices are not supported in {self.__class__.__name__} inference, " + f"using the first device {self.devices[0]}." + ) + + self.model = pipeline( + task="image-to-text", + model=model_name_or_path, + revision=model_version, + device=self.devices[0], + use_auth_token=use_auth_token, + ) + self.generate_kwargs = generate_kwargs + self.batch_size = batch_size + self.progress_bar = progress_bar + + def generate_captions( + self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + ) -> List[Document]: + """ + Generate captions for provided image files + + :param image_file_paths: Paths of the images + :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + :param batch_size: Number of images to process at a time. + :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. + """ + generate_kwargs = generate_kwargs or self.generate_kwargs + batch_size = batch_size or self.batch_size + + if len(image_file_paths) == 0: + raise AttributeError("ImageToText needs at least one filepath to produce a caption.") + + images_dataset = ListDataset(image_file_paths) + + captions: List[str] = [] + + for captions_batch in tqdm( + self.model(images_dataset, generate_kwargs=generate_kwargs, batch_size=batch_size), + disable=not self.progress_bar, + total=len(images_dataset), + desc="Generating captions", + ): + captions.append("".join([el["generated_text"] for el in captions_batch]).strip()) + + result: List[Document] = [] + for caption, image_file_path in zip(captions, image_file_paths): + document = Document(content=caption, content_type="text", meta={"image_file_path": image_file_path}) + result.append(document) + + return result From bfac94f4f32b16843d1e6899bc9479fae0439c8e Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:20:51 +0100 Subject: [PATCH 02/11] fix pylint and mypy --- haystack/nodes/image_to_text/transformers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 189be55e9b..90f1a98410 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -87,8 +87,9 @@ def __init__( self.devices, _ = initialize_device_settings(devices=devices, use_cuda=use_gpu, multi_gpu=False) if len(self.devices) > 1: logger.warning( - f"Multiple devices are not supported in {self.__class__.__name__} inference, " - f"using the first device {self.devices[0]}." 
+ "Multiple devices are not supported in %s inference, using the first device %s.", + self.__class__.__name__, + self.devices[0], ) self.model = pipeline( From ae414dab400510919822ef5f73f80ab631d10d84 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:28:51 +0100 Subject: [PATCH 03/11] retry w mypy --- haystack/nodes/image_to_text/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index 3c0542d94f..f66507bb6c 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List, Optional, Union from abc import abstractmethod @@ -49,7 +49,7 @@ def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[D return results, "output_1" def run_batch( - self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None + self, file_paths: Optional[List[str]] = None, documents: Union[List[Document], None] = None ): # type: ignore return self.run(file_paths=file_paths, documents=documents) From 79b720e408bb3f71bdb25b5054da2f0601f7665d Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:33:40 +0100 Subject: [PATCH 04/11] mypy :-) --- haystack/nodes/image_to_text/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index f66507bb6c..dc9d1d627b 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -48,8 +48,8 @@ def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[D return results, "output_1" - def run_batch( - self, file_paths: Optional[List[str]] = None, documents: Union[List[Document], None] = None - ): # type: ignore + def run_batch( # type: ignore + self, file_paths: Optional[List[str]] = None, documents: Optional[List[Document]] = None + ): return self.run(file_paths=file_paths, documents=documents) From 6b14f2e81e244d990645935946ea7e90cb4726a3 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Sat, 14 Jan 2023 11:37:06 +0100 Subject: [PATCH 05/11] rem unused import --- haystack/nodes/image_to_text/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index dc9d1d627b..abd33e6769 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union +from typing import List, Optional from abc import abstractmethod From 191660432332fbbcc2d41ae748f56893ca7187c3 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 18:49:03 +0100 Subject: [PATCH 06/11] incorporate feedback and initial tests --- haystack/nodes/image_to_text/base.py | 14 ++-- haystack/nodes/image_to_text/transformers.py | 16 ++-- test/nodes/test_image_to_text.py | 87 ++++++++++++++++++++ 3 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 test/nodes/test_image_to_text.py diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index abd33e6769..841c162c7a 100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -15,13 +15,13 @@ class 
BaseImageToText(BaseComponent): @abstractmethod def generate_captions( - self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + self, image_file_paths: List[str], generation_kwargs: Optional[dict] = None, batch_size: Optional[int] = None ) -> List[Document]: """ Abstract method for generating captions. :param image_file_paths: Paths of the images - :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. @@ -32,16 +32,14 @@ def run(self, file_paths: Optional[List[str]] = None, documents: Optional[List[D if file_paths is None and documents is None: raise ValueError("You must either specify documents or image file_paths to process.") - if file_paths is not None and documents is not None: - raise ValueError( - "You specified both documents and image_file_paths. You need to specify only one of the two parameters." - ) + + image_file_paths = [] if file_paths is not None: - image_file_paths = file_paths + image_file_paths.extend(file_paths) if documents is not None: if any((doc.content_type != "image" for doc in documents)): raise ValueError("The ImageToText node only supports image documents.") - image_file_paths = [doc.content for doc in documents] + image_file_paths.extend([doc.content for doc in documents]) results: dict = {} results["documents"] = self.generate_captions(image_file_paths=image_file_paths) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 90f1a98410..9e7c18d0f3 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -52,7 +52,7 @@ def __init__( self, model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning", model_version: Optional[str] = None, - generate_kwargs: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, use_gpu: bool = True, batch_size: int = 16, progress_bar: bool = True, @@ -67,7 +67,7 @@ def __init__( :param model_name_or_path: Directory of a saved model or the name of a public model. See https://huggingface.co/models?pipeline_tag=image-to-text for full list of available models. :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param use_gpu: Whether to use GPU (if available). :param batch_size: Number of documents to process at a time. 
@@ -99,23 +99,23 @@ def __init__( device=self.devices[0], use_auth_token=use_auth_token, ) - self.generate_kwargs = generate_kwargs + self.generation_kwargs = generation_kwargs self.batch_size = batch_size self.progress_bar = progress_bar def generate_captions( - self, image_file_paths: List[str], generate_kwargs: Optional[dict] = None, batch_size: Optional[int] = None + self, image_file_paths: List[str], generation_kwargs: Optional[dict] = None, batch_size: Optional[int] = None ) -> List[Document]: """ Generate captions for provided image files :param image_file_paths: Paths of the images - :param generate_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. + :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. """ - generate_kwargs = generate_kwargs or self.generate_kwargs + generation_kwargs = generation_kwargs or self.generation_kwargs batch_size = batch_size or self.batch_size if len(image_file_paths) == 0: @@ -126,7 +126,7 @@ def generate_captions( captions: List[str] = [] for captions_batch in tqdm( - self.model(images_dataset, generate_kwargs=generate_kwargs, batch_size=batch_size), + self.model(images_dataset, generate_kwargs=generation_kwargs, batch_size=batch_size), disable=not self.progress_bar, total=len(images_dataset), desc="Generating captions", @@ -135,7 +135,7 @@ def generate_captions( result: List[Document] = [] for caption, image_file_path in zip(captions, image_file_paths): - document = Document(content=caption, content_type="text", meta={"image_file_path": image_file_path}) + document = Document(content=caption, content_type="text", meta={"image_path": image_file_path}) result.append(document) return result diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py new file mode 100644 index 0000000000..5e0d6d47cc --- /dev/null +++ b/test/nodes/test_image_to_text.py @@ -0,0 +1,87 @@ +import pytest +import os +from pathlib import Path + +from haystack import Document +from haystack.nodes.image_to_text.transformers import TransformersImageToText +from haystack.nodes.image_to_text.base import BaseImageToText + +from ..conftest import SAMPLES_PATH + +IMAGE_FILE_PATHS = sorted([str(image_path) for image_path in Path(SAMPLES_PATH / "images").glob("*.jpg")]) + +IMAGE_DOCS = [Document(content=image_path, content_type="image") for image_path in IMAGE_FILE_PATHS] + +EXPECTED_CAPTIONS = [ + "a red apple is sitting on a pile of hay", + "a white car parked in a parking lot", + "a cat laying in the grass", + "a blurry photo of a blurry shot of a black object", + "a city with a large building and a clock tower", +] + + +@pytest.fixture +def image_to_text(): + return TransformersImageToText( + model_name_or_path="nlpconnect/vit-gpt2-image-captioning", + devices=["cpu"], + generation_kwargs={"max_new_tokens": 50}, + ) + + +@pytest.mark.integration +def test_image_to_text(image_to_text): + assert isinstance(image_to_text, BaseImageToText) + + results = image_to_text.run(file_paths=IMAGE_FILE_PATHS) + generated_captions = [doc.content for doc in results[0]["documents"]] + + assert generated_captions == EXPECTED_CAPTIONS + + +# improve!!!! + +# no image! 
+ +# docs = [ +# Document( +# content="""That's good. I like it.""" * 700, # extra long text to check truncation +# meta={"name": "0"}, +# id="1", +# ), +# Document(content="""That's bad. I don't like it.""", meta={"name": "1"}, id="2"), +# ] +# results = document_classifier.predict(documents=docs) +# expected_labels = ["joy", "sadness"] +# for i, doc in enumerate(results): +# assert doc.to_dict()["meta"]["classification"]["label"] == expected_labels[i] + + +# # test node +# ti2t = TransformersImageToText(model_name_or_path="nlpconnect/vit-gpt2-image-captioning", batch_size=1, generation_kwargs={'max_new_tokens':50}) +# # print(ti2t.generate_captions(image_file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg'))) + +# # # test in a pipeline, passing file_paths +# # from haystack.pipelines import Pipeline + +# # p = Pipeline() +# # p.add_node(component=ti2t, name="ti2t", inputs=["File"]) + + +# # print(p.run(file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg')[:2])) + +# # test in a pipeline, passing documents +# from haystack.pipelines import Pipeline +# # from haystack.document_stores import InMemoryDocumentStore +# from haystack import Document + +# # ds = InMemoryDocumentStore() +# file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg') + +# docs= [] +# for path in file_paths: +# doc = Document(content=path, content_type="image") +# docs.append(doc) + +# print(ti2t.run(documents=docs)) From dddf026b8e990a675c6b3070dd2570eec3a80dc8 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:10:51 +0100 Subject: [PATCH 07/11] better tests --- test/nodes/test_image_to_text.py | 73 +++++++++++--------------------- 1 file changed, 24 insertions(+), 49 deletions(-) diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py index 5e0d6d47cc..9ed33b4310 100644 --- a/test/nodes/test_image_to_text.py +++ b/test/nodes/test_image_to_text.py @@ -1,6 +1,7 @@ import pytest + import os -from pathlib import Path +from PIL import UnidentifiedImageError from haystack import Document from haystack.nodes.image_to_text.transformers import TransformersImageToText @@ -8,9 +9,11 @@ from ..conftest import SAMPLES_PATH -IMAGE_FILE_PATHS = sorted([str(image_path) for image_path in Path(SAMPLES_PATH / "images").glob("*.jpg")]) +IMAGE_FILE_NAMES = ["apple.jpg", "car.jpg", "cat.jpg", "galaxy.jpg", "paris.jpg"] +IMAGE_FILE_PATHS = [os.path.join(SAMPLES_PATH, "images", file_name) for file_name in IMAGE_FILE_NAMES] IMAGE_DOCS = [Document(content=image_path, content_type="image") for image_path in IMAGE_FILE_PATHS] +INVALID_IMAGE_FILE_PATH = str(SAMPLES_PATH / "markdown" / "sample.md") EXPECTED_CAPTIONS = [ "a red apple is sitting on a pile of hay", @@ -34,54 +37,26 @@ def image_to_text(): def test_image_to_text(image_to_text): assert isinstance(image_to_text, BaseImageToText) - results = image_to_text.run(file_paths=IMAGE_FILE_PATHS) - generated_captions = [doc.content for doc in results[0]["documents"]] - - assert generated_captions == EXPECTED_CAPTIONS - - -# improve!!!! - -# no image! - -# docs = [ -# Document( -# content="""That's good. I like it.""" * 700, # extra long text to check truncation -# meta={"name": "0"}, -# id="1", -# ), -# Document(content="""That's bad. 
I don't like it.""", meta={"name": "1"}, id="2"), -# ] -# results = document_classifier.predict(documents=docs) -# expected_labels = ["joy", "sadness"] -# for i, doc in enumerate(results): -# assert doc.to_dict()["meta"]["classification"]["label"] == expected_labels[i] - + results_0 = image_to_text.run(file_paths=IMAGE_FILE_PATHS) + image_paths_0 = [doc.meta["image_path"] for doc in results_0[0]["documents"]] + assert image_paths_0 == IMAGE_FILE_PATHS + generated_captions_0 = [doc.content for doc in results_0[0]["documents"]] + assert generated_captions_0 == EXPECTED_CAPTIONS -# # test node -# ti2t = TransformersImageToText(model_name_or_path="nlpconnect/vit-gpt2-image-captioning", batch_size=1, generation_kwargs={'max_new_tokens':50}) -# # print(ti2t.generate_captions(image_file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg'))) + results_1 = image_to_text.run(documents=IMAGE_DOCS) + image_paths_1 = [doc.meta["image_path"] for doc in results_1[0]["documents"]] + assert image_paths_1 == IMAGE_FILE_PATHS + generated_captions_1 = [doc.content for doc in results_1[0]["documents"]] + assert generated_captions_1 == EXPECTED_CAPTIONS -# # # test in a pipeline, passing file_paths -# # from haystack.pipelines import Pipeline + results_2 = image_to_text.run(file_paths=IMAGE_FILE_PATHS[:3], documents=IMAGE_DOCS[3:]) + image_paths_2 = [doc.meta["image_path"] for doc in results_2[0]["documents"]] + assert image_paths_2 == IMAGE_FILE_PATHS + generated_captions_2 = [doc.content for doc in results_2[0]["documents"]] + assert generated_captions_2 == EXPECTED_CAPTIONS -# # p = Pipeline() -# # p.add_node(component=ti2t, name="ti2t", inputs=["File"]) - -# # print(p.run(file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg')[:2])) - -# # test in a pipeline, passing documents -# from haystack.pipelines import Pipeline -# # from haystack.document_stores import InMemoryDocumentStore -# from haystack import Document - -# # ds = InMemoryDocumentStore() -# file_paths=glob.glob('/home/anakin87/apps/haystack/test/samples/images/*.jpg') - -# docs= [] -# for path in file_paths: -# doc = Document(content=path, content_type="image") -# docs.append(doc) - -# print(ti2t.run(documents=docs)) +@pytest.mark.integration +def test_image_to_text_invalid_image(image_to_text): + with pytest.raises(UnidentifiedImageError, match="cannot identify image file"): + image_to_text.run(file_paths=[INVALID_IMAGE_FILE_PATH]) From 3b7fedf4464f193475d9884401343215c801d9ab Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:11:49 +0100 Subject: [PATCH 08/11] fix import order --- test/nodes/test_image_to_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py index 9ed33b4310..fa752caf13 100644 --- a/test/nodes/test_image_to_text.py +++ b/test/nodes/test_image_to_text.py @@ -1,6 +1,6 @@ +import os import pytest -import os from PIL import UnidentifiedImageError from haystack import Document From 29504ee5a241e90d26a9421a25b0f88d7ea8f437 Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:16:10 +0100 Subject: [PATCH 09/11] fix docstring --- haystack/nodes/image_to_text/base.py | 2 +- haystack/nodes/image_to_text/transformers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/nodes/image_to_text/base.py b/haystack/nodes/image_to_text/base.py index 841c162c7a..e8e41d8af8 
100644 --- a/haystack/nodes/image_to_text/base.py +++ b/haystack/nodes/image_to_text/base.py @@ -22,7 +22,7 @@ def generate_captions( :param image_file_paths: Paths of the images :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. - See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. """ diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 9e7c18d0f3..233e640cd2 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -111,7 +111,7 @@ def generate_captions( :param image_file_paths: Paths of the images :param generation_kwargs: Dictionary containing arguments for the generate method of the Hugging Face model. - See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate + See https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate :param batch_size: Number of images to process at a time. :return: List of Documents. Document.content is the caption. Document.meta["image_file_path"] contains the image file path. """ From bac8393b7689104b84d85d98f0a2a78dcfa3d98f Mon Sep 17 00:00:00 2001 From: anakin87 <44616784+anakin87@users.noreply.github.com> Date: Tue, 17 Jan 2023 23:22:07 +0100 Subject: [PATCH 10/11] other fix docstring --- haystack/nodes/image_to_text/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py index 233e640cd2..8a5cc471ef 100644 --- a/haystack/nodes/image_to_text/transformers.py +++ b/haystack/nodes/image_to_text/transformers.py @@ -38,7 +38,7 @@ class TransformersImageToText(BaseImageToText): "content": "a red apple is sitting on a pile of hay", ... "meta": { - "image_file_path": "/path/to/images/apple.jpg", + "image_path": "/path/to/images/apple.jpg", ... }, ... 
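
With the docstring example now matching the `image_path` meta key, the node's behavior up to this point is settled: the `generation_kwargs` passed to the constructor act as a default, and `generate_captions()` can override them per call (`generation_kwargs = generation_kwargs or self.generation_kwargs`). The snippet below is a minimal sketch of that interplay, not part of the patches; the image path is a placeholder borrowed from the test fixtures, and the printed caption is simply the value the tests expect.

```python
from haystack.nodes.image_to_text import TransformersImageToText

# Constructor-level generation_kwargs act as the default for every call.
image_to_text = TransformersImageToText(
    model_name_or_path="nlpconnect/vit-gpt2-image-captioning",
    generation_kwargs={"max_new_tokens": 50},
)

# Uses the default generation_kwargs set in the constructor.
docs = image_to_text.generate_captions(image_file_paths=["test/samples/images/apple.jpg"])
print(docs[0].content)             # e.g. "a red apple is sitting on a pile of hay"
print(docs[0].meta["image_path"])  # "test/samples/images/apple.jpg"

# Per-call generation_kwargs take precedence over the constructor default.
short_docs = image_to_text.generate_captions(
    image_file_paths=["test/samples/images/apple.jpg"],
    generation_kwargs={"max_new_tokens": 10},
)
```
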
From af7fbc25a442ab3b1389fad18963b3d2c5a9572e Mon Sep 17 00:00:00 2001
From: anakin87 <44616784+anakin87@users.noreply.github.com>
Date: Wed, 18 Jan 2023 20:14:19 +0100
Subject: [PATCH 11/11] more and better tests

---
 haystack/errors.py                           |  7 ++
 haystack/nodes/image_to_text/transformers.py | 35 +++++++---
 test/nodes/test_image_to_text.py             | 69 ++++++++++++++------
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/haystack/errors.py b/haystack/errors.py
index f4ca3acdba..59d134888a 100644
--- a/haystack/errors.py
+++ b/haystack/errors.py
@@ -147,3 +147,10 @@ def __init__(
     ):
         super().__init__(message=message, send_message_in_event=send_message_in_event)
         self.status_code = status_code
+
+
+class ImageToTextError(NodeError):
+    """Exception for issues that occur in the ImageToText node"""
+
+    def __init__(self, message: Optional[str] = None):
+        super().__init__(message=message)
diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 8a5cc471ef..9786a58812 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -10,10 +10,16 @@
 from haystack.nodes.image_to_text.base import BaseImageToText
 from haystack.modeling.utils import initialize_device_settings
 from haystack.utils.torch_utils import ListDataset
+from haystack.errors import ImageToTextError
 
 logger = logging.getLogger(__name__)
 
 
+# supported model classes should be extended when the HF image-to-text pipeline will support more classes
+# see https://github.com/huggingface/transformers/issues/21110
+SUPPORTED_MODELS_CLASSES = ["VisionEncoderDecoderModel"]
+
+
 class TransformersImageToText(BaseImageToText):
     """
     Transformer based model to generate captions for images using the HuggingFace's transformers framework
@@ -99,6 +105,15 @@ def __init__(
             device=self.devices[0],
             use_auth_token=use_auth_token,
         )
+
+        model_class_name = self.model.model.__class__.__name__
+        if model_class_name not in SUPPORTED_MODELS_CLASSES:
+            raise ValueError(
+                f"The model of class '{model_class_name}' is not supported for ImageToText. "
+                f"The supported classes are: {SUPPORTED_MODELS_CLASSES}. "
+                f"You can find the available models here: https://huggingface.co/models?pipeline_tag=image-to-text."
+ ) + self.generation_kwargs = generation_kwargs self.batch_size = batch_size self.progress_bar = progress_bar @@ -119,19 +134,23 @@ def generate_captions( batch_size = batch_size or self.batch_size if len(image_file_paths) == 0: - raise AttributeError("ImageToText needs at least one filepath to produce a caption.") + raise ImageToTextError("ImageToText needs at least one filepath to produce a caption.") images_dataset = ListDataset(image_file_paths) captions: List[str] = [] - for captions_batch in tqdm( - self.model(images_dataset, generate_kwargs=generation_kwargs, batch_size=batch_size), - disable=not self.progress_bar, - total=len(images_dataset), - desc="Generating captions", - ): - captions.append("".join([el["generated_text"] for el in captions_batch]).strip()) + try: + for captions_batch in tqdm( + self.model(images_dataset, generate_kwargs=generation_kwargs, batch_size=batch_size), + disable=not self.progress_bar, + total=len(images_dataset), + desc="Generating captions", + ): + captions.append("".join([el["generated_text"] for el in captions_batch]).strip()) + + except Exception as exc: + raise ImageToTextError(str(exc)) from exc result: List[Document] = [] for caption, image_file_path in zip(captions, image_file_paths): diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py index fa752caf13..2494b08c94 100644 --- a/test/nodes/test_image_to_text.py +++ b/test/nodes/test_image_to_text.py @@ -1,11 +1,11 @@ import os import pytest -from PIL import UnidentifiedImageError - from haystack import Document from haystack.nodes.image_to_text.transformers import TransformersImageToText from haystack.nodes.image_to_text.base import BaseImageToText +from haystack.errors import ImageToTextError + from ..conftest import SAMPLES_PATH @@ -13,7 +13,6 @@ IMAGE_FILE_NAMES = ["apple.jpg", "car.jpg", "cat.jpg", "galaxy.jpg", "paris.jpg"] IMAGE_FILE_PATHS = [os.path.join(SAMPLES_PATH, "images", file_name) for file_name in IMAGE_FILE_NAMES] IMAGE_DOCS = [Document(content=image_path, content_type="image") for image_path in IMAGE_FILE_PATHS] -INVALID_IMAGE_FILE_PATH = str(SAMPLES_PATH / "markdown" / "sample.md") EXPECTED_CAPTIONS = [ "a red apple is sitting on a pile of hay", @@ -34,29 +33,57 @@ def image_to_text(): @pytest.mark.integration -def test_image_to_text(image_to_text): +def test_image_to_text_from_files(image_to_text): assert isinstance(image_to_text, BaseImageToText) - results_0 = image_to_text.run(file_paths=IMAGE_FILE_PATHS) - image_paths_0 = [doc.meta["image_path"] for doc in results_0[0]["documents"]] - assert image_paths_0 == IMAGE_FILE_PATHS - generated_captions_0 = [doc.content for doc in results_0[0]["documents"]] - assert generated_captions_0 == EXPECTED_CAPTIONS + results = image_to_text.run(file_paths=IMAGE_FILE_PATHS) + image_paths = [doc.meta["image_path"] for doc in results[0]["documents"]] + assert image_paths == IMAGE_FILE_PATHS + generated_captions = [doc.content for doc in results[0]["documents"]] + assert generated_captions == EXPECTED_CAPTIONS + + +@pytest.mark.integration +def test_image_to_text_from_documents(image_to_text): + results = image_to_text.run(documents=IMAGE_DOCS) + image_paths = [doc.meta["image_path"] for doc in results[0]["documents"]] + assert image_paths == IMAGE_FILE_PATHS + generated_captions = [doc.content for doc in results[0]["documents"]] + assert generated_captions == EXPECTED_CAPTIONS - results_1 = image_to_text.run(documents=IMAGE_DOCS) - image_paths_1 = [doc.meta["image_path"] for doc in results_1[0]["documents"]] - 
assert image_paths_1 == IMAGE_FILE_PATHS - generated_captions_1 = [doc.content for doc in results_1[0]["documents"]] - assert generated_captions_1 == EXPECTED_CAPTIONS - results_2 = image_to_text.run(file_paths=IMAGE_FILE_PATHS[:3], documents=IMAGE_DOCS[3:]) - image_paths_2 = [doc.meta["image_path"] for doc in results_2[0]["documents"]] - assert image_paths_2 == IMAGE_FILE_PATHS - generated_captions_2 = [doc.content for doc in results_2[0]["documents"]] - assert generated_captions_2 == EXPECTED_CAPTIONS +@pytest.mark.integration +def test_image_to_text_from_files_and_documents(image_to_text): + results = image_to_text.run(file_paths=IMAGE_FILE_PATHS[:3], documents=IMAGE_DOCS[3:]) + image_paths = [doc.meta["image_path"] for doc in results[0]["documents"]] + assert image_paths == IMAGE_FILE_PATHS + generated_captions = [doc.content for doc in results[0]["documents"]] + assert generated_captions == EXPECTED_CAPTIONS @pytest.mark.integration def test_image_to_text_invalid_image(image_to_text): - with pytest.raises(UnidentifiedImageError, match="cannot identify image file"): - image_to_text.run(file_paths=[INVALID_IMAGE_FILE_PATH]) + markdown_path = str(SAMPLES_PATH / "markdown" / "sample.md") + with pytest.raises(ImageToTextError, match="cannot identify image file"): + image_to_text.run(file_paths=[markdown_path]) + + +@pytest.mark.integration +def test_image_to_text_incorrect_path(image_to_text): + with pytest.raises(ImageToTextError, match="Incorrect path"): + image_to_text.run(file_paths=["wrong_path.jpg"]) + + +@pytest.mark.integration +def test_image_to_text_not_image_document(image_to_text): + textual_document = Document(content="this document is textual", content_type="text") + with pytest.raises(ValueError, match="The ImageToText node only supports image documents."): + image_to_text.run(documents=[textual_document]) + + +@pytest.mark.integration +def test_image_to_text_unsupported_model(): + with pytest.raises( + ValueError, match="The model of class 'BertForQuestionAnswering' is not supported for ImageToText" + ): + _ = TransformersImageToText(model_name_or_path="deepset/minilm-uncased-squad2")
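
Taken together, the series leaves the node usable standalone or inside a pipeline. The sketch below walks through the resulting API end to end; it is only an illustration of the code added above, the image paths are placeholders based on the test fixtures, and the expected captions come from the tests.

```python
from haystack import Document
from haystack.pipelines import Pipeline
from haystack.nodes.image_to_text import TransformersImageToText

image_to_text = TransformersImageToText(
    model_name_or_path="nlpconnect/vit-gpt2-image-captioning",
    generation_kwargs={"max_new_tokens": 50},
)

# Standalone: run() accepts raw image file paths, image Documents, or both at once.
image_docs = [Document(content="test/samples/images/cat.jpg", content_type="image")]
results, _ = image_to_text.run(file_paths=["test/samples/images/apple.jpg"], documents=image_docs)
for doc in results["documents"]:
    print(doc.meta["image_path"], "->", doc.content)

# In a pipeline, the node is fed from the File input, as sketched in the scratch code added in patch 06.
pipeline = Pipeline()
pipeline.add_node(component=image_to_text, name="image_to_text", inputs=["File"])
output = pipeline.run(file_paths=["test/samples/images/paris.jpg"])
print(output["documents"][0].content)  # e.g. "a city with a large building and a clock tower"

# Invalid or missing image files raise ImageToTextError (patch 11);
# non-image Documents and unsupported model classes raise ValueError.
```
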