From 77eb0c2c879f44fb7db751ab7e80c8c88e131380 Mon Sep 17 00:00:00 2001
From: anakin87 <44616784+anakin87@users.noreply.github.com>
Date: Sun, 14 May 2023 18:38:44 +0200
Subject: [PATCH 1/2] add blip support

---
 haystack/nodes/image_to_text/transformers.py | 15 ++++++++-------
 test/nodes/test_image_to_text.py             |  9 ---------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 25357c5411..5b3ea917b0 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -17,7 +17,11 @@
 # supported models classes should be extended when HF image-to-text pipeline willl support more classes
 # see /~https://github.com/huggingface/transformers/issues/21110
-SUPPORTED_MODELS_CLASSES = ["VisionEncoderDecoderModel"]
+SUPPORTED_MODELS_CLASSES = [
+    "VisionEncoderDecoderModel",
+    "BlipForConditionalGeneration",
+    "Blip2ForConditionalGeneration",
+]
 
 UNSUPPORTED_MODEL_MESSAGE = (
     f"The supported classes are: {SUPPORTED_MODELS_CLASSES}. \n"
@@ -33,8 +37,6 @@ class TransformersImageToText(BaseImageToText):
     """
     A transformer-based model to generate captions for images using the Hugging Face's transformers framework.
 
-    Currently, this node supports `VisionEncoderDecoderModel` models.
-
     **Example**
 
     ```python
@@ -64,7 +66,7 @@ class TransformersImageToText(BaseImageToText):
 
     def __init__(
         self,
-        model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning",
+        model_name_or_path: str = "Salesforce/blip-image-captioning-base",
         model_version: Optional[str] = None,
         generation_kwargs: Optional[dict] = None,
         use_gpu: bool = True,
@@ -74,15 +76,14 @@ def __init__(
         devices: Optional[List[Union[str, torch.device]]] = None,
     ):
         """
-        Load a `VisionEncoderDecoderModel` model from transformers.
+        Load a an Image-to-Text model from transformers.
 
         :param model_name_or_path: Directory of a saved model or the name of a public model.
-                                   Currently, only `VisionEncoderDecoderModel` models are supported.
                                    To find these models:
                                    1. Visit [Hugging Face image to text models](https://huggingface.co/models?pipeline_tag=image-to-text).`
                                    2. Open the model you want to check.
                                    3. On the model page, go to the "Files and Versions" tab.
-                                   4. Open the `config.json` file and make sure the `architectures` field contains `VisionEncoderDecoderModel`.
+                                   4. Open the `config.json` file and make sure the `architectures` field contains `VisionEncoderDecoderModel`, `BlipForConditionalGeneration`, or `Blip2ForConditionalGeneration`.
         :param model_version: The version of the model to use from the Hugging Face model hub. This can be the tag name, branch name, or commit hash.
         :param generation_kwargs: Dictionary containing arguments for the `generate()` method of the Hugging Face model.
                                   See [generate()](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) in Hugging Face documentation.
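
Editor's note: the check behind the extended `SUPPORTED_MODELS_CLASSES` list (step 4 of the docstring above) can be illustrated outside Haystack. The snippet below is a minimal sketch, not the node's actual implementation; it only assumes that transformers' `AutoConfig` exposes the `architectures` field of a model's `config.json`, and the helper function name is hypothetical.

```python
# Minimal sketch of a pre-load architecture check (illustrative, not Haystack's code).
from transformers import AutoConfig

SUPPORTED_MODELS_CLASSES = [
    "VisionEncoderDecoderModel",
    "BlipForConditionalGeneration",
    "Blip2ForConditionalGeneration",
]


def is_supported_image_to_text_model(model_name_or_path: str) -> bool:
    # Read only the model config (no weights) and inspect its `architectures` field.
    config = AutoConfig.from_pretrained(model_name_or_path)
    architectures = getattr(config, "architectures", None) or []
    return any(arch in SUPPORTED_MODELS_CLASSES for arch in architectures)


# "Salesforce/blip-image-captioning-base" declares BlipForConditionalGeneration,
# so with the extended list this should print True.
print(is_supported_image_to_text_model("Salesforce/blip-image-captioning-base"))
```
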
diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py
index 91ffe11f64..689fe7e13f 100644
--- a/test/nodes/test_image_to_text.py
+++ b/test/nodes/test_image_to_text.py
@@ -91,12 +91,3 @@ def test_image_to_text_unsupported_model_after_loading():
         match="The model 'deepset/minilm-uncased-squad2' \(class 'BertForQuestionAnswering'\) is not supported for ImageToText",
     ):
         _ = TransformersImageToText(model_name_or_path="deepset/minilm-uncased-squad2")
-
-
-@pytest.mark.integration
-def test_image_to_text_unsupported_model_before_loading():
-    with pytest.raises(
-        ValueError,
-        match=r"The model '.*' \(class '.*'\) is not supported for ImageToText. The supported classes are: \['VisionEncoderDecoderModel'\]",
-    ):
-        _ = TransformersImageToText(model_name_or_path="Salesforce/blip-image-captioning-base")

From 150fe006643f993361e2ed61fc14d71bd4f63d92 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Date: Mon, 15 May 2023 18:08:26 +0200
Subject: [PATCH 2/2] fix typo

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
---
 haystack/nodes/image_to_text/transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 5b3ea917b0..d7f9c04bce 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -76,7 +76,7 @@ def __init__(
         devices: Optional[List[Union[str, torch.device]]] = None,
     ):
         """
-        Load a an Image-to-Text model from transformers.
+        Load an Image-to-Text model from transformers.
 
         :param model_name_or_path: Directory of a saved model or the name of a public model.
                                    To find these models:
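
Editor's note: for readers trying out what this patch series enables, here is a minimal usage sketch. It assumes the Haystack 1.x `TransformersImageToText` node and its `generate_captions()` method as shown in the node's own docstring example; the image path and the `max_new_tokens` value are hypothetical placeholders. Check parameter names against the installed Haystack version before relying on them.

```python
# Minimal usage sketch: caption an image with the new default BLIP model.
from haystack.nodes import TransformersImageToText

# With this patch, the node defaults to the BLIP captioning model,
# so model_name_or_path could also be omitted.
image_to_text = TransformersImageToText(
    model_name_or_path="Salesforce/blip-image-captioning-base",
    generation_kwargs={"max_new_tokens": 50},
)

# generate_captions() returns one Document per image; the caption is its content.
documents = image_to_text.generate_captions(image_file_paths=["/path/to/images/cat.jpg"])
print(documents[0].content)
```
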