From 77eb0c2c879f44fb7db751ab7e80c8c88e131380 Mon Sep 17 00:00:00 2001
From: anakin87 <44616784+anakin87@users.noreply.github.com>
Date: Sun, 14 May 2023 18:38:44 +0200
Subject: [PATCH 1/2] add blip support

---
 haystack/nodes/image_to_text/transformers.py | 15 ++++++++-------
 test/nodes/test_image_to_text.py             |  9 ---------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 25357c5411..5b3ea917b0 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -17,7 +17,11 @@
 # supported models classes should be extended when HF image-to-text pipeline willl support more classes
 # see /~https://github.com/huggingface/transformers/issues/21110
-SUPPORTED_MODELS_CLASSES = ["VisionEncoderDecoderModel"]
+SUPPORTED_MODELS_CLASSES = [
+    "VisionEncoderDecoderModel",
+    "BlipForConditionalGeneration",
+    "Blip2ForConditionalGeneration",
+]
 
 UNSUPPORTED_MODEL_MESSAGE = (
     f"The supported classes are: {SUPPORTED_MODELS_CLASSES}. \n"
@@ -33,8 +37,6 @@ class TransformersImageToText(BaseImageToText):
     """
     A transformer-based model to generate captions for images using the Hugging Face's transformers framework.
 
-    Currently, this node supports `VisionEncoderDecoderModel` models.
-
     **Example**
 
     ```python
@@ -64,7 +66,7 @@ class TransformersImageToText(BaseImageToText):
 
     def __init__(
         self,
-        model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning",
+        model_name_or_path: str = "Salesforce/blip-image-captioning-base",
         model_version: Optional[str] = None,
         generation_kwargs: Optional[dict] = None,
         use_gpu: bool = True,
@@ -74,15 +76,14 @@ def __init__(
         devices: Optional[List[Union[str, torch.device]]] = None,
     ):
         """
-        Load a `VisionEncoderDecoderModel` model from transformers.
+        Load a an Image-to-Text model from transformers.
 
         :param model_name_or_path: Directory of a saved model or the name of a public model.
-                                   Currently, only `VisionEncoderDecoderModel` models are supported.
                                    To find these models:
                                    1. Visit [Hugging Face image to text models](https://huggingface.co/models?pipeline_tag=image-to-text).`
                                    2. Open the model you want to check.
                                    3. On the model page, go to the "Files and Versions" tab.
-                                   4. Open the `config.json` file and make sure the `architectures` field contains `VisionEncoderDecoderModel`.
+                                   4. Open the `config.json` file and make sure the `architectures` field contains `VisionEncoderDecoderModel`, `BlipForConditionalGeneration`, or `Blip2ForConditionalGeneration`.
         :param model_version: The version of the model to use from the Hugging Face model hub. This can be the tag name, branch name, or commit hash.
         :param generation_kwargs: Dictionary containing arguments for the `generate()` method of the Hugging Face model.
                                   See [generate()](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) in Hugging Face documentation.
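
Editor's note: the check behind the extended `SUPPORTED_MODELS_CLASSES` list (step 4 of the docstring above) can be illustrated outside Haystack. The snippet below is a minimal sketch, not the node's actual implementation; it only assumes that transformers' `AutoConfig` exposes the `architectures` field of a model's `config.json`, and the helper function name is hypothetical.

```python
# Minimal sketch of a pre-load architecture check (illustrative, not Haystack's code).
from transformers import AutoConfig

SUPPORTED_MODELS_CLASSES = [
    "VisionEncoderDecoderModel",
    "BlipForConditionalGeneration",
    "Blip2ForConditionalGeneration",
]


def is_supported_image_to_text_model(model_name_or_path: str) -> bool:
    # Read only the model config (no weights) and inspect its `architectures` field.
    config = AutoConfig.from_pretrained(model_name_or_path)
    architectures = getattr(config, "architectures", None) or []
    return any(arch in SUPPORTED_MODELS_CLASSES for arch in architectures)


# "Salesforce/blip-image-captioning-base" declares BlipForConditionalGeneration,
# so with the extended list this should print True.
print(is_supported_image_to_text_model("Salesforce/blip-image-captioning-base"))
```
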
diff --git a/test/nodes/test_image_to_text.py b/test/nodes/test_image_to_text.py
index 91ffe11f64..689fe7e13f 100644
--- a/test/nodes/test_image_to_text.py
+++ b/test/nodes/test_image_to_text.py
@@ -91,12 +91,3 @@ def test_image_to_text_unsupported_model_after_loading():
         match="The model 'deepset/minilm-uncased-squad2' \(class 'BertForQuestionAnswering'\) is not supported for ImageToText",
     ):
         _ = TransformersImageToText(model_name_or_path="deepset/minilm-uncased-squad2")
-
-
-@pytest.mark.integration
-def test_image_to_text_unsupported_model_before_loading():
-    with pytest.raises(
-        ValueError,
-        match=r"The model '.*' \(class '.*'\) is not supported for ImageToText. The supported classes are: \['VisionEncoderDecoderModel'\]",
-    ):
-        _ = TransformersImageToText(model_name_or_path="Salesforce/blip-image-captioning-base")

From 150fe006643f993361e2ed61fc14d71bd4f63d92 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Date: Mon, 15 May 2023 18:08:26 +0200
Subject: [PATCH 2/2] fix typo

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
---
 haystack/nodes/image_to_text/transformers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haystack/nodes/image_to_text/transformers.py b/haystack/nodes/image_to_text/transformers.py
index 5b3ea917b0..d7f9c04bce 100644
--- a/haystack/nodes/image_to_text/transformers.py
+++ b/haystack/nodes/image_to_text/transformers.py
@@ -76,7 +76,7 @@ def __init__(
         devices: Optional[List[Union[str, torch.device]]] = None,
     ):
         """
-        Load a an Image-to-Text model from transformers.
+        Load an Image-to-Text model from transformers.
 
         :param model_name_or_path: Directory of a saved model or the name of a public model.
                                    To find these models:
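
Editor's note: for readers trying out what this patch series enables, here is a minimal usage sketch. It assumes the Haystack 1.x `TransformersImageToText` node and its `generate_captions()` method as shown in the node's own docstring example; the image path and the `max_new_tokens` value are hypothetical placeholders. Check parameter names against the installed Haystack version before relying on them.

```python
# Minimal usage sketch: caption an image with the new default BLIP model.
from haystack.nodes import TransformersImageToText

# With this patch, the node defaults to the BLIP captioning model,
# so model_name_or_path could also be omitted.
image_to_text = TransformersImageToText(
    model_name_or_path="Salesforce/blip-image-captioning-base",
    generation_kwargs={"max_new_tokens": 50},
)

# generate_captions() returns one Document per image; the caption is its content.
documents = image_to_text.generate_captions(image_file_paths=["/path/to/images/cat.jpg"])
print(documents[0].content)
```
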