From 4323e34dbcfb13b1ec1894fb14eee1a835358bee Mon Sep 17 00:00:00 2001
From: Akshay Ballal <arballal95@gmail.com>
Date: Mon, 21 Oct 2024 23:40:27 +0200
Subject: [PATCH 1/3] add colpali export support

---
 optimum/exporters/onnx/model_configs.py | 57 +++++++++++++++++++++++++
 optimum/exporters/onnx/model_patcher.py | 18 ++++++++
 optimum/exporters/tasks.py              |  4 ++
 3 files changed, 79 insertions(+)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 36963a986d0..79d8228ec6b 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -73,6 +73,7 @@
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
 from .model_patcher import (
     CLIPModelPatcher,
+    ColPaliModelPatcher,
     FalconModelPatcher,
     MistralModelPatcher,
     MusicgenModelPatcher,
@@ -2310,3 +2311,59 @@ def overwrite_shape_and_generate_input(
 
 class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
+
+
+class PaliGemmaOnnxConfig(GemmaOnnxConfig):
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator)
+
+    NORMALIZED_CONFIG_CLASS = NormalizedTextAndVisionConfig.with_args(
+        text_config="text_config", vision_config="vision_config"
+    )
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        dynamic_axis = {0: "batch_size", 1: "sequence_length"}
+        # Only feature extraction also takes pixel_values; a task matching neither branch returns None.
+        if self.task == "feature-extraction":
+            return {
+                "input_ids": dynamic_axis,
+                "attention_mask": dynamic_axis,
+                "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+            }
+        elif self.task == "text-generation":
+            return {
+                "input_ids": dynamic_axis,
+                "attention_mask": dynamic_axis,
+            }
+
+    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
+        """Generate dummy inputs; PyTorch feature extraction gets 1024 image tokens prepended to `input_ids`."""
+        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
+        # Other frameworks and tasks return the parent's dummy inputs unchanged.
+        if framework == "pt":
+            # The generator below is built without **kwargs, so its own sizes may differ from dummy_inputs.
+            if self.task == "feature-extraction":
+                generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config)
+                prefix_tensor = generator.constant_tensor(
+                    shape=[dummy_inputs["input_ids"].shape[0], 1024],
+                    value=self._normalized_config.image_token_index,
+                    framework=framework,
+                )
+                dummy_inputs["input_ids"] = generator.concat_inputs([prefix_tensor, dummy_inputs["input_ids"]], dim=1)
+                dummy_inputs["attention_mask"] = generator.random_mask_tensor(
+                    shape=list(dummy_inputs["input_ids"].shape),  # stay aligned with the prefixed input_ids
+                    padding_side=generator.padding_side,
+                    framework=framework,
+                    dtype="int64",
+                )
+        return dummy_inputs
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        """Use `ColPaliModelPatcher` for feature extraction; otherwise defer to the parent config's patcher."""
+        if self.task == "feature-extraction":
+            return ColPaliModelPatcher(self, model, model_kwargs=model_kwargs)
+        else:
+            return super().patch_model_for_export(model, model_kwargs=model_kwargs)
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 34ed5fcae46..29874360233 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -510,6 +510,24 @@ def patched_forward(*args, **kwargs):
         self.patched_forward = patched_forward
 
 
+class ColPaliModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(config, model, model_kwargs)
+        # Pass-through wrapper: forwards the three named inputs and any extra kwargs to orig_forward as-is.
+        def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs):
+            outputs = self.orig_forward(
+                input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs
+            )
+            return outputs
+
+        self.patched_forward = patched_forward
+
+
 class SAMModelPatcher(ModelPatcher):
     def __init__(
         self,
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index a489f34fb06..83791db3eec 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -915,6 +915,10 @@ class TasksManager:
             "text-classification",
             onnx="LlamaOnnxConfig",
         ),
+        "paligemma": supported_tasks_mapping(
+            "feature-extraction",
+            onnx="PaliGemmaOnnxConfig",
+        ),
         "pegasus": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",

From a1f7c06b559b139f270d3467dbabc1c122fcc9c6 Mon Sep 17 00:00:00 2001
From: Akshay Ballal <arballal95@gmail.com>
Date: Sun, 8 Dec 2024 23:23:34 +0100
Subject: [PATCH 2/3] colpali exporter

---
 optimum/exporters/onnx/model_configs.py | 45 +++++++++++++++++++++++++
 optimum/exporters/onnx/model_patcher.py | 15 +++++++++
 optimum/exporters/tasks.py              |  4 +++
 3 files changed, 64 insertions(+)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 16f8d38c836..16ae22ce1bd 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -2475,3 +2475,48 @@ class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
 
     DEFAULT_ONNX_OPSET = 14  # uses SDPA in Transformers, hence opset>=14.
+
+class PaliGemmaOnnxConfig(GemmaOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator)
+    NORMALIZED_CONFIG_CLASS = NormalizedTextAndVisionConfig.with_args(
+        text_config="text_config", vision_config="vision_config"
+    )
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        dynamic_axis = {0: "batch_size", 1: "sequence_length"}
+        if self.task == "feature-extraction":
+            return {
+                "input_ids": dynamic_axis,
+                "attention_mask": dynamic_axis,
+                "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+            }
+        elif self.task == "text-generation":
+            return {
+                "input_ids": dynamic_axis,
+                "attention_mask": dynamic_axis,
+            }
+    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
+        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
+        if framework == "pt":
+            if self.task == "feature-extraction":
+                generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config)
+                prefix_tensor = generator.constant_tensor(
+                    shape=[dummy_inputs["input_ids"].shape[0], 1024],
+                    value=self._normalized_config.image_token_index,
+                    framework=framework,
+                )
+                dummy_inputs["input_ids"] = generator.concat_inputs([prefix_tensor, dummy_inputs["input_ids"]], dim=1)
+                dummy_inputs["attention_mask"] = generator.random_mask_tensor(
+                    shape=[generator.batch_size, generator.sequence_length + 1024],
+                    padding_side=generator.padding_side,
+                    framework=framework,
+                    dtype="int64",
+                )
+        return dummy_inputs
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        if self.task == "feature-extraction":
+            return ColPaliModelPatcher(self, model, model_kwargs=model_kwargs)
+        else:
+            return super().patch_model_for_export(model, model_kwargs=model_kwargs)
\ No newline at end of file
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 12433fafe73..73092ac1929 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -1172,3 +1172,18 @@ def __exit__(self, exc_type, exc_value, traceback):
             from transformers.models.clip.modeling_clip import CLIPSdpaAttention
 
             CLIPSdpaAttention.forward = self.original_sdpa_forward
+
+class ColPaliModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(config, model, model_kwargs)
+        def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs):
+            outputs = self.orig_forward(
+                input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs
+            )
+            return outputs
+        self.patched_forward = patched_forward
\ No newline at end of file
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 0a3758e97cf..6cb4e8cd439 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -949,6 +949,10 @@ class TasksManager:
             "text-generation-with-past",
             onnx="GraniteOnnxConfig",
         ),
+        "paligemma": supported_tasks_mapping(
+            "feature-extraction",
+            onnx="PaliGemmaOnnxConfig",
+        ),
         "pegasus": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",

From ea04509915bcd731ca3e86046bc5702f4fbd7cc1 Mon Sep 17 00:00:00 2001
From: Akshay Ballal <arballal95@gmail.com>
Date: Sat, 21 Dec 2024 14:02:39 +0100
Subject: [PATCH 3/3] add test and make num_image_tokens dynamic

---
 optimum/exporters/onnx/model_configs.py |  6 +++---
 optimum/exporters/onnx/model_patcher.py | 14 --------------
 optimum/exporters/tasks.py              |  1 +
 tests/exporters/exporters_utils.py      |  1 +
 4 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 16ae22ce1bd..400671cabd3 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -2490,7 +2490,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
                 "attention_mask": dynamic_axis,
                 "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
             }
-        elif self.task == "text-generation":
+        elif self.task == "image-to-text":
             return {
                 "input_ids": dynamic_axis,
                 "attention_mask": dynamic_axis,
@@ -2501,13 +2501,13 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
             if self.task == "feature-extraction":
                 generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config)
                 prefix_tensor = generator.constant_tensor(
-                    shape=[dummy_inputs["input_ids"].shape[0], 1024],
+                    shape=[dummy_inputs["input_ids"].shape[0], self._normalized_config.vision_config.num_image_tokens],
                     value=self._normalized_config.image_token_index,
                     framework=framework,
                 )
                 dummy_inputs["input_ids"] = generator.concat_inputs([prefix_tensor, dummy_inputs["input_ids"]], dim=1)
                 dummy_inputs["attention_mask"] = generator.random_mask_tensor(
-                    shape=[generator.batch_size, generator.sequence_length + 1024],
+                    shape=list(dummy_inputs["input_ids"].shape),  # stay aligned with the prefixed input_ids
                     padding_side=generator.padding_side,
                     framework=framework,
                     dtype="int64",
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
index 73092ac1929..2c5e6e3e588 100644
--- a/optimum/exporters/onnx/model_patcher.py
+++ b/optimum/exporters/onnx/model_patcher.py
@@ -1173,17 +1173,3 @@ def __exit__(self, exc_type, exc_value, traceback):
 
             CLIPSdpaAttention.forward = self.original_sdpa_forward
 
-class ColPaliModelPatcher(ModelPatcher):
-    def __init__(
-        self,
-        config: "OnnxConfig",
-        model: Union["PreTrainedModel", "TFPreTrainedModel"],
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ):
-        super().__init__(config, model, model_kwargs)
-        def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs):
-            outputs = self.orig_forward(
-                input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs
-            )
-            return outputs
-        self.patched_forward = patched_forward
\ No newline at end of file
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 6cb4e8cd439..9f5fee49d93 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -951,6 +951,7 @@ class TasksManager:
         ),
         "paligemma": supported_tasks_mapping(
             "feature-extraction",
+            "image-to-text",
             onnx="PaliGemmaOnnxConfig",
         ),
         "pegasus": supported_tasks_mapping(
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 32156d9eebf..4dfddcd6bce 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -129,6 +129,7 @@
     "opt": "hf-internal-testing/tiny-random-OPTModel",
     "owlv2": "hf-internal-testing/tiny-random-Owlv2Model",
     "owlvit": "hf-tiny-model-private/tiny-random-OwlViTModel",
+    "paligemma": {"hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration": ["image-to-text", "feature-extraction"]},
     "pegasus": "hf-internal-testing/tiny-random-PegasusModel",
     "perceiver": {
         "hf-internal-testing/tiny-random-language_perceiver": ["fill-mask", "text-classification"],