From 4323e34dbcfb13b1ec1894fb14eee1a835358bee Mon Sep 17 00:00:00 2001 From: Akshay Ballal Date: Mon, 21 Oct 2024 23:40:27 +0200 Subject: [PATCH 1/3] add colpali export support --- optimum/exporters/onnx/model_configs.py | 57 +++++++++++++++++++++++++ optimum/exporters/onnx/model_patcher.py | 18 ++++++++ optimum/exporters/tasks.py | 4 ++ 3 files changed, 79 insertions(+) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 36963a986d0..79d8228ec6b 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -73,6 +73,7 @@ from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME from .model_patcher import ( CLIPModelPatcher, + ColPaliModelPatcher, FalconModelPatcher, MistralModelPatcher, MusicgenModelPatcher, @@ -2310,3 +2311,59 @@ def overwrite_shape_and_generate_input( class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig + + +class PaliGemmaOnnxConfig(GemmaOnnxConfig): + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator) + + NORMALIZED_CONFIG_CLASS = NormalizedTextAndVisionConfig.with_args( + text_config="text_config", vision_config="vision_config" + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + dynamic_axis = {0: "batch_size", 1: "sequence_length"} + + if self.task == "feature-extraction": + return { + "input_ids": dynamic_axis, + "attention_mask": dynamic_axis, + "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + } + elif self.task == "text-generation": + return { + "input_ids": dynamic_axis, + "attention_mask": dynamic_axis, + } + + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + + dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) + + if framework == "pt": + + if self.task == "feature-extraction": + generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config) + prefix_tensor = generator.constant_tensor( + shape=[dummy_inputs["input_ids"].shape[0], 1024], + value=self._normalized_config.image_token_index, + framework=framework, + ) + dummy_inputs["input_ids"] = generator.concat_inputs([prefix_tensor, dummy_inputs["input_ids"]], dim=1) + dummy_inputs["attention_mask"] = generator.random_mask_tensor( + shape=[generator.batch_size, generator.sequence_length + 1024], + padding_side=generator.padding_side, + framework=framework, + dtype="int64", + ) + return dummy_inputs + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + + if self.task == "feature-extraction": + return ColPaliModelPatcher(self, model, model_kwargs=model_kwargs) + else: + return super().patch_model_for_export(model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 34ed5fcae46..29874360233 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -510,6 +510,24 @@ def patched_forward(*args, **kwargs): self.patched_forward = patched_forward +class ColPaliModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs): + outputs = self.orig_forward( + input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs + ) + return outputs + + self.patched_forward = patched_forward + + class SAMModelPatcher(ModelPatcher): def __init__( self, diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index a489f34fb06..83791db3eec 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -915,6 +915,10 @@ class TasksManager: "text-classification", onnx="LlamaOnnxConfig", ), + "paligemma": supported_tasks_mapping( + "feature-extraction", + onnx="PaliGemmaOnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", From a1f7c06b559b139f270d3467dbabc1c122fcc9c6 Mon Sep 17 00:00:00 2001 From: Akshay Ballal Date: Sun, 8 Dec 2024 23:23:34 +0100 Subject: [PATCH 2/3] colpali exporter --- optimum/exporters/onnx/model_configs.py | 45 +++++++++++++++++++++++++ optimum/exporters/onnx/model_patcher.py | 15 +++++++++ optimum/exporters/tasks.py | 4 +++ 3 files changed, 64 insertions(+) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 16f8d38c836..16ae22ce1bd 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -2475,3 +2475,48 @@ class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. + +class PaliGemmaOnnxConfig(GemmaOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator) + NORMALIZED_CONFIG_CLASS = NormalizedTextAndVisionConfig.with_args( + text_config="text_config", vision_config="vision_config" + ) + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + dynamic_axis = {0: "batch_size", 1: "sequence_length"} + if self.task == "feature-extraction": + return { + "input_ids": dynamic_axis, + "attention_mask": dynamic_axis, + "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + } + elif self.task == "text-generation": + return { + "input_ids": dynamic_axis, + "attention_mask": dynamic_axis, + } + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): + dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) + if framework == "pt": + if self.task == "feature-extraction": + generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config) + prefix_tensor = generator.constant_tensor( + shape=[dummy_inputs["input_ids"].shape[0], 1024], + value=self._normalized_config.image_token_index, + framework=framework, + ) + dummy_inputs["input_ids"] = generator.concat_inputs([prefix_tensor, dummy_inputs["input_ids"]], dim=1) + dummy_inputs["attention_mask"] = generator.random_mask_tensor( + shape=[generator.batch_size, generator.sequence_length + 1024], + padding_side=generator.padding_side, + framework=framework, + dtype="int64", + ) + return dummy_inputs + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + if self.task == "feature-extraction": + return ColPaliModelPatcher(self, model, model_kwargs=model_kwargs) + else: + return super().patch_model_for_export(model, model_kwargs=model_kwargs) \ No newline at end of file diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 12433fafe73..73092ac1929 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -1172,3 +1172,18 @@ def __exit__(self, exc_type, exc_value, traceback): from transformers.models.clip.modeling_clip import CLIPSdpaAttention CLIPSdpaAttention.forward = self.original_sdpa_forward + +class ColPaliModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs): + outputs = self.orig_forward( + input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs + ) + return outputs + self.patched_forward = patched_forward \ No newline at end of file diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 0a3758e97cf..6cb4e8cd439 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -949,6 +949,10 @@ class TasksManager: "text-generation-with-past", onnx="GraniteOnnxConfig", ), + "paligemma": supported_tasks_mapping( + "feature-extraction", + onnx="PaliGemmaOnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", From ea04509915bcd731ca3e86046bc5702f4fbd7cc1 Mon Sep 17 00:00:00 2001 From: Akshay Ballal Date: Sat, 21 Dec 2024 14:02:39 +0100 Subject: [PATCH 3/3] add test and make num_image_tokens dynamic --- optimum/exporters/onnx/model_configs.py | 6 +++--- optimum/exporters/onnx/model_patcher.py | 14 -------------- optimum/exporters/tasks.py | 1 + tests/exporters/exporters_utils.py | 1 + 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 16ae22ce1bd..400671cabd3 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -2490,7 +2490,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: "attention_mask": dynamic_axis, "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, } - elif self.task == "text-generation": + elif self.task == "image-to-text": return { "input_ids": dynamic_axis, "attention_mask": dynamic_axis, @@ -2501,13 +2501,13 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): if self.task == "feature-extraction": generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config) prefix_tensor = generator.constant_tensor( - shape=[dummy_inputs["input_ids"].shape[0], 1024], + shape=[dummy_inputs["input_ids"].shape[0], self._normalized_config.vision_config.num_image_tokens], value=self._normalized_config.image_token_index, framework=framework, ) dummy_inputs["input_ids"] = generator.concat_inputs([prefix_tensor, dummy_inputs["input_ids"]], dim=1) dummy_inputs["attention_mask"] = generator.random_mask_tensor( - shape=[generator.batch_size, generator.sequence_length + 1024], + shape=[generator.batch_size, generator.sequence_length + self._normalized_config.vision_config.num_image_tokens], padding_side=generator.padding_side, framework=framework, dtype="int64", diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 73092ac1929..2c5e6e3e588 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -1173,17 +1173,3 @@ def __exit__(self, exc_type, exc_value, traceback): CLIPSdpaAttention.forward = self.original_sdpa_forward -class ColPaliModelPatcher(ModelPatcher): - def __init__( - self, - config: "OnnxConfig", - model: Union["PreTrainedModel", "TFPreTrainedModel"], - model_kwargs: Optional[Dict[str, Any]] = None, - ): - super().__init__(config, model, model_kwargs) - def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs): - outputs = self.orig_forward( - input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **kwargs - ) - return outputs - self.patched_forward = patched_forward \ No newline at end of file diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 6cb4e8cd439..9f5fee49d93 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -951,6 +951,7 @@ class TasksManager: ), "paligemma": supported_tasks_mapping( "feature-extraction", + "image-to-text", onnx="PaliGemmaOnnxConfig", ), "pegasus": supported_tasks_mapping( diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 32156d9eebf..4dfddcd6bce 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -129,6 +129,7 @@ "opt": "hf-internal-testing/tiny-random-OPTModel", "owlv2": "hf-internal-testing/tiny-random-Owlv2Model", "owlvit": "hf-tiny-model-private/tiny-random-OwlViTModel", + "paligemma": {"hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration": ["image-to-text", "feature-extraction"]}, "pegasus": "hf-internal-testing/tiny-random-PegasusModel", "perceiver": { "hf-internal-testing/tiny-random-language_perceiver": ["fill-mask", "text-classification"],