Support Phi-4 and Phi-4-multimodal-instruct in LLM text-generation comps on Gaudi

Add support for Phi-4-related models with Optimum Habana

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
XinyaoWa committed Feb 27, 2025
1 parent c70f868 commit bce8e1a
Showing 17 changed files with 13,237 additions and 5 deletions.
35 changes: 35 additions & 0 deletions comps/llms/deployment/docker_compose/compose_text-generation.yaml
@@ -45,6 +45,29 @@ services:
- SYS_NICE
restart: unless-stopped

textgen-phi4-gaudi:
image: ${REGISTRY:-opea}/llm-textgen-phi4-gaudi:${TAG:-latest}
container_name: llm-textgen-phi4-gaudi-server
ports:
- ${TEXTGEN_PORT:-9000}:9000
volumes:
- "${DATA_PATH:-./data}:/data"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HF_TOKEN: ${HF_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
TOKENIZERS_PARALLELISM: False
LOGFLAG: ${LOGFLAG:-False}
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped

textgen-service-tgi:
extends: textgen
container_name: textgen-service-tgi
@@ -100,6 +123,18 @@ services:
environment:
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

textgen-native-phi4-gaudi:
extends: textgen-phi4-gaudi
container_name: textgen-native-phi4-gaudi
environment:
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

textgen-native-phi4-multimodal-gaudi:
extends: textgen-phi4-gaudi
container_name: textgen-native-phi4-multimodal-gaudi
environment:
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNativePhi4Multimodal}

networks:
default:
driver: bridge
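
As a usage sketch (not part of this commit): assuming the compose file is run from comps/llms/deployment/docker_compose/, the model weights already sit under ${DATA_PATH} (mounted at /data inside the container), and LLM_MODEL_ID points at that in-container path, the new Phi-4 service could be brought up with something like:

    export DATA_PATH=./data
    export LLM_MODEL_ID=/data/Phi-4-mini-instruct   # hypothetical local path; entrypoint_phi4.sh expects a model path
    export HF_TOKEN=<your_huggingface_token>
    docker compose -f compose_text-generation.yaml up -d textgen-native-phi4-gaudi

Swapping the service name for textgen-native-phi4-multimodal-gaudi selects the OpeaTextGenNativePhi4Multimodal component instead.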
2 changes: 1 addition & 1 deletion comps/llms/src/text-generation/Dockerfile.intel_hpu
@@ -29,7 +29,7 @@ RUN git clone ${REPO} /home/user/optimum-habana && \
cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
- pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.23.5
+ pip install --no-cache-dir --upgrade --force-reinstall pydantic

ENV PYTHONPATH=/root:/home/user

32 changes: 32 additions & 0 deletions comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu

ENV LANG=en_US.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev

RUN mkdir -p /home/user

RUN git lfs install

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
pip install --no-cache-dir git+/~https://github.com/HabanaAI/DeepSpeed.git@1.19.0

RUN pip install git+/~https://github.com/huggingface/optimum-habana.git@transformers_future && \
cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
pip install soundfile peft backoff

ENV PYTHONPATH=/root:/home/user

WORKDIR /home/user/comps/llms/src/text-generation/

ENTRYPOINT ["bash", "entrypoint_phi4.sh"]
30 changes: 30 additions & 0 deletions comps/llms/src/text-generation/entrypoint_phi4.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# LLM_MODEL_ID must be a model path
llm_name=$LLM_MODEL_ID
WORKPATH="/home/user/comps/llms/src/text-generation/"

if [[ $llm_name == *"Phi-4-multimodal-instruct"* ]]; then
cd $WORKPATH
echo -e "Applying patches for the multimodal model"
cp patch/phi4-multimodal-patch/*.py $llm_name/
export PT_HPU_LAZY_MODE=1
elif [[ $llm_name == *"Phi-4-mini-instruct"* ]]; then
cd $WORKPATH
git clone -b transformers_future /~https://github.com/huggingface/optimum-habana
cd optimum-habana
cp ../patch/optimum-habana-phi4.patch .
git apply optimum-habana-phi4.patch
pip install -e .
cd examples/text-generation/
pip install -r requirements.txt
cd phi-4-mini-instruct/
bash ./01-patch-transformer.sh
fi

cd $WORKPATH
python opea_llm_microservice.py
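
Because LLM_MODEL_ID must be a local model path (the multimodal branch copies patch files straight into that directory), the weights have to be fetched beforehand. One possible way to do that, sketched under the assumption that huggingface-cli is available and ${DATA_PATH} is the host directory the compose file mounts at /data:

    # download onto the host directory that is mounted at /data
    huggingface-cli download microsoft/Phi-4-multimodal-instruct \
        --local-dir ${DATA_PATH:-./data}/Phi-4-multimodal-instruct
    # the same path as seen from inside the container
    export LLM_MODEL_ID=/data/Phi-4-multimodal-instruct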
23 changes: 20 additions & 3 deletions comps/llms/src/text-generation/integrations/native.py
@@ -48,8 +48,8 @@
"device": "hpu",
"model_name_or_path": MODEL_NAME,
"bf16": True,
"max_new_tokens": 100,
"max_input_tokens": 0,
"max_new_tokens": 32,
"max_input_tokens": 128,
"batch_size": 1,
"warmup": 3,
"n_iterations": 5,
@@ -105,6 +105,21 @@
"penalty_alpha": None,
}

if "Phi-4-mini-instruct" in MODEL_NAME:
args_dict_phi4 = {
"use_kv_cache": False,
"attn_softmax_bf16": True,
"limit_hpu_graphs": True,
"use_flash_attention": True,
"flash_attention_recompute": True,
"flash_attention_causal_mask": True,
"flash_attention_fast_softmax": True,
}
args_dict.update(args_dict_phi4)

if logflag:
logger.info(args_dict)


class Args:
def __init__(self, **entries):
@@ -123,6 +138,7 @@ def __init__(self, **entries):
def generate(
input_query: list,
device="hpu",
max_new_tokens=32,
use_lazy_mode=True,
use_hpu_graphs=True,
profiling_steps=0,
@@ -159,6 +175,7 @@ def generate(
**input_tokens,
generation_config=generation_config,
assistant_model=assistant_model,
max_new_tokens=max_new_tokens,
lazy_mode=use_lazy_mode,
hpu_graphs=use_hpu_graphs,
profiling_steps=profiling_steps,
@@ -262,7 +279,7 @@ async def invoke(self, input: ChatCompletionRequest):
else:
if input.documents:
prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
- res = generate([prompt])
+ res = generate([prompt], max_new_tokens=input.max_tokens)

if logflag:
logger.info(f"[llm - native] inference result: {res}")
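
With input.max_tokens now forwarded to generate() (whose new default is 32), the output length can be set per request. A request sketch, assuming the microservice listens on the default TEXTGEN_PORT 9000 and exposes the usual OPEA /v1/chat/completions route:

    curl http://localhost:9000/v1/chat/completions \
        -X POST -H 'Content-Type: application/json' \
        -d '{"messages": "What is deep learning?", "max_tokens": 64}'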
206 changes: 206 additions & 0 deletions comps/llms/src/text-generation/integrations/native_phi4_multimodal.py
@@ -0,0 +1,206 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

sys.path.append("/test/GenAIComps/")

import os
import threading
import time

import habana_frameworks.torch.core as htcore
import soundfile
import torch
from langchain_core.prompts import PromptTemplate
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

from comps import CustomLogger, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.cores.proto.api_protocol import ChatCompletionRequest

from .template import ChatTemplate

logger = CustomLogger("opea_textgen_native_phi_multimodal")
logflag = os.getenv("LOGFLAG", False)

MODEL_NAME = os.getenv("LLM_MODEL_ID", "microsoft/Phi-4-multimodal-instruct")

model = None
processor = None
generation_config = None
initialization_lock = threading.Lock()
initialized = False

kwargs = {}
kwargs["torch_dtype"] = torch.bfloat16

user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"
IMAGE_SPECIAL = "<|endoftext10|>"
AUDIO_SPECIAL = "<|endoftext11|>"
sample_prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
if logflag:
logger.info(f">>> Prompt\n{sample_prompt}")

generation_config = GenerationConfig.from_pretrained(MODEL_NAME, "generation_config.json")

# generation_config.max_new_tokens = args.max_new_tokens
# generation_config.use_cache = args.use_kv_cache
generation_config.static_shapes = False # There's a list of models optimized with static shapes
generation_config.bucket_size = -1
generation_config.bucket_internal = False
# generation_config.do_sample = args.do_sample
# generation_config.num_beams = args.num_beams
# generation_config.top_k = args.top_k
# generation_config.penalty_alpha = args.penalty_alpha
# generation_config.bad_words_ids = bad_words_ids
# generation_config.force_words_ids = force_words_ids
# generation_config.num_return_sequences = args.num_return_sequences
generation_config.trim_logits = True
generation_config.attn_softmax_bf16 = False
generation_config.limit_hpu_graphs = False
generation_config.clear_hpu_graphs_cache = False
generation_config.reuse_cache = False
generation_config.reduce_recompile = False
# if generation_config.reduce_recompile:
# assert generation_config.bucket_size > 0
generation_config.use_flash_attention = False
generation_config.flash_attention_recompute = False
generation_config.flash_attention_causal_mask = False
generation_config.flash_attention_fast_softmax = False
# generation_config.trust_remote_code = args.trust_remote_code
generation_config.valid_sequence_lengths = None
generation_config.attn_batch_split = False
generation_config.ignore_eos = None


def generate(
query,
max_tokens=128,
):
"""Generates sequences from the input sentences and returns them."""
logger.info(f"[llm - generate] starting to inference with prompt {query}")
inputs = processor(query, images=None, return_tensors="pt").to("hpu:0")

generate_ids = model.generate(
**inputs,
max_new_tokens=max_tokens,
generation_config=generation_config,
)
generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

if logflag:
logger.info(response)
print(f">>> Response\n{response}")

return response


def initialize():
global model, processor, generation_config, initialized
with initialization_lock:
if not initialized:
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype="auto",
_attn_implementation="sdpa",
)
model = model.to("hpu")
if logflag:
logger.info(processor.tokenizer)
logger.info(f"model.config._attn_implementation: {model.config._attn_implementation}")
logger.info("[llm] model and processor initialized.")

# This must run after the model is downloaded: the model's custom remote code has to be loaded first so that Optimum Habana (OH) can apply its override functions
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()

logger.info("[llm - native] Ready to inference")
res = generate(sample_prompt)
logger.info(f"[llm - native] test result: {res}")
initialized = True


@OpeaComponentRegistry.register("OpeaTextGenNativePhi4Multimodal")
class OpeaTextGenNativePhi4Multimodal(OpeaComponent):
"""A specialized OPEA TextGen component derived from OpeaComponent for interacting with LLM services based on native optimum habana."""

def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LLM.name.lower(), description, config)
initialize()
health_status = self.check_health()
if not health_status:
logger.error("OpeaTextGenNativePhi4Multimodal health check failed.")
else:
logger.info("OpeaTextGenNativePhi4Multimodal health check success.")

def check_health(self) -> bool:
"""Checks the health of the LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""

try:
return initialized
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False

async def invoke(self, input: ChatCompletionRequest):
"""Invokes the LLM service to generate output for the provided input.
Args:
input (ChatCompletionRequest): The input text(s).
"""

message = None
if isinstance(input.messages, str):
message = input.messages
else: # List[Dict]
for input_data in input.messages:
if "role" in input_data and input_data["role"] == "user" and "content" in input_data:
message = input_data["content"]
if logflag:
logger.info(f"Get input text:\n {message}")
if message is None:
logger.error("Don't receive any input text, exit!")
return GeneratedDoc(text=None, prompt=None)

prompt = message
prompt_template = None
if input.chat_template:
prompt_template = PromptTemplate.from_template(input.chat_template)
input_variables = prompt_template.input_variables
if prompt_template:
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=message, context="\n".join(input.documents))
elif input_variables == ["question"]:
prompt = prompt_template.format(question=message)
else:
logger.info(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
else:
if input.documents:
prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
res = generate(prompt, input.max_tokens)

if logflag:
logger.info(f"[llm - native] inference result: {res}")
return GeneratedDoc(text=res, prompt=message)
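
invoke() also accepts an OpenAI-style list of role/content messages (the content of the last user turn is used), and this code path is text-only since the processor is called with images=None. Under the same port and route assumptions as the earlier curl sketch:

    curl http://localhost:9000/v1/chat/completions \
        -X POST -H 'Content-Type: application/json' \
        -d '{"messages": [{"role": "user", "content": "What is the answer for 1+1? Explain it."}], "max_tokens": 128}'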
2 changes: 2 additions & 0 deletions comps/llms/src/text-generation/opea_llm_microservice.py
@@ -28,6 +28,8 @@

if llm_component_name == "OpeaTextGenNative":
from integrations.native import OpeaTextGenNative
elif llm_component_name == "OpeaTextGenNativePhi4Multimodal":
from integrations.native_phi4_multimodal import OpeaTextGenNativePhi4Multimodal
elif llm_component_name == "OpeaTextGenBedrock":
from integrations.bedrock import OpeaTextGenBedrock
else:
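
For reference, a sketch of selecting the new component when launching the microservice directly (the entrypoint script additionally applies the multimodal patches and sets PT_HPU_LAZY_MODE=1 first; the model path below is a hypothetical local directory):

    export LLM_COMPONENT_NAME=OpeaTextGenNativePhi4Multimodal
    export LLM_MODEL_ID=/data/Phi-4-multimodal-instruct
    python opea_llm_microservice.py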