Support Phi-4 and Phi-4-multimodal-instruct in LLM text-generation comps on Gaudi

Add support for Phi-4-related models with Optimum Habana

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
XinyaoWa committed Feb 27, 2025
1 parent c70f868 commit bce8e1a
Showing 17 changed files with 13,237 additions and 5 deletions.
35 changes: 35 additions & 0 deletions comps/llms/deployment/docker_compose/compose_text-generation.yaml
@@ -45,6 +45,29 @@ services:
- SYS_NICE
restart: unless-stopped

textgen-phi4-gaudi:
image: ${REGISTRY:-opea}/llm-textgen-phi4-gaudi:${TAG:-latest}
container_name: llm-textgen-phi4-gaudi-server
ports:
- ${TEXTGEN_PORT:-9000}:9000
volumes:
- "${DATA_PATH:-./data}:/data"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HF_TOKEN: ${HF_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
TOKENIZERS_PARALLELISM: False
LOGFLAG: ${LOGFLAG:-False}
runtime: habana
cap_add:
- SYS_NICE
restart: unless-stopped

textgen-service-tgi:
extends: textgen
container_name: textgen-service-tgi
@@ -100,6 +123,18 @@ services:
environment:
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

textgen-native-phi4-gaudi:
extends: textgen-phi4-gaudi
container_name: textgen-native-phi4-gaudi
environment:
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

textgen-native-phi4-multimodal-gaudi:
extends: textgen-phi4-gaudi
container_name: textgen-native-phi4-multimodal-gaudi
environment:
LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNativePhi4Multimodal}

networks:
default:
driver: bridge
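
As a usage sketch (not part of this commit): assuming the compose file is run from comps/llms/deployment/docker_compose/, the model weights already sit under ${DATA_PATH} (mounted at /data inside the container), and LLM_MODEL_ID points at that in-container path, the new Phi-4 service could be brought up with something like:

    export DATA_PATH=./data
    export LLM_MODEL_ID=/data/Phi-4-mini-instruct   # hypothetical local path; entrypoint_phi4.sh expects a model path
    export HF_TOKEN=<your_huggingface_token>
    docker compose -f compose_text-generation.yaml up -d textgen-native-phi4-gaudi

Swapping the service name for textgen-native-phi4-multimodal-gaudi selects the OpeaTextGenNativePhi4Multimodal component instead.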
2 changes: 1 addition & 1 deletion comps/llms/src/text-generation/Dockerfile.intel_hpu
@@ -29,7 +29,7 @@ RUN git clone ${REPO} /home/user/optimum-habana && \
cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
- pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.23.5
+ pip install --no-cache-dir --upgrade --force-reinstall pydantic

ENV PYTHONPATH=/root:/home/user

32 changes: 32 additions & 0 deletions comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu

ENV LANG=en_US.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev

RUN mkdir -p /home/user

RUN git lfs install

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
pip install --no-cache-dir git+/~https://github.com/HabanaAI/DeepSpeed.git@1.19.0

RUN pip install git+/~https://github.com/huggingface/optimum-habana.git@transformers_future && \
cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
pip install soundfile peft backoff

ENV PYTHONPATH=/root:/home/user

WORKDIR /home/user/comps/llms/src/text-generation/

ENTRYPOINT ["bash", "entrypoint_phi4.sh"]
30 changes: 30 additions & 0 deletions comps/llms/src/text-generation/entrypoint_phi4.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# LLM_MODEL_ID must be a model path
llm_name=$LLM_MODEL_ID
WORKPATH="/home/user/comps/llms/src/text-generation/"

if [[ $llm_name == *"Phi-4-multimodal-instruct"* ]]; then
cd $WORKPATH
echo -e "Applying patches for the multimodal model"
cp patch/phi4-multimodal-patch/*.py $llm_name/
export PT_HPU_LAZY_MODE=1
elif [[ $llm_name == *"Phi-4-mini-instruct"* ]]; then
cd $WORKPATH
git clone -b transformers_future /~https://github.com/huggingface/optimum-habana
cd optimum-habana
cp ../patch/optimum-habana-phi4.patch .
git apply optimum-habana-phi4.patch
pip install -e .
cd examples/text-generation/
pip install -r requirements.txt
cd phi-4-mini-instruct/
bash ./01-patch-transformer.sh
fi

cd $WORKPATH
python opea_llm_microservice.py
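
Because LLM_MODEL_ID must be a local model path (the multimodal branch copies patch files straight into that directory), the weights have to be fetched beforehand. One possible way to do that, sketched under the assumption that huggingface-cli is available and ${DATA_PATH} is the host directory the compose file mounts at /data:

    # download onto the host directory that is mounted at /data
    huggingface-cli download microsoft/Phi-4-multimodal-instruct \
        --local-dir ${DATA_PATH:-./data}/Phi-4-multimodal-instruct
    # the same path as seen from inside the container
    export LLM_MODEL_ID=/data/Phi-4-multimodal-instruct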
23 changes: 20 additions & 3 deletions comps/llms/src/text-generation/integrations/native.py
@@ -48,8 +48,8 @@
"device": "hpu",
"model_name_or_path": MODEL_NAME,
"bf16": True,
"max_new_tokens": 100,
"max_input_tokens": 0,
"max_new_tokens": 32,
"max_input_tokens": 128,
"batch_size": 1,
"warmup": 3,
"n_iterations": 5,
@@ -105,6 +105,21 @@
"penalty_alpha": None,
}

if "Phi-4-mini-instruct" in MODEL_NAME:
args_dict_phi4 = {
"use_kv_cache": False,
"attn_softmax_bf16": True,
"limit_hpu_graphs": True,
"use_flash_attention": True,
"flash_attention_recompute": True,
"flash_attention_causal_mask": True,
"flash_attention_fast_softmax": True,
}
args_dict.update(args_dict_phi4)

if logflag:
logger.info(args_dict)


class Args:
def __init__(self, **entries):
@@ -123,6 +138,7 @@ def __init__(self, **entries):
def generate(
input_query: list,
device="hpu",
max_new_tokens=32,
use_lazy_mode=True,
use_hpu_graphs=True,
profiling_steps=0,
@@ -159,6 +175,7 @@ def generate(
**input_tokens,
generation_config=generation_config,
assistant_model=assistant_model,
max_new_tokens=max_new_tokens,
lazy_mode=use_lazy_mode,
hpu_graphs=use_hpu_graphs,
profiling_steps=profiling_steps,
@@ -262,7 +279,7 @@ async def invoke(self, input: ChatCompletionRequest):
else:
if input.documents:
prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
- res = generate([prompt])
+ res = generate([prompt], max_new_tokens=input.max_tokens)

if logflag:
logger.info(f"[llm - native] inference result: {res}")
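
With input.max_tokens now forwarded to generate() (whose new default is 32), the output length can be set per request. A request sketch, assuming the microservice listens on the default TEXTGEN_PORT 9000 and exposes the usual OPEA /v1/chat/completions route:

    curl http://localhost:9000/v1/chat/completions \
        -X POST -H 'Content-Type: application/json' \
        -d '{"messages": "What is deep learning?", "max_tokens": 64}'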
206 changes: 206 additions & 0 deletions comps/llms/src/text-generation/integrations/native_phi4_multimodal.py
@@ -0,0 +1,206 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

sys.path.append("/test/GenAIComps/")

import os
import threading
import time

import habana_frameworks.torch.core as htcore
import soundfile
import torch
from langchain_core.prompts import PromptTemplate
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

from comps import CustomLogger, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.cores.proto.api_protocol import ChatCompletionRequest

from .template import ChatTemplate

logger = CustomLogger("opea_textgen_native_phi_multimodal")
logflag = os.getenv("LOGFLAG", False)

MODEL_NAME = os.getenv("LLM_MODEL_ID", "microsoft/Phi-4-multimodal-instruct")

model = None
processor = None
generation_config = None
initialization_lock = threading.Lock()
initialized = False

kwargs = {}
kwargs["torch_dtype"] = torch.bfloat16

user_prompt = "<|user|>"
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"
IMAGE_SPECIAL = "<|endoftext10|>"
AUDIO_SPECIAL = "<|endoftext11|>"
sample_prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
if logflag:
logger.info(f">>> Prompt\n{sample_prompt}")

generation_config = GenerationConfig.from_pretrained(MODEL_NAME, "generation_config.json")

# generation_config.max_new_tokens = args.max_new_tokens
# generation_config.use_cache = args.use_kv_cache
generation_config.static_shapes = False # There's a list of models optimized with static shapes
generation_config.bucket_size = -1
generation_config.bucket_internal = False
# generation_config.do_sample = args.do_sample
# generation_config.num_beams = args.num_beams
# generation_config.top_k = args.top_k
# generation_config.penalty_alpha = args.penalty_alpha
# generation_config.bad_words_ids = bad_words_ids
# generation_config.force_words_ids = force_words_ids
# generation_config.num_return_sequences = args.num_return_sequences
generation_config.trim_logits = True
generation_config.attn_softmax_bf16 = False
generation_config.limit_hpu_graphs = False
generation_config.clear_hpu_graphs_cache = False
generation_config.reuse_cache = False
generation_config.reduce_recompile = False
# if generation_config.reduce_recompile:
# assert generation_config.bucket_size > 0
generation_config.use_flash_attention = False
generation_config.flash_attention_recompute = False
generation_config.flash_attention_causal_mask = False
generation_config.flash_attention_fast_softmax = False
# generation_config.trust_remote_code = args.trust_remote_code
generation_config.valid_sequence_lengths = None
generation_config.attn_batch_split = False
generation_config.ignore_eos = None


def generate(
query,
max_tokens=128,
):
"""Generates sequences from the input sentences and returns them."""
logger.info(f"[llm - generate] starting to inference with prompt {query}")
inputs = processor(query, images=None, return_tensors="pt").to("hpu:0")

generate_ids = model.generate(
**inputs,
max_new_tokens=max_tokens,
generation_config=generation_config,
)
generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

if logflag:
logger.info(response)
print(f">>> Response\n{response}")

return response


def initialize():
global model, processor, generation_config, initialized
with initialization_lock:
if not initialized:
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
trust_remote_code=True,
torch_dtype="auto",
_attn_implementation="sdpa",
)
model = model.to("hpu")
if logflag:
logger.info(processor.tokenizer)
logger.info(f"model.config._attn_implementation: {model.config._attn_implementation}")
logger.info("[llm] model and processor initialized.")

# This must run after the model is downloaded: the model's custom remote code has to be loaded first so that Optimum Habana (OH) can apply its override functions
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

adapt_transformers_to_gaudi()

logger.info("[llm - native] Ready to inference")
res = generate(sample_prompt)
logger.info(f"[llm - native] test result: {res}")
initialized = True


@OpeaComponentRegistry.register("OpeaTextGenNativePhi4Multimodal")
class OpeaTextGenNativePhi4Multimodal(OpeaComponent):
"""A specialized OPEA TextGen component derived from OpeaComponent for interacting with LLM services based on native optimum habana."""

def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LLM.name.lower(), description, config)
initialize()
health_status = self.check_health()
if not health_status:
logger.error("OpeaTextGenNativePhi4Multimodal health check failed.")
else:
logger.info("OpeaTextGenNativePhi4Multimodal health check success.")

def check_health(self) -> bool:
"""Checks the health of the LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""

try:
return initialized
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False

async def invoke(self, input: ChatCompletionRequest):
"""Invokes the LLM service to generate output for the provided input.
Args:
input (ChatCompletionRequest): The input text(s).
"""

message = None
if isinstance(input.messages, str):
message = input.messages
else: # List[Dict]
for input_data in input.messages:
if "role" in input_data and input_data["role"] == "user" and "content" in input_data:
message = input_data["content"]
if logflag:
logger.info(f"Get input text:\n {message}")
if message is None:
logger.error("Don't receive any input text, exit!")
return GeneratedDoc(text=None, prompt=None)

prompt = message
prompt_template = None
if input.chat_template:
prompt_template = PromptTemplate.from_template(input.chat_template)
input_variables = prompt_template.input_variables
if prompt_template:
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=message, context="\n".join(input.documents))
elif input_variables == ["question"]:
prompt = prompt_template.format(question=message)
else:
logger.info(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
else:
if input.documents:
prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
res = generate(prompt, input.max_tokens)

if logflag:
logger.info(f"[llm - native] inference result: {res}")
return GeneratedDoc(text=res, prompt=message)
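
invoke() also accepts an OpenAI-style list of role/content messages (the content of the last user turn is used), and this code path is text-only since the processor is called with images=None. Under the same port and route assumptions as the earlier curl sketch:

    curl http://localhost:9000/v1/chat/completions \
        -X POST -H 'Content-Type: application/json' \
        -d '{"messages": [{"role": "user", "content": "What is the answer for 1+1? Explain it."}], "max_tokens": 128}'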
2 changes: 2 additions & 0 deletions comps/llms/src/text-generation/opea_llm_microservice.py
@@ -28,6 +28,8 @@

if llm_component_name == "OpeaTextGenNative":
from integrations.native import OpeaTextGenNative
elif llm_component_name == "OpeaTextGenNativePhi4Multimodal":
from integrations.native_phi4_multimodal import OpeaTextGenNativePhi4Multimodal
elif llm_component_name == "OpeaTextGenBedrock":
from integrations.bedrock import OpeaTextGenBedrock
else:
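
For reference, a sketch of selecting the new component when launching the microservice directly (the entrypoint script additionally applies the multimodal patches and sets PT_HPU_LAZY_MODE=1 first; the model path below is a hypothetical local directory):

    export LLM_COMPONENT_NAME=OpeaTextGenNativePhi4Multimodal
    export LLM_MODEL_ID=/data/Phi-4-multimodal-instruct
    python opea_llm_microservice.py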