Enhance LLM text-generation native mode on Gaudi
Add support for the enhanced native model in the text-generation microservice

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
XinyaoWa committed Feb 26, 2025
1 parent c70f868 commit 46a4f98
Showing 14 changed files with 13,383 additions and 0 deletions.
35 changes: 35 additions & 0 deletions comps/llms/deployment/docker_compose/compose_text-generation.yaml
@@ -45,6 +45,29 @@ services:
      - SYS_NICE
    restart: unless-stopped

  textgen-gaudi-enhance:
    image: ${REGISTRY:-opea}/llm-textgen-gaudi-enhance:${TAG:-latest}
    container_name: llm-textgen-gaudi-enhance-server
    ports:
      - ${TEXTGEN_PORT:-9000}:9000
    volumes:
      - "${DATA_PATH:-./data}:/data"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      LLM_MODEL_ID: ${LLM_MODEL_ID}
      HF_TOKEN: ${HF_TOKEN}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      TOKENIZERS_PARALLELISM: False
      LOGFLAG: ${LOGFLAG:-False}
    runtime: habana
    cap_add:
      - SYS_NICE
    restart: unless-stopped

  textgen-service-tgi:
    extends: textgen
    container_name: textgen-service-tgi
@@ -100,6 +123,18 @@ services:
    environment:
      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}

  textgen-native-gaudi-enhance:
    extends: textgen-gaudi-enhance
    container_name: textgen-native-gaudi-enhance
    environment:
      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNativeEnhance}

  textgen-native-gaudi-enhance-multimodal:
    extends: textgen-gaudi-enhance
    container_name: textgen-native-gaudi-enhance-multimodal
    environment:
      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNativeEnhanceMultimodal}

networks:
  default:
    driver: bridge
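
For reference, a minimal launch sketch for the new enhanced service (the compose file path and service names come from the diff above; the model path, token, and port values below are placeholders, not part of this commit):

    export LLM_MODEL_ID=/data/your-model-dir        # hypothetical local model path visible inside the container
    export HF_TOKEN=<your HuggingFace token>
    export TEXTGEN_PORT=9000
    cd comps/llms/deployment/docker_compose
    docker compose -f compose_text-generation.yaml up -d textgen-native-gaudi-enhance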
32 changes: 32 additions & 0 deletions comps/llms/src/text-generation/Dockerfile.intel_hpu_enhance
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu

ENV LANG=en_US.UTF-8

RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
    git-lfs \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN mkdir -p /home/user

RUN git lfs install

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
    pip install --no-cache-dir git+/~https://github.com/HabanaAI/DeepSpeed.git@1.19.0

RUN pip install --no-cache-dir git+/~https://github.com/huggingface/optimum-habana.git@transformers_future && \
    cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir soundfile peft backoff

ENV PYTHONPATH=/root:/home/user

WORKDIR /home/user/comps/llms/src/text-generation/

ENTRYPOINT ["bash", "entrypoint_enhance.sh"]
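
A build sketch for this image, assuming the build context is the GenAIComps repository root (the Dockerfile copies comps/, and the tag mirrors the image name referenced in the compose file above):

    cd GenAIComps
    docker build \
        -f comps/llms/src/text-generation/Dockerfile.intel_hpu_enhance \
        -t opea/llm-textgen-gaudi-enhance:latest .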
30 changes: 30 additions & 0 deletions comps/llms/src/text-generation/entrypoint_enhance.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# LLM_MODEL_ID must be a local model path
llm_name=$LLM_MODEL_ID
WORKPATH="/home/user/comps/llms/src/text-generation/"

if [[ $llm_name == *"phi-4-multimodel"* ]]; then
    cd $WORKPATH
    echo -e "Applying patches for the multimodal model"
    cp patch/enhance-multimodal-patch/*.py $llm_name/
    export PT_HPU_LAZY_MODE=1
elif [[ $llm_name == *"phi-4"* ]]; then
    cd $WORKPATH
    git clone -b transformers_future /~https://github.com/huggingface/optimum-habana
    cd optimum-habana
    cp ../patch/optimum-habana-enhance.patch .
    git apply optimum-habana-enhance.patch
    pip install -e .
    cd examples/text-generation/
    pip install -r requirements.txt
    cd phi-4-mini-instruct/
    bash ./01-patch-transformer.sh
fi

cd $WORKPATH
python opea_llm_microservice.py
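
The script branches only on substrings of LLM_MODEL_ID, so the variable is expected to name a local model directory. The paths below are illustrative examples, not values defined by this commit:

    # Multimodal branch (patch files are copied into the model directory):
    #   export LLM_MODEL_ID=/data/phi-4-multimodel-instruct
    # Text-only branch (optimum-habana is cloned and patched before serving):
    #   export LLM_MODEL_ID=/data/phi-4-mini-instruct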
269 changes: 269 additions & 0 deletions comps/llms/src/text-generation/integrations/native_enhance.py
@@ -0,0 +1,269 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import threading
import time

import torch
from langchain_core.prompts import PromptTemplate

from comps import CustomLogger, GeneratedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.cores.proto.api_protocol import ChatCompletionRequest

from .template import ChatTemplate
from .utils import initialize_model

logger = CustomLogger("opea_textgen_native")
logflag = os.getenv("LOGFLAG", False)

MODEL_NAME = os.getenv("LLM_MODEL_ID", "phi/phi-4")

input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
    "Everyone is happy and I can",
    "The new movie that got Oscar this year",
    "In the far far distance from our galaxy,",
    "Peace is the only way",
]

args_dict = {
    "device": "hpu",
    "model_name_or_path": MODEL_NAME,
    "bf16": True,
    "max_new_tokens": 128,
    "max_input_tokens": 128,
    "batch_size": 1,
    "warmup": 3,
    "n_iterations": 1,
    "local_rank": 0,
    "use_kv_cache": False,
    "use_hpu_graphs": True,
    "dataset_name": None,
    "column_name": None,
    "do_sample": False,
    "num_beams": 1,
    "trim_logits": False,
    "seed": 27,
    "profiling_warmup_steps": 0,
    "profiling_steps": 0,
    "profiling_record_shapes": False,
    "prompt": None,
    "bad_words": None,
    "force_words": None,
    "assistant_model": None,
    "peft_model": None,
    "num_return_sequences": 1,
    "token": None,
    "model_revision": "main",
    "attn_softmax_bf16": True,
    "output_dir": None,
    "bucket_size": -1,
    "bucket_internal": False,
    "dataset_max_samples": -1,
    "limit_hpu_graphs": True,
    "reuse_cache": False,
    "verbose_workers": False,
    "simulate_dyn_prompt": None,
    "reduce_recompile": False,
    "use_flash_attention": True,
    "flash_attention_recompute": True,
    "flash_attention_causal_mask": True,
    "flash_attention_fast_softmax": False,
    "book_source": False,
    "torch_compile": False,
    "ignore_eos": True,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": None,
    "const_serialization_path": None,
    "disk_offload": False,
    "trust_remote_code": False,
    "quant_config": "",
    "world_size": 0,
    "show_graphs_count": False,
    "load_quantized_model_with_inc": False,
    "local_quantized_inc_model_path": None,
    "load_quantized_model_with_autogptq": False,
    "penalty_alpha": None,
}


class Args:
    def __init__(self, **entries):
        self.__dict__.update(entries)


model = None
assistant_model = None
tokenizer = None
generation_config = None
args = Args(**args_dict)
initialization_lock = threading.Lock()
initialized = False


def generate(
    input_query: list,
    device="hpu",
    use_lazy_mode=True,
    use_hpu_graphs=True,
    profiling_steps=0,
    profiling_warmup_steps=0,
    ignore_eos=True,
    profiling_record_shapes=False,
):
    """Generates sequences from the input sentences and returns them."""
    logger.info(f"[llm - generate] starting inference with prompt {input_query}")
    encode_t0 = time.perf_counter()

    # Tokenization
    input_tokens = tokenizer.batch_encode_plus(
        input_query,
        return_tensors="pt",
        padding=True,
        return_token_type_ids=False,  # token_type_ids are not needed for Falcon-3 models
    )
    encode_duration = time.perf_counter() - encode_t0
    logger.info(f"[llm - generate] input tokenized: {input_tokens}")

    # Move inputs to target device(s)
    for t in input_tokens:
        logger.info(f"[llm - generate] t: {t}")
        if torch.is_tensor(input_tokens[t]):
            logger.info("[llm - generate] input[t] is tensor")
            logger.info(f"[llm - generate] device: {model.device}")
            input_tokens[t] = input_tokens[t].to(model.device)

    logger.info("[llm - generate] inputs transferred.")

    iteration_times = []
    outputs = model.generate(
        **input_tokens,
        generation_config=generation_config,
        assistant_model=assistant_model,
        lazy_mode=use_lazy_mode,
        hpu_graphs=use_hpu_graphs,
        profiling_steps=profiling_steps,
        profiling_warmup_steps=profiling_warmup_steps,
        ignore_eos=ignore_eos,
        iteration_times=iteration_times,
        profiling_record_shapes=profiling_record_shapes,
    ).cpu()
    logger.info("[llm - generate] result generated")
    first_token_time = iteration_times[0] + encode_duration
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    logger.info(f"[llm - generate] result: {result}")
    logger.info(f"[llm - generate] Time to first token = {first_token_time*1000}ms")
    return result


def initialize():
    global model, assistant_model, tokenizer, generation_config, initialized
    with initialization_lock:
        if not initialized:
            # Initialize the model and tokenizer
            import habana_frameworks.torch.hpu as torch_hpu
            from optimum.habana.utils import HabanaProfile

            model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
            logger.info("[llm] model and tokenizer initialized.")

            # Graph compilation and model warm-up
            HabanaProfile.disable()
            logger.info("[llm - native] Graph compilation...")
            for _ in range(args.warmup):
                generate(input_sentences)
            logger.info("[llm - native] model warm-up finished.")
            torch_hpu.synchronize()
            HabanaProfile.enable()
            logger.info("[llm - native] Ready for inference")
            res = generate(["What is Deep Learning?"])
            logger.info(f"[llm - native] test result: {res}")
            initialized = True


@OpeaComponentRegistry.register("OpeaTextGenNativeEnhance")
class OpeaTextGenNativeEnhance(OpeaComponent):
    """A specialized OPEA TextGen component derived from OpeaComponent for interacting with LLM services based on native Optimum Habana."""

    def __init__(self, name: str, description: str, config: dict = None):
        super().__init__(name, ServiceType.LLM.name.lower(), description, config)
        initialize()
        health_status = self.check_health()
        if not health_status:
            logger.error("OpeaTextGenNativeEnhance health check failed.")
        else:
            logger.info("OpeaTextGenNativeEnhance health check success.")

    def check_health(self) -> bool:
        """Checks the health of the LLM service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """
        try:
            return initialized
        except Exception as e:
            logger.error(e)
            logger.error("Health check failed")
            return False

    async def invoke(self, input: ChatCompletionRequest):
        """Invokes the LLM service to generate output for the provided input.

        Args:
            input (ChatCompletionRequest): The input text(s).
        """
        message = None
        if isinstance(input.messages, str):
            message = input.messages
        else:  # List[Dict]
            for input_data in input.messages:
                if "role" in input_data and input_data["role"] == "user" and "content" in input_data:
                    message = input_data["content"]
        if logflag:
            logger.info(f"Get input text:\n {message}")
        if message is None:
            logger.error("No input text received, exiting!")
            return GeneratedDoc(text=None, prompt=None)

        prompt = message
        prompt_template = None
        if input.chat_template:
            prompt_template = PromptTemplate.from_template(input.chat_template)
            input_variables = prompt_template.input_variables
        if prompt_template:
            if sorted(input_variables) == ["context", "question"]:
                prompt = prompt_template.format(question=message, context="\n".join(input.documents))
            elif input_variables == ["question"]:
                prompt = prompt_template.format(question=message)
            else:
                logger.info(f"{prompt_template} not used; only the input variables ['question', 'context'] are supported")
        else:
            if input.documents:
                prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
        res = generate([prompt])

        if logflag:
            logger.info(f"[llm - native] inference result: {res}")
        return GeneratedDoc(text=res[0], prompt=message)
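
Once the container is running, the component can be exercised with a request such as the one below. The /v1/chat/completions route and port 9000 follow the usual OPEA textgen microservice convention; treat the exact path and payload fields as assumptions if your deployment differs:

    curl http://localhost:9000/v1/chat/completions \
        -X POST \
        -H "Content-Type: application/json" \
        -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 128}'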