PR for text2graph extractor

opea-project · Feb 28, 2025 · 2a8b211 · 2a8b211
1 parent fb703f0
commit 2a8b211
Show file tree

Hide file tree

Showing 15 changed files with 1,087 additions and 0 deletions.
diff --git a/comps/cores/mega/constants.py b/comps/cores/mega/constants.py
@@ -34,6 +34,7 @@ class ServiceType(Enum):
     ANIMATION = 17
     IMAGE2IMAGE = 18
     TEXT2SQL = 19
+    TEXT2GRAPH = 20
 
 
 class MegaServiceEndpoint(Enum):

diff --git a/comps/text2graph/deployment/docker_compose/README.md b/comps/text2graph/deployment/docker_compose/README.md
diff --git a/comps/text2graph/deployment/docker_compose/compose.yaml b/comps/text2graph/deployment/docker_compose/compose.yaml
@@ -0,0 +1,34 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+#include:
+#  - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml
+
+services:
+  # text2graph service. The host port is configurable through TEXT2GRAPH_PORT
+  # (default 8090) and is mapped to container port 8090.
+  text2graph:
+    image: opea/text2graph:${TAG:-latest}
+    container_name: text2graph
+    ports:
+      - ${TEXT2GRAPH_PORT:-8090}:8090
+    environment:
+      - no_proxy=${no_proxy}
+      - https_proxy=${https_proxy}
+      - http_proxy=${http_proxy}
+      # The default is intentionally unquoted: in list-form `environment`
+      # entries quotes are not stripped and would become part of the model id.
+      - LLM_MODEL_ID=${LLM_MODEL_ID:-Babelscape/rebel-large}
+      - HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+    ipc: host
+    restart: always
+
+  # Gaudi variant of the text2graph service; it uses the same image.
+  text2graph-gaudi:
+    image: opea/text2graph:${TAG:-latest}
+    container_name: text2graph-gaudi-server
+    ports:
+      # NOTE(review): this service reads the same TEXT2GRAPH_PORT variable as
+      # the `text2graph` service, so setting it makes both publish the same
+      # host port. The container port here is 8080 while `text2graph` maps
+      # 8090 for the same image -- confirm which port the server listens on.
+      - ${TEXT2GRAPH_PORT:-9090}:8080
+    environment:
+      # Full URL of the TGI endpoint (e.g. http://<host>:<port>); it must be
+      # passed through unchanged, without a port suffix appended.
+      - TGI_LLM_ENDPOINT=${TGI_LLM_ENDPOINT}
+    # Disabled while the TGI compose `include` at the top of this file is
+    # commented out:
+    # depends_on:
+    #   - tgi-gaudi-server
+
+# The project's default network uses the bridge driver.
+networks:
+  default:
+    driver: bridge
diff --git a/comps/text2graph/deployment/docker_compose/compose.yaml_BACK b/comps/text2graph/deployment/docker_compose/compose.yaml_BACK
@@ -0,0 +1,29 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+include:
+  - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml
+
+# Backup variant of the compose file with the TGI `include` enabled.
+# NOTE(review): `tgi-server` and `tgi-gaudi-server` are not defined in this
+# file; presumably they come from the included TGI compose file -- confirm.
+services:
+  # text2graph service: publishes container port 8080 on host port
+  # TEXT2GRAPH_PORT (default 9090) and starts after `tgi-server`.
+  text2graph:
+    image: opea/text2graph:${TAG:-latest}
+    container_name: text2graph-server
+    ports:
+      - ${TEXT2GRAPH_PORT:-9090}:8080
+    environment:
+      - TGI_LLM_ENDPOINT=${TGI_LLM_ENDPOINT}
+    depends_on:
+      - tgi-server
+
+  # Gaudi variant: same image and same port mapping as `text2graph`, so the
+  # two services cannot both be started with the same host port.
+  text2graph-gaudi:
+    image: opea/text2graph:${TAG:-latest}
+    container_name: text2graph-gaudi-server
+    ports:
+      - ${TEXT2GRAPH_PORT:-9090}:8080
+    environment:
+      - TGI_LLM_ENDPOINT=${TGI_LLM_ENDPOINT}
+    depends_on:
+      - tgi-gaudi-server
+# The project's default network uses the bridge driver.
+networks:
+  default:
+    driver: bridge
diff --git a/comps/text2graph/deployment/docker_compose/start_services_with_compose.log b/comps/text2graph/deployment/docker_compose/start_services_with_compose.log
diff --git a/comps/text2graph/deployment/docker_compose/tmpdata/paul_graham_essay.txt b/comps/text2graph/deployment/docker_compose/tmpdata/paul_graham_essay.txt
diff --git a/comps/text2graph/src/Dockerfile b/comps/text2graph/src/Dockerfile
@@ -0,0 +1,48 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Runtime image for the text2graph microservice.
+# The build context must be the directory that contains `comps/`
+# (the repository root), because the sources are copied from there.
+FROM python:3.11-slim
+
+ENV LANG=C.UTF-8
+
+# Selects the PyTorch wheel index: "cpu" adds the CPU-only wheel index,
+# any other value installs from the default index only.
+ARG ARCH=cpu
+
+# Compiler toolchain, kept for pip packages that may need to build native
+# extensions. The apt lists are removed in the same layer so they do not
+# end up in the image.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends --fix-missing \
+        build-essential && \
+    rm -rf /var/lib/apt/lists/*
+
+# Unprivileged account the service runs as.
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+# COPY sources are resolved relative to the build context and cannot reach
+# outside of it, so the path is written context-relative.
+COPY comps /home/user/comps
+
+# ARCH is quoted so that an empty value does not break the shell test.
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ "${ARCH}" = "cpu" ]; then \
+        pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/text2graph/src/requirements.txt; \
+    else \
+        pip install --no-cache-dir -r /home/user/comps/text2graph/src/requirements.txt; \
+    fi
+
+# NOTE(review): these persist whatever proxy values are visible at build time
+# into the image; the compose file also passes them at runtime -- confirm
+# that baking them in is intended.
+ENV https_proxy=${https_proxy}
+ENV http_proxy=${http_proxy}
+ENV no_proxy=${no_proxy}
+
+# Extractor defaults; each can be overridden at container start.
+ENV LLM_ID=${LLM_ID:-"Babelscape/rebel-large"}
+ENV SPAN_LENGTH=${SPAN_LENGTH:-"1024"}
+ENV OVERLAP=${OVERLAP:-"100"}
+ENV MAX_LENGTH=${MAX_NEW_TOKENS:-"256"}
+
+# NOTE(review): HF_TOKEN is not declared as an ARG in this file, so both
+# variables below expand to an empty string; the real token is expected at
+# runtime (`docker run -e ...` or compose `environment:`). Do not turn it
+# into a build ARG -- build arguments are recorded in the image history.
+ENV HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN}
+ENV HF_TOKEN=${HF_TOKEN}
+
+# Kept in its own instruction: a variable set by ENV is only visible to
+# substitution in later instructions, not within the same one.
+ENV LLM_MODEL_ID=${LLM_ID}
+ENV TGI_PORT=8008
+
+# Makes the `comps` package importable. No previous PYTHONPATH exists in the
+# base image, so nothing is appended (appending would leave a trailing ':').
+ENV PYTHONPATH=/home/user/
+
+# Port the accompanying compose file and README use for the service.
+EXPOSE 8090
+
+USER user
+
+WORKDIR /home/user/comps/text2graph/src/
+
+# NOTE(review): variables exported by a script sourced inside RUN do not
+# persist beyond this layer; this step only has a lasting effect if the
+# script writes files. Confirm it is still needed.
+RUN bash -c 'source /home/user/comps/text2graph/src/setup_service_env.sh'
+
+ENTRYPOINT ["python", "opea_text2graph_microservice.py"]
diff --git a/comps/text2graph/src/README.md b/comps/text2graph/src/README.md
@@ -0,0 +1,109 @@
+# Text to graph triplet extractor
+
+Creating graphs from text means converting unstructured text into structured data, which is challenging.
+The task has gained significant traction with the advent of Large Language Models (LLMs), bringing it more into the mainstream.
+There are several approaches to extracting graph triplets using different types of LLMs.
+
+## Encoder-decoder models
+
+Encoder-decoder models such as REBEL, which is based on the BART model, are fine-tuned for relation extraction and classification tasks.
+The other approach is to use decoder-only models. Which approach works better depends on the application and the data source.
+Encoder-decoder models often achieve high performance on benchmarks due to their ability to encode contextual
+information effectively. They are suitable for tasks requiring detailed parsing of text into structured formats,
+such as knowledge graph construction from unstructured data.
+
+## Decoder-only models
+
+Decoder-only models are faster during inference as they skip the encoding step. This is ideal for tasks where the
+input-output mapping is simpler or where multitasking is required. They are suitable for generating outputs based on
+prompts or when computational efficiency is a priority. In certain cases, decoder-only models struggle with
+tasks requiring deep contextual understanding or when input-output structures are highly heterogeneous.
+
+This microservice provides an encoder-decoder architecture approach to graph triplet extraction.
+
+---
+# Features
+
+**Provide text input, and the service identifies the graph triplets and nodes.**
+
+## Implementation
+
+The text-to-graph microservice is able to extract graph triplets from unstructured text.
+
+#### 🚀 Start Microservice with Python (Option 1)
+
+#### Install Requirements
+```bash
+pip install -r requirements.txt
+```
+---
+### Environment variables: configure LLM parameters based on the model selected
+
+```bash
+export LLM_ID=${LLM_ID:-"Babelscape/rebel-large"}
+export SPAN_LENGTH=${SPAN_LENGTH:-"1024"}
+export OVERLAP=${OVERLAP:-"100"}
+export MAX_LENGTH=${MAX_NEW_TOKENS:-"256"}
+export HUGGINGFACEHUB_API_TOKEN=""
+export LLM_MODEL_ID=${LLM_ID}
+export TGI_PORT=8008
+```
+
+---
+### Echo env variables
+
+```bash
+echo "Extractor details"
+echo LLM_ID=${LLM_ID}
+echo SPAN_LENGTH=${SPAN_LENGTH}
+echo OVERLAP=${OVERLAP}
+echo MAX_LENGTH=${MAX_LENGTH}
+```
+
+---
+#### Start TGI Service
+
+```bash
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
+export TGI_PORT=8008
+
+docker run -d --name="text2graph-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e model=${LLM_MODEL_ID} ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id $LLM_MODEL_ID
+```
+
+#### Verify the TGI Service
+
+```bash
+export your_ip=$(hostname -I | awk '{print $1}')
+curl http://${your_ip}:${TGI_PORT}/generate \
+  -X POST \
+  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+  -H 'Content-Type: application/json'
+```
+#### Setup Environment Variables
+
+```bash
+export TGI_LLM_ENDPOINT="http://${your_ip}:${TGI_PORT}"
+```
+
+#### Start Text2Graph Microservice with Python Script
+
+**Command to build the text2graph microservice**
+
+```bash
+docker build -f Dockerfile -t user_name:graph_extractor ../../../
+```
+
+**Command to launch the text2graph microservice**
+
+```bash
+docker run -i -t --net=host --ipc=host -p 8090 user_name:graph_extractor
+```
+
+The container launches the text2graph microservice. To run it interactively instead:
+
+```bash
+python3 opea_text2graph_microservice.py
+```
+---
+
+# Validation and testing
+
+## Text to triplets
+The tests are located in `GenAIComps/tests/text2graph/`.
+
+A few examples are provided to help with the extraction:
+
+- `test_few_sentences.py` generates triplets from a couple of sentences.
+- `test_from_file.py` downloads a file and feeds it to the service.
+
+To run them:
+
+```bash
+python test_few_sentences.py
+python test_from_file.py
+```
+
+## Check if services are up
+### Setup validation process
+
+To validate the setup, open http://localhost:8090/docs for the Swagger documentation and the list of available commands.
+