diff --git a/comps/text2graph/deployment/docker_compose/compose.yaml b/comps/text2graph/deployment/docker_compose/compose.yaml
index 11f7e9539..6624734f9 100644
--- a/comps/text2graph/deployment/docker_compose/compose.yaml
+++ b/comps/text2graph/deployment/docker_compose/compose.yaml
@@ -17,7 +17,7 @@ services:
- LLM_MODEL_ID=${LLM_MODEL_ID:-"Babelscape/rebel-large"}
- HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
ipc: host
- restart: always
+ restart: always
text2graph-gaudi:
image: opea/text2graph:${TAG:-latest}
diff --git a/comps/text2graph/deployment/docker_compose/tmpdata/paul_graham_essay.txt b/comps/text2graph/deployment/docker_compose/tmpdata/paul_graham_essay.txt
index 30ffafc6c..e8079fb90 100644
--- a/comps/text2graph/deployment/docker_compose/tmpdata/paul_graham_essay.txt
+++ b/comps/text2graph/deployment/docker_compose/tmpdata/paul_graham_essay.txt
@@ -60,25 +60,25 @@ Then one day in April 1990 a crack appeared in the wall. I ran into professor Ch
I picked applications of continuations as the topic. In retrospect I should have written about macros and embedded languages. There's a whole world there that's barely been explored. But all I wanted was to get out of grad school, and my rapidly written dissertation sufficed, just barely.
-Meanwhile I was applying to art schools. I applied to two: RISD in the US, and the Accademia di Belli Arti in Florence, which, because it was the oldest art school, I imagined would be good. RISD accepted me, and I never heard back from the Accademia, so off to Providence I went.
+Meanwhile I was applying to art schools. I applied to two: RISD in the US, and the Academia di Belli Arti in Florence, which, because it was the oldest art school, I imagined would be good. RISD accepted me, and I never heard back from the Academia, so off to Providence I went.
I'd applied for the BFA program at RISD, which meant in effect that I had to go to college again. This was not as strange as it sounds, because I was only 25, and art schools are full of people of different ages. RISD counted me as a transfer sophomore and said I had to do the foundation that summer. The foundation means the classes that everyone has to take in fundamental subjects like drawing, color, and design.
-Toward the end of the summer I got a big surprise: a letter from the Accademia, which had been delayed because they'd sent it to Cambridge England instead of Cambridge Massachusetts, inviting me to take the entrance exam in Florence that fall. This was now only weeks away. My nice landlady let me leave my stuff in her attic. I had some money saved from consulting work I'd done in grad school; there was probably enough to last a year if I lived cheaply. Now all I had to do was learn Italian.
+Toward the end of the summer I got a big surprise: a letter from the Academia, which had been delayed because they'd sent it to Cambridge England instead of Cambridge Massachusetts, inviting me to take the entrance exam in Florence that fall. This was now only weeks away. My nice landlady let me leave my stuff in her attic. I had some money saved from consulting work I'd done in grad school; there was probably enough to last a year if I lived cheaply. Now all I had to do was learn Italian.
Only stranieri (foreigners) had to take this entrance exam. In retrospect it may well have been a way of excluding them, because there were so many stranieri attracted by the idea of studying art in Florence that the Italian students would otherwise have been outnumbered. I was in decent shape at painting and drawing from the RISD foundation that summer, but I still don't know how I managed to pass the written exam. I remember that I answered the essay question by writing about Cezanne, and that I cranked up the intellectual level as high as I could to make the most of my limited vocabulary. [2]
-I'm only up to age 25 and already there are such conspicuous patterns. Here I was, yet again about to attend some august institution in the hopes of learning about some prestigious subject, and yet again about to be disappointed. The students and faculty in the painting department at the Accademia were the nicest people you could imagine, but they had long since arrived at an arrangement whereby the students wouldn't require the faculty to teach anything, and in return the faculty wouldn't require the students to learn anything. And at the same time all involved would adhere outwardly to the conventions of a 19th century atelier. We actually had one of those little stoves, fed with kindling, that you see in 19th century studio paintings, and a nude model sitting as close to it as possible without getting burned. Except hardly anyone else painted her besides me. The rest of the students spent their time chatting or occasionally trying to imitate things they'd seen in American art magazines.
+I'm only up to age 25 and already there are such conspicuous patterns. Here I was, yet again about to attend some august institution in the hopes of learning about some prestigious subject, and yet again about to be disappointed. The students and faculty in the painting department at the Academia were the nicest people you could imagine, but they had long since arrived at an arrangement whereby the students wouldn't require the faculty to teach anything, and in return the faculty wouldn't require the students to learn anything. And at the same time all involved would adhere outwardly to the conventions of a 19th century atelier. We actually had one of those little stoves, fed with kindling, that you see in 19th century studio paintings, and a nude model sitting as close to it as possible without getting burned. Except hardly anyone else painted her besides me. The rest of the students spent their time chatting or occasionally trying to imitate things they'd seen in American art magazines.
Our model turned out to live just down the street from me. She made a living from a combination of modelling and making fakes for a local antique dealer. She'd copy an obscure old painting out of a book, and then he'd take the copy and maltreat it to make it look old. [3]
-While I was a student at the Accademia I started painting still lives in my bedroom at night. These paintings were tiny, because the room was, and because I painted them on leftover scraps of canvas, which was all I could afford at the time. Painting still lives is different from painting people, because the subject, as its name suggests, can't move. People can't sit for more than about 15 minutes at a time, and when they do they don't sit very still. So the traditional m.o. for painting people is to know how to paint a generic person, which you then modify to match the specific person you're painting. Whereas a still life you can, if you want, copy pixel by pixel from what you're seeing. You don't want to stop there, of course, or you get merely photographic accuracy, and what makes a still life interesting is that it's been through a head. You want to emphasize the visual cues that tell you, for example, that the reason the color changes suddenly at a certain point is that it's the edge of an object. By subtly emphasizing such things you can make paintings that are more realistic than photographs not just in some metaphorical sense, but in the strict information-theoretic sense. [4]
+While I was a student at the Academia I started painting still lives in my bedroom at night. These paintings were tiny, because the room was, and because I painted them on leftover scraps of canvas, which was all I could afford at the time. Painting still lives is different from painting people, because the subject, as its name suggests, can't move. People can't sit for more than about 15 minutes at a time, and when they do they don't sit very still. So the traditional m.o. for painting people is to know how to paint a generic person, which you then modify to match the specific person you're painting. Whereas a still life you can, if you want, copy pixel by pixel from what you're seeing. You don't want to stop there, of course, or you get merely photographic accuracy, and what makes a still life interesting is that it's been through a head. You want to emphasize the visual cues that tell you, for example, that the reason the color changes suddenly at a certain point is that it's the edge of an object. By subtly emphasizing such things you can make paintings that are more realistic than photographs not just in some metaphorical sense, but in the strict information-theoretic sense. [4]
I liked painting still lives because I was curious about what I was seeing. In everyday life, we aren't consciously aware of much we're seeing. Most visual perception is handled by low-level processes that merely tell your brain "that's a water droplet" without telling you details like where the lightest and darkest points are, or "that's a bush" without telling you the shape and position of every leaf. This is a feature of brains, not a bug. In everyday life it would be distracting to notice every leaf on every bush. But when you have to paint something, you have to look more closely, and when you do there's a lot to see. You can still be noticing new things after days of trying to paint something people usually take for granted, just as you can after days of trying to write an essay about something people usually take for granted.
This is not the only way to paint. I'm not 100% sure it's even a good way to paint. But it seemed a good enough bet to be worth trying.
-Our teacher, professor Ulivi, was a nice guy. He could see I worked hard, and gave me a good grade, which he wrote down in a sort of passport each student had. But the Accademia wasn't teaching me anything except Italian, and my money was running out, so at the end of the first year I went back to the US.
+Our teacher, professor Ulivi, was a nice guy. He could see I worked hard, and gave me a good grade, which he wrote down in a sort of passport each student had. But the Academia wasn't teaching me anything except Italian, and my money was running out, so at the end of the first year I went back to the US.
I wanted to go back to RISD, but I was now broke and RISD was very expensive, so I decided to get a job for a year and then return to RISD the next fall. I got one at a company called Interleaf, which made software for creating documents. You mean like Microsoft Word? Exactly. That was how I learned that low end software tends to eat high end software. But Interleaf still had a few years to live yet. [5]
@@ -92,7 +92,7 @@ But the most important thing I learned, and which I used in both Viaweb and Y Co
When I left to go back to RISD the next fall, I arranged to do freelance work for the group that did projects for customers, and this was how I survived for the next several years. When I came back to visit for a project later on, someone told me about a new thing called HTML, which was, as he described it, a derivative of SGML. Markup language enthusiasts were an occupational hazard at Interleaf and I ignored him, but this HTML thing later became a big part of my life.
-In the fall of 1992 I moved back to Providence to continue at RISD. The foundation had merely been intro stuff, and the Accademia had been a (very civilized) joke. Now I was going to see what real art school was like. But alas it was more like the Accademia than not. Better organized, certainly, and a lot more expensive, but it was now becoming clear that art school did not bear the same relationship to art that medical school bore to medicine. At least not the painting department. The textile department, which my next door neighbor belonged to, seemed to be pretty rigorous. No doubt illustration and architecture were too. But painting was post-rigorous. Painting students were supposed to express themselves, which to the more worldly ones meant to try to cook up some sort of distinctive signature style.
+In the fall of 1992 I moved back to Providence to continue at RISD. The foundation had merely been intro stuff, and the Academia had been a (very civilized) joke. Now I was going to see what real art school was like. But alas it was more like the Academia than not. Better organized, certainly, and a lot more expensive, but it was now becoming clear that art school did not bear the same relationship to art that medical school bore to medicine. At least not the painting department. The textile department, which my next door neighbor belonged to, seemed to be pretty rigorous. No doubt illustration and architecture were too. But painting was post-rigorous. Painting students were supposed to express themselves, which to the more worldly ones meant to try to cook up some sort of distinctive signature style.
A signature style is the visual equivalent of what in show business is known as a "schtick": something that immediately identifies the work as yours and no one else's. For example, when you see a painting that looks like a certain kind of cartoon, you know it's by Roy Lichtenstein. So if you see a big painting of this type hanging in the apartment of a hedge fund manager, you know he paid millions of dollars for it. That's not always why artists have a signature style, but it's usually why buyers pay a lot for such work. [6]
@@ -304,7 +304,7 @@ Notes
[2] Italian words for abstract concepts can nearly always be predicted from their English cognates (except for occasional traps like polluzione). It's the everyday words that differ. So if you string together a lot of abstract concepts with a few simple verbs, you can make a little Italian go a long way.
-[3] I lived at Piazza San Felice 4, so my walk to the Accademia went straight down the spine of old Florence: past the Pitti, across the bridge, past Orsanmichele, between the Duomo and the Baptistery, and then up Via Ricasoli to Piazza San Marco. I saw Florence at street level in every possible condition, from empty dark winter evenings to sweltering summer days when the streets were packed with tourists.
+[3] I lived at Piazza San Felice 4, so my walk to the Academia went straight down the spine of old Florence: past the Pitti, across the bridge, past Orsanmichele, between the Duomo and the Baptistery, and then up Via Ricasoli to Piazza San Marco. I saw Florence at street level in every possible condition, from empty dark winter evenings to sweltering summer days when the streets were packed with tourists.
[4] You can of course paint people like still lives if you want to, and they're willing. That sort of portrait is arguably the apex of still life painting, though the long sitting does tend to produce pained expressions in the sitters.
diff --git a/comps/text2graph/src/README.md b/comps/text2graph/src/README.md
index b5d9427bb..c08423fd0 100644
--- a/comps/text2graph/src/README.md
+++ b/comps/text2graph/src/README.md
@@ -1,40 +1,45 @@
# Text to graph triplet extractor
-Creating graphs from text is about converting unstructured text into structured data is challenging.
+Creating a graph from text means converting unstructured text into structured data, which is challenging.
It's gained significant traction with the advent of Large Language Models (LLMs), bringing it more into the mainstream.
-There are approaches to extract graph triplets using different types of LLMs.
+There are several approaches to extracting graph triplets, using different types of LLMs.
-##Encoder-decoder models
+## Encoder-decoder models
Encoder-decoder models such as REBEL are based on the BART model and fine-tuned for relation extraction and classification tasks.
The other approach uses decoder-only models. Depending on the application and data source, one approach may work better than the other.
-Encoder decoder models often achieve high performance on benchmarks due to their ability to encode contextual
-information effectively. It is suitable for tasks requiring detailed parsing of text into structured formats,
+Encoder-decoder models often achieve high performance on benchmarks because they encode contextual
+information effectively. They are suitable for tasks that require detailed parsing of text into structured formats,
such as knowledge graph construction from unstructured data.
## Decoder-only models
-Decoder-only models are faster during inference as they skip the encoding. This is ideal for tasks where the
-input-output mapping is simpler or where multitasking is required. It is suitable for generating outputs based on
-prompts or when computational efficiency is a priority. In certain cases, the decoder only models struggle with
+Decoder-only models are faster during inference because they skip the encoding step. This makes them ideal for tasks where the
+input-output mapping is simpler or where multitasking is required. They are suitable for generating outputs based on
+prompts or when computational efficiency is a priority. In some cases, decoder-only models struggle with
tasks requiring deep contextual understanding or when input-output structures are highly heterogeneous.
This microservice uses the encoder-decoder approach for graph triplet extraction.
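+
+As an illustration of the encoder-decoder approach, the following minimal sketch runs the same Babelscape/rebel-large model directly with the transformers library. The sample sentence and the shown output are illustrative assumptions, not output produced by the microservice.
+
+```python
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+model_id = "Babelscape/rebel-large"  # same model the microservice loads
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+
+text = "Paul Graham studied painting at RISD in Providence."  # illustrative input
+inputs = tokenizer([text], return_tensors="pt")
+generated = model.generate(**inputs, max_length=256, num_beams=3, num_return_sequences=1)
+# Keep the special tokens: REBEL encodes triplets with <triplet>/<subj>/<obj> markers.
+print(tokenizer.batch_decode(generated, skip_special_tokens=False)[0])
+```
+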
---
+
# Features
**Provide text input, and the graph triplets and nodes are identified.**
## Implementation
-The text-to-graph microservice able to extract from unstructured text
+The text-to-graph microservice extracts graph triplets from unstructured text.
#### 🚀 Start Microservice with Python (Option 1)
#### Install Requirements
+
```bash
pip install -r requirements.txt
```
+
---
+
### Environment variables: Configure LLM parameters based on the selected model.
+
export LLM_ID=${LLM_ID:-"Babelscape/rebel-large"}
export SPAN_LENGTH=${SPAN_LENGTH:-"1024"}
export OVERLAP=${OVERLAP:-"100"}
@@ -42,16 +47,20 @@ export MAX_LENGTH=${MAX_NEW_TOKENS:-"256"}
export HUGGINGFACEHUB_API_TOKEN=""
export LLM_MODEL_ID=${LLM_ID}
export TGI_PORT=8008
+
---
---
+
### Echo env variables
echo "Extractor details"
echo LLM_ID=${LLM_ID}
echo SPAN_LENGTH=${SPAN_LENGTH}
echo OVERLAP=${OVERLAP}
echo MAX_LENGTH=${MAX_LENGTH}
+
---
+
#### Start TGI Service
```bash
@@ -71,6 +80,7 @@ curl http://${your_ip}:${TGI_PORT}/generate \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```
+
#### Setup Environment Variables
```bash
@@ -79,31 +89,35 @@ export TGI_LLM_ENDPOINT="http://${your_ip}:${TGI_PORT}"
#### Start Text2Graph Microservice with Python Script
-**Command to build text2graph microservice
+**Command to build the text2graph microservice:**
docker build -f Dockerfile -t user_name:graph_extractor ../../../
-**Command to launch text2graph microservice
-docker run -i -t --net=host --ipc=host -p 8090 user_name:graph_extractor
+**Command to launch the text2graph microservice:**
+docker run -i -t --net=host --ipc=host -p 8090 user_name:graph_extractor
+
+This command launches the text2graph microservice in a container. To run it interactively instead:
-The docker launches the text2graph microservice. To run it interactive
```bash
python3 opea_text2graph_microservice.py
```
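+
+Once the service is running, you can also exercise the endpoint from Python. The following is a minimal sketch based on tests/text2graph/example_from_file.py; it assumes the service is listening on localhost port 8090.
+
+```python
+import requests
+
+# Assumes the text2graph microservice started above is reachable on port 8090.
+url = "http://localhost:8090/v1/text2graph"
+text = "Paul Graham studied painting at RISD in Providence."
+response = requests.post(url, params={"input_text": text}, headers={"accept": "application/json"})
+print(response.status_code)
+print(response.json())  # the service wraps the extracted head/type/tail triplets under "result"
+```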
+
---
# Validation and testing
## Text to triplets
+
GenAIComps/tests/text2graph/
-There are a few examples provided to help with the extraction.
-test_few_sentences.py generates triplets from couple of sentences.
-test_from_file.py download and feed a file.
-how to use it ?
- python test_few_sentences.py
- python test_from_file.py
+There are a few examples provided to help with the extraction.
+test_few_sentences.py generates triplets from a couple of sentences.
+test_from_file.py downloads a text file and feeds it to the service.
+To run them:
+python test_few_sentences.py
+python test_from_file.py
## Check if services are up
-### Setup validation process
- For set up use http://localhost:8090/docs for swagger documentation and list of commands
+### Setup validation process
+
+To validate the setup, use http://localhost:8090/docs for the Swagger documentation and the list of available commands.
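+
+A quick scripted liveness check (a sketch, assuming the service runs locally on port 8090):
+
+```python
+import requests
+
+# Assumes the text2graph microservice is running on localhost:8090.
+resp = requests.get("http://localhost:8090/docs", timeout=10)
+print("service is up" if resp.status_code == 200 else f"unexpected status: {resp.status_code}")
+```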
diff --git a/comps/text2graph/src/integrations/graph_agent.py b/comps/text2graph/src/integrations/graph_agent.py
index dfbab11b4..57cc45d1c 100644
--- a/comps/text2graph/src/integrations/graph_agent.py
+++ b/comps/text2graph/src/integrations/graph_agent.py
@@ -1,140 +1,138 @@
-import torch, re, math, os, csv
-import pandas as pd
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import csv
+import math
+import os
+import re
from typing import List, Tuple
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+import pandas as pd
+import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-class TripletExtractor():
- def triplet_extractor(self,text):
- triplets = []
- relation, subject, relation, object_ = '', '', '', ''
- text = text.strip()
- current = 'x'
- for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
- if token == "<triplet>":
- current = 't'
- if relation != '':
- triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
- relation = ''
- subject = ''
- elif token == "<subj>":
- current = 's'
- if relation != '':
- triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
- object_ = ''
- elif token == "<obj>":
- current = 'o'
- relation = ''
- else:
- if current == 't':
- subject += ' ' + token
- elif current == 's':
- object_ += ' ' + token
- elif current == 'o':
- relation += ' ' + token
- if subject != '' and relation != '' and object_ != '':
- triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
- return triplets
-
-class TripletBuilder():
- def __init__(self):
- # Load model and tokenizer
- MODEL_NAME="Babelscape/rebel-large"
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-
- ## Defines
- self.span_length = int(os.environ.get('SPAN_LENGTH','1024'))
- self.overlap = int(os.environ.get('OVERLAP','100'))
- self.model = model
- self.tokenizer = tokenizer
-
- async def cal_index_span(self, total_tokens, span_length, overlap):
- num_spans = math.ceil(total_tokens/span_length) +1 # Calculate number of spans and assign to num_spans
- spans=[] # Initialize an empty list to store the spans
- start = 0
- for i in range(num_spans): # Iterate using the calculated num_spans
- start = i * (span_length - overlap)
- end = min(start + span_length, total_tokens) # Calculate end
- if(end >= total_tokens):
- end = total_tokens
- start = end - span_length
- if span_length <= overlap:
- raise ValueError("Indexing is incorrect something is wrong")
-
- spans.append([start,end]) # Append the span to the list
- return spans
-
- async def gen_tokenize(self, text: str) -> List[str]:
- #print(f'entering tokenizer {text[:100]}')
- tensor_tokens = self.tokenizer([text], return_tensors="pt")
- #print(f'done entering tokenizer {tensor_tokens}')
- return tensor_tokens
-
- async def rearrange_array(self, data_array, index_list):
- tensor_ids = [inputs["input_ids"][0][[0]:[1]]
- for boundary in spans_boundaries]
- tensor_masks = [inputs["attention_mask"][0][[0]:[1]]
- for boundary in spans_boundaries]
- inputs = {
- "input_ids": torch.stack(tensor_ids),
- "attention_mask": torch.stack(tensor_masks)
- }
-
- return tensors
-
-
- ## code
- async def extract_graph(self, text):
- #print(f'Entering graph extraction')
- tokenize_input = await self.gen_tokenize(text)
- total_tokens = len(tokenize_input['input_ids'][0])
- span_index_gen = await self.cal_index_span(total_tokens, self.span_length,self.overlap)
- tensor_ids = [torch.tensor(tokenize_input["input_ids"][0][start:end]) for start, end in span_index_gen]
- tensor_masks = [torch.tensor(tokenize_input["attention_mask"][0][start:end]) for start, end in span_index_gen]
- rearrange_inputs = {
- "input_ids": torch.stack(tensor_ids),
- "attention_mask": torch.stack(tensor_masks)
- }
-
- # generate relations
- MAX_LENGTH = int(os.environ.get('MAX_LENGTH','256'))
- num_return_sequences = 3
- gen_kwargs = {
- "max_length": MAX_LENGTH,
- "length_penalty": 0,
- "num_beams": 3,
- "num_return_sequences": num_return_sequences
- }
-
- generated_tokens = self.model.generate( **rearrange_inputs, **gen_kwargs)
-
- # decode relations
- decoded_preds = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
-
- # create kb
- tm = TripletManager()
- te = TripletExtractor()
- i = 0
-
- for sentence_pred in decoded_preds:
- current_span_index = i // num_return_sequences
- relations = te.triplet_extractor(sentence_pred)
- for relation in relations:
- tm.add_relation(relation)
- i += 1
- return tm
-
-
-class TripletManager():
+
+class TripletExtractor:
+ def triplet_extractor(self, text):
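+ # REBEL linearizes each relation as "<triplet> head <subj> tail <obj> relation";
+ # this parser walks the decoded tokens and rebuilds {"head", "type", "tail"} dicts.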
+ triplets = []
+ subject, relation, object_ = "", "", ""
+ text = text.strip()
+ current = "x"
+ for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
+ if token == "<triplet>":
+ current = "t"
+ if relation != "":
+ triplets.append({"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()})
+ relation = ""
+ subject = ""
+ elif token == "<subj>":
+ current = "s"
+ if relation != "":
+ triplets.append({"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()})
+ object_ = ""
+ elif token == "<obj>":
+ current = "o"
+ relation = ""
+ else:
+ if current == "t":
+ subject += " " + token
+ elif current == "s":
+ object_ += " " + token
+ elif current == "o":
+ relation += " " + token
+ if subject != "" and relation != "" and object_ != "":
+ triplets.append({"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()})
+ return triplets
+
+
+class TripletBuilder:
def __init__(self):
- self.entities = {} # { entity_title: {...} }
- self.relations = [] # [ head: entity_title, type: category, tail: entity_title]
+ # Load model and tokenizer
+ MODEL_NAME = "Babelscape/rebel-large"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
+
+ ## Defines
+ self.span_length = int(os.environ.get("SPAN_LENGTH", "1024"))
+ self.overlap = int(os.environ.get("OVERLAP", "100"))
+ self.model = model
+ self.tokenizer = tokenizer
+
+ async def cal_index_span(self, total_tokens, span_length, overlap):
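+ # Build overlapping [start, end) windows over the token ids. With illustrative
+ # values total_tokens=2300, span_length=1024, overlap=100, the loop yields
+ # [0, 1024], [924, 1948], then clamps the remaining window(s) to [1276, 2300].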
+ num_spans = math.ceil(total_tokens / span_length) + 1 # Calculate number of spans and assign to num_spans
+ spans = [] # Initialize an empty list to store the spans
+ start = 0
+ for i in range(num_spans): # Iterate using the calculated num_spans
+ start = i * (span_length - overlap)
+ end = min(start + span_length, total_tokens) # Calculate end
+ if end >= total_tokens:
+ end = total_tokens
+ start = end - span_length
+ if span_length <= overlap:
+ raise ValueError("Invalid span indexing: span_length must be greater than overlap")
+
+ spans.append([start, end]) # Append the span to the list
+ return spans
+
+ async def gen_tokenize(self, text: str) -> List[str]:
+ # print(f'entering tokenizer {text[:100]}')
+ tensor_tokens = self.tokenizer([text], return_tensors="pt")
+ # print(f'done entering tokenizer {tensor_tokens}')
+ return tensor_tokens
+
+ async def rearrange_array(self, data_array, index_list):
+ # Mirror the span slicing done in extract_graph, but over the provided data_array/index_list.
+ tensor_ids = [data_array["input_ids"][0][start:end] for start, end in index_list]
+ tensor_masks = [data_array["attention_mask"][0][start:end] for start, end in index_list]
+ rearranged = {"input_ids": torch.stack(tensor_ids), "attention_mask": torch.stack(tensor_masks)}
+
+ return rearranged
+
+ # Tokenize the text, split it into overlapping spans, generate relations per span, and collect them into a TripletManager.
+ async def extract_graph(self, text):
+ # print(f'Entering graph extraction')
+ tokenize_input = await self.gen_tokenize(text)
+ total_tokens = len(tokenize_input["input_ids"][0])
+ span_index_gen = await self.cal_index_span(total_tokens, self.span_length, self.overlap)
+ tensor_ids = [torch.tensor(tokenize_input["input_ids"][0][start:end]) for start, end in span_index_gen]
+ tensor_masks = [torch.tensor(tokenize_input["attention_mask"][0][start:end]) for start, end in span_index_gen]
+ rearrange_inputs = {"input_ids": torch.stack(tensor_ids), "attention_mask": torch.stack(tensor_masks)}
+
+ # generate relations
+ MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "256"))
+ num_return_sequences = 3
+ gen_kwargs = {
+ "max_length": MAX_LENGTH,
+ "length_penalty": 0,
+ "num_beams": 3,
+ "num_return_sequences": num_return_sequences,
+ }
+
+ generated_tokens = self.model.generate(**rearrange_inputs, **gen_kwargs)
+
+ # decode relations
+ decoded_preds = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
+
+ # create kb
+ tm = TripletManager()
+ te = TripletExtractor()
+ i = 0
+
+ for sentence_pred in decoded_preds:
+ current_span_index = i // num_return_sequences
+ relations = te.triplet_extractor(sentence_pred)
+ for relation in relations:
+ tm.add_relation(relation)
+ i += 1
+ return tm
+
+
+class TripletManager:
+ def __init__(self):
+ self.entities = {} # { entity_title: {...} }
+ self.relations = [] # [ head: entity_title, type: category, tail: entity_title]
def are_relations_equal(self, relation1, relation2):
- """
- Check if two relations are equal.
- """
+ """Check if two relations are equal."""
head_match = relation1["head"] == relation2["head"]
type_match = relation1["type"] == relation2["type"]
tail_match = relation1["tail"] == relation2["tail"]
@@ -142,39 +140,30 @@ def are_relations_equal(self, relation1, relation2):
return all_match
def exists_relation(self, relation1):
- """
- Check if relation exists.
- """
+ """Check if relation exists."""
return any(self.are_relations_equal(relation1, relation2) for relation2 in self.relations)
def merge_relations(self, relation2):
- """
- Merge two relations.
- """
- relation1 = [r for r in self.relations
- if self.are_relations_equal(relation2, r)][0]
+ """Merge two relations."""
+ relation1 = [r for r in self.relations if self.are_relations_equal(relation2, r)][0]
def exists_entity(self, entity_title):
return entity_title in self.entities
def add_entity(self, entity):
- """
- Check if entry exists and add if not.
- """
- if self.exists_entity(entity): # Directly check if the entity exists
+ """Check if entry exists and add if not."""
+ if self.exists_entity(entity): # Directly check if the entity exists
return
- self.entities[entity] = {"title": entity} # Create a dictionary for the entity
+ self.entities[entity] = {"title": entity} # Create a dictionary for the entity
return
def add_relation(self, relation):
- """
- Add entry checking to see if it needs merge or create a new entry.
- """
+ """Add entry checking to see if it needs merge or create a new entry."""
candidate_entities = [relation["head"], relation["tail"]]
# manage new entities
for entity in candidate_entities:
- self.add_entity(entity)
+ self.add_entity(entity)
# manage new relation
if not self.exists_relation(relation):
@@ -183,36 +172,27 @@ def add_relation(self, relation):
self.merge_relations(relation)
def write_to_csv(self, WRITE_TO_CSV=None):
- """
- Saves the entities and relations to a CSV file.
- """
- struct_entity = {
- 'entity' : [],
- 'details' : []
- }
- struct_triplets = {
- 'head': [],
- 'type': [],
- 'tail': []
- }
+ """Saves the entities and relations to a CSV file."""
+ struct_entity = {"entity": [], "details": []}
+ struct_triplets = {"head": [], "type": [], "tail": []}
# Instead of appending, build lists of entities and relations
entity_data = []
for entity in self.entities.items():
- entity_data.append(entity)
-
+ entity_data.append(entity)
+
relation_data = []
for relation in self.relations:
relation_data.append(relation)
# Create DataFrames from the collected data
- df_entity = pd.DataFrame(entity_data, columns=['entity', 'details'])
- df_relation = pd.DataFrame(relation_data)
+ df_entity = pd.DataFrame(entity_data, columns=["entity", "details"])
+ df_relation = pd.DataFrame(relation_data)
# Write to CSV if requested
if WRITE_TO_CSV == True:
- df_entity.to_csv('entities.csv', index=True)
- df_relation.to_csv('relations.csv', index=True)
+ df_entity.to_csv("entities.csv", index=True)
+ df_relation.to_csv("relations.csv", index=True)
return df_entity, df_relation
diff --git a/comps/text2graph/src/integrations/opea.py b/comps/text2graph/src/integrations/opea.py
index 5ab35d4fa..05502d16f 100644
--- a/comps/text2graph/src/integrations/opea.py
+++ b/comps/text2graph/src/integrations/opea.py
@@ -6,11 +6,14 @@
import os
import time
from typing import Annotated, Optional
+
from langchain.agents.agent_types import AgentType
from langchain_huggingface import HuggingFaceEndpoint
from pydantic import BaseModel, Field
+
from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType
-from comps.text2graph.src.integrations.graph_agent import TripletManager, TripletBuilder, TripletExtractor
+from comps.text2graph.src.integrations.graph_agent import TripletBuilder, TripletExtractor, TripletManager
+
##from comps.text2graph.src.integrations.triplet_manager import TripletManager
##from comps.text2graph.src.integrations.triplet_builder import TripletBuilder
##from comps.text2graph.src.integrations.triplet_extractor import TripletExtractor
@@ -31,23 +34,23 @@
"streaming": True,
}
-#TGI_LLM_ENDPOINT = os.environ.get("TGI_LLM_ENDPOINT")
+# TGI_LLM_ENDPOINT = os.environ.get("TGI_LLM_ENDPOINT")
#
-#llm = HuggingFaceEndpoint(
+# llm = HuggingFaceEndpoint(
# endpoint_url=TGI_LLM_ENDPOINT,
# task="text-generation",
# **generation_params,
# )
+
class Input(BaseModel):
input_text: str
- #conn_str: Optional[PostgresConnection] = None
+ # conn_str: Optional[PostgresConnection] = None
+
@OpeaComponentRegistry.register("OPEA_TEXT2GRAPH")
class OpeaText2GRAPH(OpeaComponent):
- """
- A specialized text to graph triplet converter
- """
+ """A specialized text to graph triplet converter."""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.TEXT2GRAPH.name.lower(), description, config)
@@ -57,29 +60,31 @@ def __init__(self, name: str, description: str, config: dict = None):
async def check_health(self) -> bool:
"""Checks the health of the TGI service.
+
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
- #response = llm.generate(["Hello, how are you?"])
+ # response = llm.generate(["Hello, how are you?"])
return True
except Exception as e:
return False
async def invoke(self, input_text: str):
- """Invokes the text2graph service to generate graph(s) for the provided input.
+ """Invokes the text2graph service to generate graph(s) for the provided input.
+
input:
input: text document
Returns:
text : dict
- """
+ """
- tb = TripletBuilder()
- graph_triplets = await tb.extract_graph(input_text)
+ tb = TripletBuilder()
+ graph_triplets = await tb.extract_graph(input_text)
- #tm = TripletManager()
- #entity, relation = tm.write_to_csv(WRITE_TO_CSV=False)
+ # tm = TripletManager()
+ # entity, relation = tm.write_to_csv(WRITE_TO_CSV=False)
- result = {"graph_triplets": graph_triplets}
+ result = {"graph_triplets": graph_triplets}
- return result
+ return result
diff --git a/comps/text2graph/src/opea_text2graph_microservice.py b/comps/text2graph/src/opea_text2graph_microservice.py
index bc46c61cd..139508df1 100644
--- a/comps/text2graph/src/opea_text2graph_microservice.py
+++ b/comps/text2graph/src/opea_text2graph_microservice.py
@@ -1,8 +1,12 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
import os
import pathlib
import sys
from fastapi.exceptions import HTTPException
+
from comps import CustomLogger, OpeaComponentLoader, opea_microservices, register_microservice
from comps.text2graph.src.integrations.opea import Input, OpeaText2GRAPH
@@ -21,6 +25,7 @@
description=f"OPEA Text2Graph Component: {text2graph_component_name}",
)
+
@register_microservice(
name="opea_service@text2graph",
endpoint="/v1/text2graph",
@@ -29,16 +34,17 @@
)
async def execute_agent(input_text: str):
"""Execute triplet extraction from text file.
+
This function takes the input text for triplet extraction.
It uses the text2graph component to extract graph triplets and returns the result.
Args:
- input (Input): An Input object with the input text
+ input_text (str): The input text to extract triplets from
Returns:
dict: A dictionary with head, tail and type linking head and tail
"""
- print(f'===============================================================')
- print(f'===================ENTERING THIS EXECUTE AGENT=================')
- print(f'===============================================================')
+ print("===============================================================")
+ print("===================ENTERING THIS EXECUTE AGENT=================")
+ print("===============================================================")
results = await loader.invoke(input_text)
return {"result": results}
diff --git a/comps/text2graph/src/requirements.txt b/comps/text2graph/src/requirements.txt
index b4c35d5be..45ff5d437 100644
--- a/comps/text2graph/src/requirements.txt
+++ b/comps/text2graph/src/requirements.txt
@@ -1,5 +1,6 @@
docarray[full]
fastapi
+hanging_threads
langchain==0.2.9
langchain-huggingface
langchain_community==0.2.7
@@ -15,4 +16,3 @@ pydantic
shortuuid
sqlalchemy
uvicorn
-hanging_threads
diff --git a/comps/text2graph/src/setup_service_env.sh b/comps/text2graph/src/setup_service_env.sh
index 505b0ef4c..68e271926 100755
--- a/comps/text2graph/src/setup_service_env.sh
+++ b/comps/text2graph/src/setup_service_env.sh
@@ -1,3 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
#######################################################################
# Proxy
#######################################################################
@@ -5,7 +8,7 @@ export https_proxy=${https_proxy}
export http_proxy=${http_proxy}
export no_proxy=${no_proxy}
################################################################
-# Configure LLM Parameters based on the model selected.
+# Configure LLM Parameters based on the model selected.
################################################################
export LLM_ID=${LLM_ID:-"Babelscape/rebel-large"}
export SPAN_LENGTH=${SPAN_LENGTH:-"1024"}
diff --git a/tests/text2graph/example_from_file.py b/tests/text2graph/example_from_file.py
index ed7f52597..44e3b4cdb 100755
--- a/tests/text2graph/example_from_file.py
+++ b/tests/text2graph/example_from_file.py
@@ -1,42 +1,52 @@
-import os, sys
-import requests
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
import subprocess
+import sys
from urllib.parse import quote
+
+import requests
+
################################################################
-# Download the text file to extract fraph from
+# Download the text file to extract the graph from
################################################################
# Define the input data : big text file and feed it
curr_dir = os.getcwd()
-append_dir = '/tmpdata'
+append_dir = "/tmpdata"
PATH = curr_dir + append_dir
os.system(f"mkdir -p {PATH}")
-os.system(f"wget -P {PATH} 'https://gist.githubusercontent.com/wey-gu/75d49362d011a0f0354d39e396404ba2/raw/0844351171751ebb1ce54ea62232bf5e59445bb7/paul_graham_essay.txt'")
-text = open(f'{PATH}/paul_graham_essay.txt').read()
-encoded_data2 = quote(text)
+os.system(
+ f"wget -P {PATH} 'https://gist.githubusercontent.com/wey-gu/75d49362d011a0f0354d39e396404ba2/raw/0844351171751ebb1ce54ea62232bf5e59445bb7/paul_graham_essay.txt'"
+)
+text = open(f"{PATH}/paul_graham_essay.txt").read()
+encoded_data2 = quote(text)
+
##################################################################
-# Function to parse the output to decipher if
-# triplets head->relation->tail was extracted
+# Function to parse the response and check whether
+# head -> relation -> tail triplets were extracted
##################################################################
def run_check_keywords(response):
# Check for keywords in the response
- if all(key in response.text.lower() for key in ['head', 'tail', 'type']):
+ if all(key in response.text.lower() for key in ["head", "tail", "type"]):
print("TEST PASS :: All three keys (head, tail, type) exist in the response.")
return True
print("TEST FAIL: No keyword found")
return False
+
##################################################################
# Extract graph from text2graph
##################################################################
PORT = 8090
BASE_URL = f"http://localhost:{PORT}/v1/text2graph"
-headers = {'accept': 'application/json'}
+headers = {"accept": "application/json"}
# Send the text as a query parameter
-response = requests.post(url=BASE_URL, params={'input_text': text}, headers=headers)
+response = requests.post(url=BASE_URL, params={"input_text": text}, headers=headers)
print(f"{response.json()}")
if response.status_code == 200:
print(f"Microservice response code: {response.status_code}")
diff --git a/tests/text2graph/native_text2graph.sh b/tests/text2graph/native_text2graph.sh
index d00ad8d7b..ae9edf18c 100755
--- a/tests/text2graph/native_text2graph.sh
+++ b/tests/text2graph/native_text2graph.sh
@@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}')
service_name="text2graph"
function build_docker_graph() {
- echo "=================== START BUILD DOCKER ========================"
+ echo "=================== START BUILD DOCKER ========================"
cd $WORKPATH
echo $(pwd)
docker build --no-cache -t opea/text2graph:latest -f comps/text2graph/src/Dockerfile .
@@ -20,29 +20,29 @@ function build_docker_graph() {
else
echo "opea/text2graph built successful"
fi
- echo "=================== END BUILD DOCKER ========================"
+ echo "=================== END BUILD DOCKER ========================"
}
function start_service() {
- echo "=================== START SERVICE ========================"
+ echo "=================== START SERVICE ========================"
cd $WORKPATH/comps/text2graph/deployment/docker_compose
docker compose -f compose.yaml up ${service_name} -d > start_services_with_compose.log
sleep 10s
- echo "=================== END SERVICE ========================"
+ echo "=================== END SERVICE ========================"
}
function validate_microservice() {
- echo "=================== START VALIDATE ========================"
+ echo "=================== START VALIDATE ========================"
cd $WORKPATH/tests/text2graph
python3 example_from_file.py
- echo "=================== END VALIDATE ========================"
+ echo "=================== END VALIDATE ========================"
}
function stop_docker() {
- echo "=================== START STOP DOCKER ========================"
+ echo "=================== START STOP DOCKER ========================"
cd $WORKPATH/comps/text2graph/deployment/docker_compose
docker compose -f compose.yaml down ${service_name} --remove-orphans
- echo "=================== END STOP DOCKER ========================"
+ echo "=================== END STOP DOCKER ========================"
}
function main() {