Add SentenceTransformersRanker with pre-trained Cross-Encoder #1209

Merged (6 commits, Jul 7, 2021)
1 change: 1 addition & 0 deletions haystack/ranker/__init__.py
@@ -1 +1,2 @@
from haystack.ranker.farm import FARMRanker
from haystack.ranker.sentence_transformers import SentenceTransformersRanker
16 changes: 14 additions & 2 deletions haystack/ranker/farm.py
@@ -19,12 +19,24 @@
class FARMRanker(BaseRanker):
"""
Transformer-based model for Document Re-ranking using the TextPairClassifier of the FARM framework (/~https://github.com/deepset-ai/FARM).
Re-ranking can be used on top of a retriever to boost performance for document search. This is particularly useful if the retriever has high recall but performs poorly at sorting the documents by relevance.
While the underlying model can vary (BERT, RoBERTa, DistilBERT, ...), the interface remains the same.
FARMRanker handles Cross-Encoder models that internally use two logits and output the classifier's probability of label "1" as the similarity score.
This includes TextPairClassification models trained within FARM.
In contrast, SentenceTransformersRanker handles Cross-Encoder models that use a single logit as similarity score.
https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers

| With a FARMRanker, you can:

- directly get predictions via predict()
- fine-tune the model on TextPair data via train()

Usage example:
...
retriever = ElasticsearchRetriever(document_store=document_store)
ranker = FARMRanker(model_name_or_path="deepset/gbert-base-germandpr-reranking")
p = Pipeline()
p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"])
"""

def __init__(
@@ -232,7 +244,7 @@ def predict_batch(self, query_doc_list: List[dict], top_k: int = None, batch_size: int = None):
"""
raise NotImplementedError

def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document]:
"""
Use loaded ranker model to re-rank the supplied list of Document.

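Side note for readers of this diff: the FARMRanker docstring above distinguishes two-logit Cross-Encoders (which output the probability of label "1" as the score) from the single-logit models handled by the new SentenceTransformersRanker. The snippet below is a minimal sketch of that two-logit scoring with plain transformers; it is not part of this PR, and the example passage and the two-label layout of the checkpoint are assumptions for illustration.

# Sketch only (not part of this PR): scoring a query/passage pair with a
# two-logit Cross-Encoder, taking the softmax probability of label "1" as the
# similarity score. Passage text and label order are assumed for illustration.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "deepset/gbert-base-germandpr-reranking"  # assumed two-label checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

query = "Welches ist die zweitgrößte Stadt in den Alpen?"
passage = "Innerhalb der Alpen ist das französische Grenoble die größte Stadt."

features = tokenizer([query], [passage], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**features).logits              # shape (1, 2) for a two-label model
    score = torch.softmax(logits, dim=-1)[:, 1]    # probability of label "1"
print(float(score))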
103 changes: 103 additions & 0 deletions haystack/ranker/sentence_transformers.py
@@ -0,0 +1,103 @@
import logging
from pathlib import Path
from typing import List, Optional, Union

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from haystack import Document
from haystack.ranker.base import BaseRanker

logger = logging.getLogger(__name__)


class SentenceTransformersRanker(BaseRanker):
"""
Sentence Transformer based pre-trained Cross-Encoder model for Document Re-ranking (https://huggingface.co/cross-encoder).
Re-ranking can be used on top of a retriever to boost performance for document search. This is particularly useful if the retriever has high recall but performs poorly at sorting the documents by relevance.

SentenceTransformersRanker handles Cross-Encoder models that use a single logit as the similarity score.
https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers
In contrast, FARMRanker handles Cross-Encoder models that internally use two logits and output the classifier's probability of label "1" as similarity score.
This includes TextPairClassification models trained within FARM.

| With a SentenceTransformersRanker, you can:
- directly get predictions via predict()

Usage example:
...
retriever = ElasticsearchRetriever(document_store=document_store)
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")
p = Pipeline()
p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"])
"""

def __init__(
self,
model_name_or_path: Union[str, Path],
model_version: Optional[str] = None,
top_k: int = 10
):

"""
:param model_name_or_path: Directory of a saved model or the name of a public model e.g.
'cross-encoder/ms-marco-MiniLM-L-12-v2'.
See https://huggingface.co/cross-encoder for a full list of available models.
:param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
:param top_k: The maximum number of documents to return
"""

# save init parameters to enable export of component config as YAML
self.set_config(
model_name_or_path=model_name_or_path, model_version=model_version,
top_k=top_k,
)

self.top_k = top_k

self.transformer_model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, revision=model_version)
self.transformer_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path, revision=model_version)
self.transformer_model.eval()

def predict_batch(self, query_doc_list: List[dict], top_k: int = None, batch_size: int = None):
"""
For a list of queries, use the loaded ranker model to rank each query's supplied list of Documents.

Returns a list of dictionaries, each containing a query and its list of Documents sorted by (desc.) similarity with the query.

:param query_doc_list: List of dictionaries containing queries with their retrieved documents
:param top_k: The maximum number of documents to return for each query
:param batch_size: Number of samples the model receives in one batch for inference
:return: List of dictionaries containing query and ranked list of Document
"""
raise NotImplementedError

def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document]:
"""
Use loaded ranker model to re-rank the supplied list of Document.

Returns a list of Documents sorted by (desc.) similarity with the query.

:param query: Query string
:param documents: List of Document to be re-ranked
:param top_k: The maximum number of documents to return
:return: List of Document
"""
if top_k is None:
top_k = self.top_k

features = self.transformer_tokenizer([query for doc in documents], [doc.text for doc in documents],
padding=True, truncation=True, return_tensors="pt")

# In contrast to FARMRanker, SentenceTransformersRanker uses the logit as similarity score and not the classifier's probability of label "1"
# https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers
with torch.no_grad():
similarity_scores = self.transformer_model(**features).logits

# rank documents according to scores
sorted_scores_and_documents = sorted(zip(similarity_scores, documents),
key=lambda similarity_document_tuple: similarity_document_tuple[0],
reverse=True)
sorted_documents = [doc for _, doc in sorted_scores_and_documents]
return sorted_documents[:top_k]
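For readers who want to try the new component outside a pipeline, here is a minimal usage sketch of the class added above; the query and document texts are placeholders for illustration and are not taken from this PR.

# Minimal usage sketch for SentenceTransformersRanker.predict(); example texts
# are illustrative placeholders only.
from haystack import Document
from haystack.ranker import SentenceTransformersRanker

ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

docs = [
    Document(text="Berlin is the capital of Germany."),
    Document(text="The Eiffel Tower is located in Paris."),
]

# Documents come back sorted by descending single-logit similarity to the query.
ranked = ranker.predict(query="What is the capital of Germany?", documents=docs, top_k=2)
print([d.text for d in ranked])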
12 changes: 12 additions & 0 deletions test/conftest.py
@@ -16,6 +16,7 @@

from haystack.document_store.milvus import MilvusDocumentStore
from haystack.generator.transformers import RAGenerator, RAGeneratorType
from haystack.ranker import FARMRanker, SentenceTransformersRanker

from haystack.retriever.sparse import ElasticsearchFilterOnlyRetriever, ElasticsearchRetriever, TfidfRetriever

@@ -263,6 +264,17 @@ def reader(request):
use_gpu=-1
)

@pytest.fixture(params=["farm", "sentencetransformers"], scope="module")
def ranker(request):
if request.param == "farm":
return FARMRanker(
model_name_or_path="deepset/gbert-base-germandpr-reranking"
)
if request.param == "sentencetransformers":
return SentenceTransformersRanker(
model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2",
)


# TODO Fix bug in test_no_answer_output when using
# @pytest.fixture(params=["farm", "transformers"])
65 changes: 65 additions & 0 deletions test/test_ranker.py
@@ -0,0 +1,65 @@
from haystack import Document
from haystack.ranker import FARMRanker
from haystack.ranker.base import BaseRanker
from haystack.ranker.sentence_transformers import SentenceTransformersRanker


def test_ranker(ranker):
assert isinstance(ranker, BaseRanker)

if isinstance(ranker, FARMRanker):
query = "Welches ist die zweitgrößte Stadt in den Alpen?"
docs = [
Document(
text="""Deçan\n\n== Geographie ==\nDeçan liegt im Westen des Kosovo auf etwa 550 Meter über Meer nahe den Grenzen zu Montenegro und Albanien. Westlich der Stadt liegt das Prokletije (auch ''Albanische Alpen'' genannt). Etwas nordwestlich tritt der Fluss Bistrica e Deçanit aus dem Gebirge, der Deçan nördlich des Zentrums passiert. Etwa zehn Kilometer im Südosten befindet sich der Radoniq-Stausee, welcher der zweitgrößte See im Land ist. Deçan befindet sich zirka auf halbem Weg zwischen Gjakova und Peja. Die Hauptstadt Pristina liegt rund 70 Kilometer im Osten.""",
meta={"name": "0"},
id="1",
),
Document(
text="""Alpen\n\n=== Städte ===\nInnerhalb der Alpen ist das französische Grenoble die größte Stadt, gefolgt von Innsbruck in Österreich sowie von Trient und Bozen in Italien. In der Schweiz liegen Chur, Thun und Lugano in den Alpen. Weitere Alpenstädte in Österreich sind Klagenfurt und Villach, sowie im Rheintal Bregenz, Dornbirn und Feldkirch. Ferner zu nennen ist Vaduz, die Hauptstadt Liechtensteins. Die höchste Stadt der Alpen (und Europas) ist das schweizerische Davos.\nIn direkter Alpenrandlage ist Wien die weitaus größte Stadt, gefolgt von Genf (Schweiz) und Nizza (Frankreich). Weitere wichtige Städte sind – von Ost nach West – Maribor (Slowenien), Graz (Österreich), Ljubljana (Slowenien), Udine (Italien), Salzburg (Österreich), Vicenza (Italien), Verona (Italien), Brescia (Italien), Bergamo (Italien), St. Gallen (Schweiz), Lecco (Italien), Como (Italien), Varese (Italien), Luzern (Schweiz), Savona (Italien), Biella (Italien), San Remo (Italien), Cuneo (Italien), Bern (Schweiz) und Monaco.""",
meta={"name": "1"},
id="2",
),
Document(
text="""Latumer_Bruch\nDer Latumer Bruch, lokal auch ''Lohbruch'' genannt, ist ein Bruchwald- und Feuchtgebiet im Südosten der Stadt Krefeld, welches unter gleichem Namen das zweitgrößte Naturschutzgebiet der Stadt bildet (Nr. ''KR-001'').\nDer Bruch liegt am südlichen Rand des Krefelder Stadtteils Linn. Im Nordwesten grenzt das Gebiet an Oppum, im Nordosten an Gellep-Stratum, im Südwesten und Südosten liegen die Meerbuscher Stadtteile Ossum-Bösinghoven und Lank-Latum. Benannt ist der Latumer Bruch nach dem Haus Latum, einem Gutshof am Ortsrand von Lank-Latum, zu dessen Ländereien das Gebiet historisch gehörte.""",
meta={"name": "2"},
id="3",
),
Document(
text="""Großglockner\n\n=== Lage und Umgebung ===\nDer Großglockner ist Teil des ''Glocknerkamms'', eines Gebirgskamms der Glocknergruppe (Österreichische Zentralalpen), der am Eiskögele in südöstlicher Richtung vom Alpenhauptkamm abzweigt und dort die Grenze zwischen den Bundesländern Tirol im Südwesten und Kärnten im Nordosten bildet. Diese Grenze ist auch die Wasserscheide zwischen dem Kalser Tal mit seinen Seitentälern auf der Osttiroler und dem Mölltal mit der Pasterze auf der Kärntner Seite. Die Gegend um den Berg ist außerdem seit 1986 Bestandteil des ''Sonderschutzgebietes Großglockner-Pasterze'' innerhalb des Nationalparks Hohe Tauern.\nDer Großglockner ist der höchste Berg der Alpen östlich der 175 km entfernten Ortlergruppe und weist damit nach dem Mont Blanc die zweitgrößte geografische Dominanz aller Berge der Alpen auf. Auch seine Schartenhöhe ist mit 2.424 Metern nach dem Montblanc die zweitgrößte aller Alpengipfel. Somit ist der Berg eine der eigenständigsten Erhebungen der Alpen. Die Aussicht vom Großglockner gilt als die weiteste aller Berge der Ostalpen, sie reicht 220 Kilometer weit, unter Berücksichtigung der terrestrischen Refraktion fast 240 Kilometer. Der Blick über mehr als 150.000 Quadratkilometer Erdoberfläche reicht bis zur Schwäbisch-Bayerischen Ebene im Nordwesten, bis Regensburg und zum Böhmerwald im Norden, zum Ortler im Westen, zur Poebene im Süden, zum Triglav und zum Toten Gebirge im Osten.\nDie bedeutendsten Orte in der Umgebung des Berges sind Kals am Großglockner () im Kalser Tal in Osttirol, vom Gipfel aus ungefähr acht Kilometer in südwestlicher Richtung gelegen, und Heiligenblut am Großglockner () im Mölltal in Kärnten, vom Gipfel aus ca. zwölf Kilometer in südöstlicher Richtung.""",
meta={"name": "3"},
id="4",
),
]
results = ranker.predict(query=query, documents=docs)
assert results[0] == docs[1]
elif isinstance(ranker, SentenceTransformersRanker):
query = "What is the most important building in King's Landing that has a religious background?"
docs = [
Document(
text="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""",
meta={"name": "0"},
id="1",
),
Document(
text="""Democratic Republic of the Congo to the south. Angola's capital, Luanda, lies on the Atlantic coast in the northwest of the country. Angola, although located in a tropical zone, has a climate that is not characterized for this region, due to the confluence of three factors: As a result, Angola's climate is characterized by two seasons: rainfall from October to April and drought, known as ""Cacimbo"", from May to August, drier, as the name implies, and with lower temperatures. On the other hand, while the coastline has high rainfall rates, decreasing from North to South and from to , with""",
id="2",
),
Document(
text="""Schopenhauer, describing him as an ultimately shallow thinker: ""Schopenhauer has quite a crude mind ... where real depth starts, his comes to an end."" His friend Bertrand Russell had a low opinion on the philosopher, and attacked him in his famous ""History of Western Philosophy"" for hypocritically praising asceticism yet not acting upon it. On the opposite isle of Russell on the foundations of mathematics, the Dutch mathematician L. E. J. Brouwer incorporated the ideas of Kant and Schopenhauer in intuitionism, where mathematics is considered a purely mental activity, instead of an analytic activity wherein objective properties of reality are""",
meta={"name": "1"},
id="3",
),
Document(
text="""The Dothraki vocabulary was created by David J. Peterson well in advance of the adaptation. HBO hired the Language Creatio""",
meta={"name": "2"},
id="4",
),
Document(
text="""The title of the episode refers to the Great Sept of Baelor, the main religious building in King's Landing, where the episode's pivotal scene takes place. In the world created by George R. R. Martin""",
meta={},
id="5",
),
]
results = ranker.predict(query=query, documents=docs)
assert results[0] == docs[4]
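Since predict_batch() is left as NotImplementedError in both rankers, re-ranking for several queries can, for now, be approximated by looping over predict(). The sketch below is a suggested workaround, not part of this PR; the dictionary keys and all query/document contents are illustrative assumptions.

# Sketch of a per-query loop as a stand-in for the unimplemented predict_batch();
# the "query"/"docs" keys and all texts are illustrative placeholders.
from haystack import Document
from haystack.ranker import SentenceTransformersRanker

ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

query_doc_list = [
    {"query": "Who wrote Faust?",
     "docs": [Document(text="Faust is a tragic play by Johann Wolfgang von Goethe."),
              Document(text="Berlin is the capital of Germany.")]},
]

results = [
    {"query": item["query"], "docs": ranker.predict(query=item["query"], documents=item["docs"])}
    for item in query_doc_list
]
print([d.text for d in results[0]["docs"]])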