Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix scoring in Elasticsearch for dot product #517

Merged
Merged 2 commits on Oct 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Run Elasticsearch
uses: elastic/elastic-github-actions/elasticsearch@25ad91e35aeee806711d335fc9dec7927ae49bc6
with:
stack-version: 7.6.0
stack-version: 7.9.2

- name: Run Apache Tika
run: docker run -d -p 9998:9998 apache/tika:1.24.1
Expand Down
5 changes: 3 additions & 2 deletions haystack/document_store/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,8 @@ def query_by_embedding(self,
"script_score": {
"query": {"match_all": {}},
"script": {
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1.0",
# offset score to ensure a positive range as required by Elasticsearch
"source": f"{self.similarity_fn_name}(params.query_vector,'{self.embedding_field}') + 1000",
"params": {
"query_vector": query_emb.tolist()
}
Expand Down Expand Up @@ -497,7 +498,7 @@ def _convert_es_hit_to_document(self, hit: dict, adapt_score_for_embedding: bool
score = hit["_score"] if hit["_score"] else None
if score:
if adapt_score_for_embedding:
score -= 1
score -= 1000
probability = (score + 1) / 2 # scaling probability from cosine similarity
else:
probability = float(expit(np.asarray(score / 8))) # scaling probability from TFIDF/BM25
Expand Down
8 changes: 4 additions & 4 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def elasticsearch_fixture():
shell=True
)
status = subprocess.run(
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.1'],
['docker run -d --name haystack_test_elastic -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'],
shell=True
)
if status.returncode:
Expand Down Expand Up @@ -160,12 +160,12 @@ def document_store(request, test_docs_xs, elasticsearch_fixture):
return get_document_store(request.param)


@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever(request, document_store):
return get_retriever(request.param, document_store)


@pytest.fixture(params=["es_filter_only", "elsticsearch", "dpr", "embedding", "tfidf"])
@pytest.fixture(params=["es_filter_only", "elasticsearch", "dpr", "embedding", "tfidf"])
def retriever_with_docs(request, document_store_with_docs):
return get_retriever(request.param, document_store_with_docs)

Expand Down Expand Up @@ -206,7 +206,7 @@ def get_retriever(retriever_type, document_store):
retriever = EmbeddingRetriever(document_store=document_store,
embedding_model="deepset/sentence_bert",
use_gpu=False)
elif retriever_type == "elsticsearch":
elif retriever_type == "elasticsearch":
retriever = ElasticsearchRetriever(document_store=document_store)
elif retriever_type == "es_filter_only":
retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)
Expand Down
4 changes: 2 additions & 2 deletions test/test_elastic_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):
res = retriever_with_docs.retrieve(query="Who lives in Berlin?")
assert res[0].text == "My name is Carla and I live in Berlin"
Expand All @@ -11,7 +11,7 @@ def test_elasticsearch_retrieval(retriever_with_docs, document_store_with_docs):


@pytest.mark.parametrize("document_store_with_docs", [("elasticsearch")], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
def test_elasticsearch_retrieval_filters(retriever_with_docs, document_store_with_docs):
res = retriever_with_docs.retrieve(query="Who lives in Berlin?", filters={"name": ["filename1"]})
assert res[0].text == "My name is Carla and I live in Berlin"
Expand Down
4 changes: 2 additions & 2 deletions test/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_eval_reader(reader, document_store: BaseDocumentStore):

@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("open_domain", [True, False])
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain, retriever):
# add eval data (SQUAD format)
document_store.delete_all_documents(index="test_eval_document")
Expand All @@ -81,7 +81,7 @@ def test_eval_elastic_retriever(document_store: BaseDocumentStore, open_domain,

@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
@pytest.mark.parametrize("retriever", ["elsticsearch"], indirect=True)
@pytest.mark.parametrize("retriever", ["elasticsearch"], indirect=True)
def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):
finder = Finder(reader=reader, retriever=retriever)

Expand Down
10 changes: 5 additions & 5 deletions tutorials/Tutorial4_FAQ_style_QA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
"outputs": [],
"source": [
"# Recommended: Start Elasticsearch using Docker\n",
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.6.2"
"# ! docker run -d -p 9200:9200 -e \"discovery.type=single-node\" elasticsearch:7.9.2"
]
},
{
Expand All @@ -81,13 +81,13 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",
Expand Down
2 changes: 1 addition & 1 deletion tutorials/Tutorial4_FAQ_style_QA.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
if LAUNCH_ELASTICSEARCH:
logging.info("Starting Elasticsearch ...")
status = subprocess.run(
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
)
if status.returncode:
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
Expand Down
8 changes: 4 additions & 4 deletions tutorials/Tutorial5_Evaluation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@
"outputs": [],
"source": [
"# In Colab / No Docker environments: Start Elasticsearch from source\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.6.2\n",
"! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q\n",
"! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz\n",
"! chown -R daemon:daemon elasticsearch-7.9.2\n",
"\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],\n",
"es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],\n",
" stdout=PIPE, stderr=STDOUT,\n",
" preexec_fn=lambda: os.setuid(1) # as daemon\n",
" )\n",
Expand Down
2 changes: 1 addition & 1 deletion tutorials/Tutorial5_Evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
if LAUNCH_ELASTICSEARCH:
logging.info("Starting Elasticsearch ...")
status = subprocess.run(
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2'], shell=True
['docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2'], shell=True
)
if status.returncode:
raise Exception("Failed to launch Elasticsearch. If you want to connect to an existing Elasticsearch instance"
Expand Down