Skip to content

Commit

Permalink
Fixed the Search Field mapping in ElasticSearch DocumentStore (#2080)
Browse files Browse the repository at this point in the history
* Review changes

* Added the synonym analyser for search fields

* Added the review requests.

* Added the synonyms to the OpenSearchDocumentStore and review requests.
  • Loading branch information
SjSnowball authored Jan 31, 2022
1 parent bbb65a1 commit 7d769d8
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 3 deletions.
27 changes: 25 additions & 2 deletions haystack/document_stores/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st
"mappings": {
"properties": {
self.name_field: {"type": "keyword"},
self.content_field: {"type": "text"},
self.content_field: {"type": "text"}
},
"dynamic_templates": [
{
Expand All @@ -301,13 +301,21 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st
}
}
}

if self.synonyms:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text", "analyzer": "synonym"}})
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}

mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
"filter": ["lowercase",
"synonym"]}
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}

else:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text"}})

if self.embedding_field:
mapping["mappings"]["properties"][self.embedding_field] = {"type": "dense_vector", "dims": self.embedding_dim}

Expand Down Expand Up @@ -1353,7 +1361,7 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st
"mappings": {
"properties": {
self.name_field: {"type": "keyword"},
self.content_field: {"type": "text"},
self.content_field: {"type": "text"}
},
"dynamic_templates": [
{
Expand All @@ -1373,6 +1381,21 @@ def _create_document_index(self, index_name: str, headers: Optional[Dict[str, st
}
}
}

if self.synonyms:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text", "analyzer": "synonym"}})
mapping["mappings"]["properties"][self.content_field] = {"type": "text", "analyzer": "synonym"}

mapping["settings"]["analysis"]["analyzer"]["synonym"] = {"tokenizer": "whitespace",
"filter": ["lowercase",
"synonym"]}
mapping["settings"]["analysis"]["filter"] = {"synonym": {"type": self.synonym_type, "synonyms": self.synonyms}}

else:
for field in self.search_fields:
mapping["mappings"]["properties"].update({field: {"type": "text"}})

if self.embedding_field:

if self.similarity == "cosine":
Expand Down
28 changes: 27 additions & 1 deletion test/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1196,4 +1196,30 @@ def test_DeepsetCloudDocumentStore_query_by_embedding(deepset_cloud_document_sto
responses.add_passthru(DC_API_ENDPOINT)

emb_docs = deepset_cloud_document_store.query_by_embedding(query_emb)
assert len(emb_docs) == 0
assert len(emb_docs) == 0


@pytest.mark.elasticsearch
def test_elasticsearch_search_field_mapping():
    """Check that the configured search fields are mapped as ``text``.

    The test drops any existing ``haystack_search_field_mapping`` index,
    writes four documents through an ``ElasticsearchDocumentStore`` created
    with ``search_fields=["content", "sub_content"]`` and
    ``content_field="title"``, then reads the index mapping back through a
    raw client and asserts the mapped type of both search fields.
    """
    index_name = "haystack_search_field_mapping"
    mapped_fields = ("content", "sub_content")

    es_client = Elasticsearch()
    # Start from a clean slate; ignore=[404] presumably tolerates the index
    # not existing yet.
    es_client.indices.delete(index=index_name, ignore=[404])

    # One row per document: (id, title, content, sub_content).
    rows = [
        (
            "1",
            "Green tea components",
            "The green tea plant contains a range of healthy compounds that make it into the final drink",
            "Drink tip",
        ),
        (
            "2",
            "Green tea catechin",
            "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).",
            "Ingredients tip",
        ),
        (
            "3",
            "Minerals in Green tea",
            "Green tea also has small amounts of minerals that can benefit your health.",
            "Minerals tip",
        ),
        (
            "4",
            "Green tea Benefits",
            "Green tea does more than just keep you alert, it may also help boost brain function.",
            "Health tip",
        ),
    ]
    documents = [
        {
            "title": title,
            "meta": {"content": content, "sub_content": sub_content},
            "id": doc_id,
        }
        for doc_id, title, content, sub_content in rows
    ]

    store = ElasticsearchDocumentStore(
        index=index_name,
        search_fields=list(mapped_fields),
        content_field="title",
    )
    store.write_documents(documents)

    mapping_response = es_client.indices.get_mapping(index=index_name)

    for field_name in mapped_fields:
        field_mapping = mapping_response[index_name]["mappings"]["properties"][field_name]
        assert field_mapping["type"] == "text"

0 comments on commit 7d769d8

Please sign in to comment.