Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom id hashing on documentstore level #1910

Merged
merged 16 commits into from
Jan 3, 2022
Merged
20 changes: 20 additions & 0 deletions docs/_src/api/api/document_store.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,26 @@ from disk and also indexed batchwise to the DocumentStore in order to prevent ou
- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the
same question might be found in different contexts.

<a name="base.BaseDocumentStore.run"></a>
#### run

```python
| run(documents: List[dict], index: Optional[str] = None, id_hash_keys: Optional[List[str]] = None)
```

Run requests of document stores

Comment: We will gradually introduce the primitives. The doument stores also accept dicts and parse them to documents.
In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function
is therefore only an interim solution until the run function also accepts documents.

**Arguments**:

- `documents`: A list of dicts that are documents.
- `index`: Optional name of index where the documents shall be written to.
If None, the DocumentStore's default index (self.index) will be used.
- `id_hash_keys`: List of the fields that the hashes of the ids are generated from.

<a name="base.get_batches_from_generator"></a>
#### get\_batches\_from\_generator

Expand Down
7 changes: 4 additions & 3 deletions docs/_src/api/api/primitives.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
In the range of [0,1], where 1 means extremely relevant.
- `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
- `embedding`: Vector encoding of the text
- `id_hash_keys`: Generate the document id from a custom list of strings.
- `id_hash_keys`: Generate the document id from a custom list of strings that refere to the documents attributes.
If you want ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"].
not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]).
In this case the id will be generated by using the content and the defined metadata.

<a name="schema.Document.to_dict"></a>
#### to\_dict
Expand Down Expand Up @@ -71,7 +72,7 @@ dict with content of the Document

```python
| @classmethod
| from_dict(cls, dict, field_map={})
| from_dict(cls, dict, field_map={}, id_hash_keys=None)
```

Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
Expand Down
28 changes: 26 additions & 2 deletions haystack/document_stores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
from abc import abstractmethod
from pathlib import Path

try:
from typing import Literal
except ImportError:
from typing_extensions import Literal #type: ignore

from haystack.schema import Document, Label, MultiLabel
from haystack.nodes.base import BaseComponent
from haystack.errors import DuplicateDocumentError
Expand Down Expand Up @@ -303,9 +308,28 @@ def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]]
@abstractmethod
def delete_labels(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None):
pass

@abstractmethod
def _create_document_field_map(self) -> Dict:
pass

def run(self, documents: List[dict], index: Optional[str] = None, id_hash_keys: Optional[List[str]] = None ): # type: ignore
"""
Run requests of document stores

Comment: We will gradually introduce the primitives. The doument stores also accept dicts and parse them to documents.
In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function
is therefore only an interim solution until the run function also accepts documents.

def run(self, documents: List[dict], index: Optional[str] = None): # type: ignore
self.write_documents(documents=documents, index=index)
:param documents: A list of dicts that are documents.
:param index: Optional name of index where the documents shall be written to.
If None, the DocumentStore's default index (self.index) will be used.
:param id_hash_keys: List of the fields that the hashes of the ids are generated from.
"""

field_map = self._create_document_field_map()
doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_keys=id_hash_keys) for d in documents]
self.write_documents(documents=doc_objects, index=index)
return {}, "output_1"

@abstractmethod
Expand Down
8 changes: 7 additions & 1 deletion haystack/document_stores/graphdb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Dict

import requests
from pathlib import Path
Expand Down Expand Up @@ -118,6 +118,12 @@ def get_all_predicates(self, index: Optional[str] = None):
results = self.query(sparql_query=sparql_query, index=index)
return results

def _create_document_field_map(self)->Dict:
"""
There is no field mapping required
"""
return {}

def get_all_objects(self, index: Optional[str] = None):
"""
Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered.
Expand Down
6 changes: 6 additions & 0 deletions haystack/document_stores/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ def get_all_documents_generator(
batch_size=batch_size,
)
yield from result

def _create_document_field_map(self)->Dict:
"""
There is no field mapping required
"""
return {}

def _query(
self,
Expand Down
44 changes: 37 additions & 7 deletions haystack/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,10 @@ def __init__(
In the range of [0,1], where 1 means extremely relevant.
:param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
:param embedding: Vector encoding of the text
:param id_hash_keys: Generate the document id from a custom list of strings.
:param id_hash_keys: Generate the document id from a custom list of strings that refere to the documents attributes.
If you want ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"].
not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]).
In this case the id will be generated by using the content and the defined metadata.
"""

if content is None:
Expand All @@ -92,6 +93,13 @@ def __init__(
self.score = score
self.meta = meta or {}

allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ]
self.id_hash_keys = id_hash_keys
if self.id_hash_keys != None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think is not is preferred over != when comparing to None

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is a brief explanation of the reasons: https://stackoverflow.com/a/2209781

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this line is still failing with mypy I'd say we put a # type: ignore here at the end of the line and maybe another comment explaining that mypy fails because it thinks set(None) might be called.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 I added a comment at that line :)

raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)")


if embedding is not None:
embedding = np.asarray(embedding)
self.embedding = embedding
Expand All @@ -100,11 +108,30 @@ def __init__(
if id:
self.id: str = str(id)
else:
self.id: str = self._get_id(id_hash_keys)
self.id: str = self._get_id(id_hash_keys=id_hash_keys)


def _get_id(self,
id_hash_keys: Optional[List[str]] = None
):
"""
Generate the id of a document by creating the hash of strings. By default the content of a document is
used to generate the hash. There are two ways of modifying the generated id of a document. Either static keys
or a selection of the content.
:param id_hash_keys: Optional list of fields that should be dynamically used to generate the hash.
"""

if id_hash_keys is None:
return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False))

final_hash_key = ""
for attr in id_hash_keys:
final_hash_key += ":" + str(getattr(self,attr))

if final_hash_key == "":
raise ValueError(f"Cant't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta']")

def _get_id(self, id_hash_keys):
final_hash_key = ":".join(id_hash_keys) if id_hash_keys else str(self.content)
return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))

def to_dict(self, field_map={}) -> Dict:
"""
Expand All @@ -131,7 +158,7 @@ def to_dict(self, field_map={}) -> Dict:
return _doc

@classmethod
def from_dict(cls, dict, field_map={}):
def from_dict(cls, dict, field_map={}, id_hash_keys=None):
"""
Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
Expand Down Expand Up @@ -160,6 +187,9 @@ def from_dict(cls, dict, field_map={}):
elif k in field_map:
k = field_map[k]
_new_doc[k] = v

if _doc.get("id") is None:
_new_doc["id_hash_keys"]=id_hash_keys

# Convert list of rows to pd.DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
Expand Down
51 changes: 31 additions & 20 deletions test/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,56 +37,57 @@ def test_init_elastic_client():


def test_write_with_duplicate_doc_ids(document_store):
documents = [
duplicate_documents = [
Document(
content="Doc1",
id_hash_keys=["key1"]
id_hash_keys=["content"]
),
Document(
content="Doc2",
id_hash_keys=["key1"]
content="Doc1",
id_hash_keys=["content"]
)
]
document_store.write_documents(documents, duplicate_documents="skip")
document_store.write_documents(duplicate_documents, duplicate_documents="skip")
assert len(document_store.get_all_documents()) == 1
with pytest.raises(Exception):
document_store.write_documents(documents, duplicate_documents="fail")
document_store.write_documents(duplicate_documents, duplicate_documents="fail")


@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True)
def test_write_with_duplicate_doc_ids_custom_index(document_store):
documents = [
duplicate_documents = [
Document(
content="Doc1",
id_hash_keys=["key1"]
id_hash_keys=["content"]
),
Document(
content="Doc2",
id_hash_keys=["key1"]
content="Doc1",
id_hash_keys=["content"]
)
]
document_store.delete_documents(index="haystack_custom_test")
document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="skip")
document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="skip")
assert len(document_store.get_all_documents()) == 1
with pytest.raises(DuplicateDocumentError):
document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="fail")
document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="fail")

# Weaviate manipulates document objects in-place when writing them to an index.
# It generates a uuid based on the provided id and the index name where the document is added to.
# We need to get rid of these generated uuids for this test and therefore reset the document objects.
# As a result, the documents will receive a fresh uuid based on their id_hash_keys and a different index name.
if isinstance(document_store, WeaviateDocumentStore):
documents = [
duplicate_documents = [
Document(
content="Doc1",
id_hash_keys=["key1"]
id_hash_keys=["content"]
),
Document(
content="Doc2",
id_hash_keys=["key1"]
content="Doc1",
id_hash_keys=["content"]
)
]
# writing to the default, empty index should still work
document_store.write_documents(documents, duplicate_documents="fail")
document_store.write_documents(duplicate_documents, duplicate_documents="fail")


def test_get_all_documents_without_filters(document_store_with_docs):
Expand All @@ -102,17 +103,17 @@ def test_get_all_document_filter_duplicate_text_value(document_store):
Document(
content="Doc1",
meta={"f1": "0"},
id_hash_keys=["Doc1", "1"]
id_hash_keys=["meta"]
),
Document(
content="Doc1",
meta={"f1": "1", "meta_id": "0"},
id_hash_keys=["Doc1", "2"]
id_hash_keys=["meta"]
),
Document(
content="Doc2",
meta={"f3": "0"},
id_hash_keys=["Doc2", "3"]
id_hash_keys=["meta"]
)
]
document_store.write_documents(documents)
Expand All @@ -121,6 +122,16 @@ def test_get_all_document_filter_duplicate_text_value(document_store):
assert len(documents) == 1
assert {d.meta["meta_id"] for d in documents} == {"0"}

documents = document_store.get_all_documents(filters={"f1": ["0"]})
assert documents[0].content == "Doc1"
assert len(documents) == 1
assert {d.meta["meta_id"] for d in documents} == {}

documents = document_store.get_all_documents(filters={"f3": ["0"]})
assert documents[0].content == "Doc2"
assert len(documents) == 1
assert {d.meta["meta_id"] for d in documents} == {}


def test_get_all_documents_with_correct_filters(document_store_with_docs):
documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
Expand Down
18 changes: 14 additions & 4 deletions test/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from haystack.schema import Document, Label, Answer, Span
import pytest
import numpy as np

LABELS = [
Expand Down Expand Up @@ -152,9 +153,18 @@ def test_generate_doc_id_using_custom_list():
text1 = "text1"
text2 = "text2"

doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1])
doc2_text1 = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1])
doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2])
doc1_meta1_id_by_content = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"])
doc1_meta2_id_by_content = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content"])
assert doc1_meta1_id_by_content.id == doc1_meta2_id_by_content.id

assert doc1_text1.id == doc2_text1.id
doc1_meta1_id_by_content_and_meta = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content","meta"])
doc1_meta2_id_by_content_and_meta = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content", "meta"])
assert doc1_meta1_id_by_content_and_meta.id != doc1_meta2_id_by_content_and_meta.id

doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"])
doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["content"])
assert doc1_text1.id != doc3_text2.id


with pytest.raises(ValueError):
_ = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content","non_existing_field"])