Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom id hashing on documentstore level #1910

Merged
merged 16 commits into from
Jan 3, 2022
Merged
4 changes: 2 additions & 2 deletions docs/_src/api/api/primitives.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class Document()
#### \_\_init\_\_

```python
| __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None)
| __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None, id_hash_from: Optional[List[Literal["content", "meta"]]] = None)
```

One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
Expand Down Expand Up @@ -71,7 +71,7 @@ dict with content of the Document

```python
| @classmethod
| from_dict(cls, dict, field_map={})
| from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None)
```

Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
Expand Down
15 changes: 13 additions & 2 deletions haystack/document_stores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
from abc import abstractmethod
from pathlib import Path

try:
from typing import Literal
except ImportError:
from typing_extensions import Literal #type: ignore

from haystack.schema import Document, Label, MultiLabel
from haystack.nodes.base import BaseComponent
from haystack.errors import DuplicateDocumentError
Expand Down Expand Up @@ -303,9 +308,15 @@ def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]]
@abstractmethod
def delete_labels(
    self,
    index: Optional[str] = None,
    ids: Optional[List[str]] = None,
    filters: Optional[Dict[str, List[str]]] = None,
):
    """
    Abstract hook for deleting labels; the base class provides no behaviour and returns None.

    :param index: Optional index name. NOTE(review): presumably the index holding the labels - confirm
                  against the concrete document stores.
    :param ids: Optional list of ids. NOTE(review): presumably restricts the deletion to these labels - confirm.
    :param filters: Optional mapping of field name to list of values. NOTE(review): presumably narrows the
                    labels to delete - confirm.
    """

@abstractmethod
def _create_document_field_map(self) -> Dict:
pass

def run(self, documents: List[dict], index: Optional[str] = None): # type: ignore
self.write_documents(documents=documents, index=index)
def run(self, documents: List[dict], index: Optional[str] = None, id_hash_from: Optional[List[Literal["content", "meta"]]] = None):  # type: ignore
    """
    Convert the incoming dicts to Document objects and write them to the document store.

    :param documents: List of dicts; each one is turned into a Document via `Document.from_dict`, using the
                      field map returned by `_create_document_field_map`.
    :param index: Optional index name, forwarded unchanged to `write_documents`.
    :param id_hash_from: Optional list of Document fields ("content" and/or "meta") that is forwarded to
                         `Document.from_dict` for generating the ids of documents that come without one.
                         This is a list, matching the parameter of the same name on `Document`.
    :return: Tuple of an empty dict and the output edge name "output_1".
    """
    field_map = self._create_document_field_map()
    doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_from=id_hash_from) for d in documents]
    self.write_documents(documents=doc_objects, index=index)
    return {}, "output_1"

@abstractmethod
Expand Down
8 changes: 7 additions & 1 deletion haystack/document_stores/graphdb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Dict

import requests
from pathlib import Path
Expand Down Expand Up @@ -118,6 +118,12 @@ def get_all_predicates(self, index: Optional[str] = None):
results = self.query(sparql_query=sparql_query, index=index)
return results

def _create_document_field_map(self)->Dict:
"""
There is no field mapping required
"""
return {}

def get_all_objects(self, index: Optional[str] = None):
"""
Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered.
Expand Down
6 changes: 6 additions & 0 deletions haystack/document_stores/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ def get_all_documents_generator(
batch_size=batch_size,
)
yield from result

def _create_document_field_map(self)->Dict:
"""
There is no field mapping required
"""
return {}

def _query(
self,
Expand Down
45 changes: 39 additions & 6 deletions haystack/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ def __init__(
score: Optional[float] = None,
meta: Dict[str, Any] = None,
embedding: Optional[np.ndarray] = None,
id_hash_keys: Optional[List[str]] = None
id_hash_keys: Optional[List[str]] = None,
id_hash_from: Optional[List[Literal["content", "meta"]]] = None
):
"""
One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
Expand Down Expand Up @@ -91,6 +92,8 @@ def __init__(
self.content_type = content_type
self.score = score
self.meta = meta or {}
self.id_hash_keys = id_hash_keys
self.id_hash_from = id_hash_from

if embedding is not None:
embedding = np.asarray(embedding)
Expand All @@ -100,11 +103,37 @@ def __init__(
if id:
self.id: str = str(id)
else:
self.id: str = self._get_id(id_hash_keys)
self.id: str = self._get_id(id_hash_keys=id_hash_keys, id_hash_from=id_hash_from)

def _get_id(self, id_hash_keys):
    """
    Build the document id as the hex form of an unsigned 128-bit mmh3 hash.

    :param id_hash_keys: Optional list of strings. When non-empty, the strings joined by ":" are hashed;
                         otherwise the string form of the document content is hashed.
    """
    if id_hash_keys:
        hash_input = ":".join(id_hash_keys)
    else:
        hash_input = str(self.content)
    digest = mmh3.hash128(hash_input, signed=False)
    return format(digest, '02x')

def _get_id(self,
id_hash_keys: Optional[List[str]] = None,
id_hash_from: Optional[List[Literal["content", "meta"]]] = None
):
"""
Generate the id of a document by creating the hash of strings. By default the content of a document is
used to generate the hash. There are two ways of modifying the generated id of a document. Either static keys
or a selection of the content.
:param id_hash_keys: Optional list of strings that are used to generate the hash.
:param id_hash_from: Optional list of fields that should be dynamically used to generate the hash.
"""

if id_hash_keys is None and id_hash_from is None:
return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False))

final_hash_key = ""
if id_hash_keys is not None:
final_hash_key += ":".join(id_hash_keys)

if id_hash_from is not None:
if "content" in id_hash_from:
final_hash_key += ":"+ str(self.content)
if "meta" in id_hash_from:
final_hash_key += ":"+ str(self.meta)

if final_hash_key == "":
raise ValueError(f"Cant't create 'Document': 'id_hash_from' must contain at least one of ['content', 'meta']")
return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))

def to_dict(self, field_map={}) -> Dict:
"""
Expand All @@ -131,7 +160,7 @@ def to_dict(self, field_map={}) -> Dict:
return _doc

@classmethod
def from_dict(cls, dict, field_map={}):
def from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None):
"""
Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
Expand Down Expand Up @@ -160,6 +189,10 @@ def from_dict(cls, dict, field_map={}):
elif k in field_map:
k = field_map[k]
_new_doc[k] = v

if _doc.get("id") is None:
_new_doc["id_hash_keys"]=id_hash_keys
_new_doc["id_hash_from"]=id_hash_from

# Convert list of rows to pd.DataFrame
if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
Expand Down