deepset-ai · ArzelaAscoIi · Jan 3, 2022 · Dec 18, 2021 · Dec 18, 2021 · Dec 21, 2021
diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md
@@ -13,7 +13,7 @@ class Document()
 #### \_\_init\_\_
 
 ```python
- | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None)
+ | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None, id_hash_from: Optional[List[Literal["content", "meta"]]] = None)
 ```
 
 One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
@@ -71,7 +71,7 @@ dict with content of the Document
 
 ```python
  | @classmethod
- | from_dict(cls, dict, field_map={})
+ | from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None)
 ```
 
 Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the

diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py
@@ -7,6 +7,11 @@
 from abc import abstractmethod
 from pathlib import Path
 
+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal #type: ignore
+
 from haystack.schema import Document, Label, MultiLabel
 from haystack.nodes.base import BaseComponent
 from haystack.errors import DuplicateDocumentError
@@ -303,9 +308,15 @@ def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]]
     @abstractmethod
     def delete_labels(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None):
         pass
+
+    @abstractmethod
+    def _create_document_field_map(self) -> Dict:
+        pass  # implemented by subclasses; `run` passes the returned dict to `Document.from_dict` as `field_map`
 
-    def run(self, documents: List[dict], index: Optional[str] = None):  # type: ignore
-        self.write_documents(documents=documents, index=index)
+    def run(self, documents: List[dict], index: Optional[str] = None, id_hash_from: Optional[List[Literal["content", "meta"]]] = None):  # type: ignore
+        field_map = self._create_document_field_map()  # store-specific key names, forwarded to Document.from_dict
+        doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_from=id_hash_from) for d in documents]  # dicts -> Documents
+        self.write_documents(documents=doc_objects, index=index)
         return {}, "output_1"
 
     @abstractmethod

diff --git a/haystack/document_stores/graphdb.py b/haystack/document_stores/graphdb.py
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Dict
 
 import requests
 from pathlib import Path
@@ -118,6 +118,12 @@ def get_all_predicates(self, index: Optional[str] = None):
         results = self.query(sparql_query=sparql_query, index=index)
         return results
 
+    def _create_document_field_map(self)->Dict:
+        """
+        Return the field map for this store: an empty dict, because there is no field mapping required.
+        """
+        return {}
+
     def get_all_objects(self, index: Optional[str] = None):
         """
         Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered.

diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py
@@ -210,6 +210,12 @@ def get_all_documents_generator(
             batch_size=batch_size,
         )
         yield from result
+
+    def _create_document_field_map(self)->Dict:
+        """
+        Return the field map for this store: an empty dict, because there is no field mapping required.
+        """
+        return {}
 
     def _query(
         self,

diff --git a/haystack/schema.py b/haystack/schema.py
@@ -55,7 +55,8 @@ def __init__(
             score: Optional[float] = None,
             meta: Dict[str, Any] = None,
             embedding: Optional[np.ndarray] = None,
-            id_hash_keys: Optional[List[str]] = None
+            id_hash_keys: Optional[List[str]] = None,
+            id_hash_from:  Optional[List[Literal["content", "meta"]]] = None
     ):
         """
         One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
@@ -91,6 +92,8 @@ def __init__(
         self.content_type = content_type
         self.score = score
         self.meta = meta or {}
+        self.id_hash_keys = id_hash_keys
+        self.id_hash_from = id_hash_from
 
         if embedding is not None:
             embedding = np.asarray(embedding)
@@ -100,11 +103,37 @@ def __init__(
         if id:
             self.id: str = str(id)
         else:
-            self.id: str = self._get_id(id_hash_keys)
+            self.id: str = self._get_id(id_hash_keys=id_hash_keys, id_hash_from=id_hash_from)
 
-    def _get_id(self, id_hash_keys):
-        final_hash_key = ":".join(id_hash_keys) if id_hash_keys else str(self.content)
-        return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
+
+    def _get_id(self,
+        id_hash_keys: Optional[List[str]] = None,
+        id_hash_from: Optional[List[Literal["content", "meta"]]] = None
+    ) -> str:
+        """
+        Generate the id of a document by hashing a string. By default the content of the document is
+        hashed. There are two ways of modifying the generated id: static keys (`id_hash_keys`) and/or a
+        dynamic selection of the document's fields (`id_hash_from`).
+        :param id_hash_keys: Optional list of strings that are used to generate the hash.
+        :param id_hash_from: Optional list of fields that should be dynamically used to generate the hash.
+        :raises ValueError: if nothing remains to hash, e.g. `id_hash_from` names neither "content" nor "meta".
+        """
+        # An empty `id_hash_keys` list falls back to the content hash, as it did before `id_hash_from` existed.
+        if not id_hash_keys and id_hash_from is None:
+            return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False))
+
+        final_hash_key = ""
+        if id_hash_keys is not None:
+            final_hash_key += ":".join(id_hash_keys)
+
+        if id_hash_from is not None:
+            if "content" in id_hash_from:
+                final_hash_key += ":" + str(self.content)
+            if "meta" in id_hash_from:
+                final_hash_key += ":" + str(self.meta)
+        if final_hash_key == "":
+            raise ValueError("Can't create 'Document': 'id_hash_from' must contain at least one of ['content', 'meta']")
+        return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False))
 
     def to_dict(self, field_map={}) -> Dict:
         """
@@ -131,7 +160,7 @@ def to_dict(self, field_map={}) -> Dict:
         return _doc
 
     @classmethod
-    def from_dict(cls, dict, field_map={}):
+    def from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None):
         """
         Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
         input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
@@ -160,6 +189,10 @@ def from_dict(cls, dict, field_map={}):
             elif k in field_map:
                 k = field_map[k]
                 _new_doc[k] = v
+
+        if _doc.get("id") is None:
+            _new_doc["id_hash_keys"]=id_hash_keys
+            _new_doc["id_hash_from"]=id_hash_from
 
         # Convert list of rows to pd.DataFrame
         if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):