From bf91c19aa4c1b322a44556f4adc92914b60a0b7a Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Sat, 18 Dec 2021 22:34:21 +0100 Subject: [PATCH 01/15] adding dynamic id hashing --- haystack/document_stores/base.py | 15 ++++++++-- haystack/document_stores/graphdb.py | 8 ++++- haystack/document_stores/sql.py | 6 ++++ haystack/schema.py | 45 +++++++++++++++++++++++++---- 4 files changed, 65 insertions(+), 9 deletions(-) diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index 541e7655bd..e3d45ec28e 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -7,6 +7,11 @@ from abc import abstractmethod from pathlib import Path +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal #type: ignore + from haystack.schema import Document, Label, MultiLabel from haystack.nodes.base import BaseComponent from haystack.errors import DuplicateDocumentError @@ -303,9 +308,15 @@ def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]] @abstractmethod def delete_labels(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, List[str]]] = None): pass + + @abstractmethod + def _create_document_field_map(self) -> Dict: + pass - def run(self, documents: List[dict], index: Optional[str] = None): # type: ignore - self.write_documents(documents=documents, index=index) + def run(self, documents: List[dict], index: Optional[str] = None, id_hash_from: Optional[Literal["content", "meta"]] = None ): # type: ignore + field_map = self._create_document_field_map() + doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_from=id_hash_from) for d in documents] + self.write_documents(documents=doc_objects, index=index) return {}, "output_1" @abstractmethod diff --git a/haystack/document_stores/graphdb.py b/haystack/document_stores/graphdb.py index f6c9d6402a..7e35242357 100644 --- a/haystack/document_stores/graphdb.py +++ 
b/haystack/document_stores/graphdb.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Dict import requests from pathlib import Path @@ -118,6 +118,12 @@ def get_all_predicates(self, index: Optional[str] = None): results = self.query(sparql_query=sparql_query, index=index) return results + def _create_document_field_map(self)->Dict: + """ + There is no field mapping required + """ + return {} + def get_all_objects(self, index: Optional[str] = None): """ Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered. diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index 014c31d13d..bfaf3bd819 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -210,6 +210,12 @@ def get_all_documents_generator( batch_size=batch_size, ) yield from result + + def _create_document_field_map(self)->Dict: + """ + There is no field mapping required + """ + return {} def _query( self, diff --git a/haystack/schema.py b/haystack/schema.py index 51dd85a70f..0c345e3e7f 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -55,7 +55,8 @@ def __init__( score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, - id_hash_keys: Optional[List[str]] = None + id_hash_keys: Optional[List[str]] = None, + id_hash_from: Optional[List[Literal["content", "meta"]]] = None ): """ One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. 
@@ -91,6 +92,8 @@ def __init__( self.content_type = content_type self.score = score self.meta = meta or {} + self.id_hash_keys = id_hash_keys + self.id_hash_from = id_hash_from if embedding is not None: embedding = np.asarray(embedding) @@ -100,11 +103,37 @@ def __init__( if id: self.id: str = str(id) else: - self.id: str = self._get_id(id_hash_keys) + self.id: str = self._get_id(id_hash_keys=id_hash_keys, id_hash_from=id_hash_from) - def _get_id(self, id_hash_keys): - final_hash_key = ":".join(id_hash_keys) if id_hash_keys else str(self.content) - return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) + + def _get_id(self, + id_hash_keys: Optional[List[str]] = None, + id_hash_from: Optional[List[Literal["content", "meta"]]] = None + ): + """ + Generate the id of a document by creating the hash of strings. By default the content of a document is + used to generate the hash. There are two ways of modifying the generated id of a document. Either static keys + or a selection of the content. + :param id_hash_keys: Optional list of strings that are used to generate the hash. + :param id_hash_from: Optional list of fields that should be dynamically used to generate the hash. 
+ """ + + if id_hash_keys is None and id_hash_from is None: + return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False)) + + final_hash_key = "" + if id_hash_keys is not None: + final_hash_key += ":".join(id_hash_keys) + + if id_hash_from is not None: + if "content" in id_hash_from: + final_hash_key += ":"+ str(self.content) + if "meta" in id_hash_from: + final_hash_key += ":"+ str(self.meta) + + if final_hash_key == "": + raise ValueError(f"Cant't create 'Document': 'id_hash_from' must contain at least one of ['content', 'meta']") + return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) def to_dict(self, field_map={}) -> Dict: """ @@ -131,7 +160,7 @@ def to_dict(self, field_map={}) -> Dict: return _doc @classmethod - def from_dict(cls, dict, field_map={}): + def from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None): """ Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that @@ -160,6 +189,10 @@ def from_dict(cls, dict, field_map={}): elif k in field_map: k = field_map[k] _new_doc[k] = v + + if _doc.get("id") is None: + _new_doc["id_hash_keys"]=id_hash_keys + _new_doc["id_hash_from"]=id_hash_from # Convert list of rows to pd.DataFrame if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list): From d8739dee9b4e1996a77a6fd15e85dd347d1c0f70 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 18 Dec 2021 21:35:16 +0000 Subject: [PATCH 02/15] Add latest docstring and tutorial changes --- docs/_src/api/api/primitives.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md index 95234e0460..96bce5dc9e 100644 --- a/docs/_src/api/api/primitives.md +++ b/docs/_src/api/api/primitives.md @@ 
-13,7 +13,7 @@ class Document() #### \_\_init\_\_ ```python - | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None) + | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None, id_hash_from: Optional[List[Literal["content", "meta"]]] = None) ``` One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. @@ -71,7 +71,7 @@ dict with content of the Document ```python | @classmethod - | from_dict(cls, dict, field_map={}) + | from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None) ``` Create Document from dict. 
An optional field_map can be supplied to adjust for custom names of the keys in the From 0829f421f5b4234c9dd4d35c2e33ef529ac99273 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 12:28:07 +0100 Subject: [PATCH 03/15] added pr review --- haystack/document_stores/base.py | 17 ++++++++++++-- haystack/schema.py | 40 ++++++++++++++------------------ test/test_schema.py | 18 ++++++++++---- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/haystack/document_stores/base.py b/haystack/document_stores/base.py index e3d45ec28e..4ddf293b75 100644 --- a/haystack/document_stores/base.py +++ b/haystack/document_stores/base.py @@ -313,9 +313,22 @@ def delete_labels(self, index: Optional[str] = None, ids: Optional[List[str]] = def _create_document_field_map(self) -> Dict: pass - def run(self, documents: List[dict], index: Optional[str] = None, id_hash_from: Optional[Literal["content", "meta"]] = None ): # type: ignore + def run(self, documents: List[dict], index: Optional[str] = None, id_hash_keys: Optional[List[str]] = None ): # type: ignore + """ + Run requests of document stores + + Comment: We will gradually introduce the primitives. The document stores also accept dicts and parse them to documents. + In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function + is therefore only an interim solution until the run function also accepts documents. + + :param documents: A list of dicts that are documents. + :param index: Optional name of index where the documents shall be written to. + If None, the DocumentStore's default index (self.index) will be used. + :param id_hash_keys: List of the fields that the hashes of the ids are generated from. 
+ """ + field_map = self._create_document_field_map() - doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_from=id_hash_from) for d in documents] + doc_objects = [Document.from_dict(d, field_map=field_map, id_hash_keys=id_hash_keys) for d in documents] self.write_documents(documents=doc_objects, index=index) return {}, "output_1" diff --git a/haystack/schema.py b/haystack/schema.py index 0c345e3e7f..1f79539863 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -55,8 +55,7 @@ def __init__( score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, - id_hash_keys: Optional[List[str]] = None, - id_hash_from: Optional[List[Literal["content", "meta"]]] = None + id_hash_keys: Optional[List[str]] = None ): """ One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. @@ -80,9 +79,10 @@ def __init__( In the range of [0,1], where 1 means extremely relevant. :param meta: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed). :param embedding: Vector encoding of the text - :param id_hash_keys: Generate the document id from a custom list of strings. + :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's attributes. If you want ensure you don't have duplicate documents in your DocumentStore but texts are - not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]. + not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]). + In this case the id will be generated by using the content and the defined metadata. 
""" if content is None: @@ -93,8 +93,12 @@ def __init__( self.score = score self.meta = meta or {} self.id_hash_keys = id_hash_keys - self.id_hash_from = id_hash_from + allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] + if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): + raise ValueError(f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") + + if embedding is not None: embedding = np.asarray(embedding) self.embedding = embedding @@ -103,36 +107,29 @@ def __init__( if id: self.id: str = str(id) else: - self.id: str = self._get_id(id_hash_keys=id_hash_keys, id_hash_from=id_hash_from) + self.id: str = self._get_id(id_hash_keys=id_hash_keys) def _get_id(self, - id_hash_keys: Optional[List[str]] = None, - id_hash_from: Optional[List[Literal["content", "meta"]]] = None + id_hash_keys: Optional[List[str]] = None ): """ Generate the id of a document by creating the hash of strings. By default the content of a document is used to generate the hash. There are two ways of modifying the generated id of a document. Either static keys or a selection of the content. - :param id_hash_keys: Optional list of strings that are used to generate the hash. - :param id_hash_from: Optional list of fields that should be dynamically used to generate the hash. + :param id_hash_keys: Optional list of fields that should be dynamically used to generate the hash. 
""" - if id_hash_keys is None and id_hash_from is None: + if id_hash_keys is None: return '{:02x}'.format(mmh3.hash128(str(self.content), signed=False)) final_hash_key = "" - if id_hash_keys is not None: - final_hash_key += ":".join(id_hash_keys) - - if id_hash_from is not None: - if "content" in id_hash_from: - final_hash_key += ":"+ str(self.content) - if "meta" in id_hash_from: - final_hash_key += ":"+ str(self.meta) + for attr in id_hash_keys: + final_hash_key += ":" + str(getattr(self,attr)) if final_hash_key == "": - raise ValueError(f"Cant't create 'Document': 'id_hash_from' must contain at least one of ['content', 'meta']") + raise ValueError(f"Cant't create 'Document': 'id_hash_keys' must contain at least one of ['content', 'meta']") + return '{:02x}'.format(mmh3.hash128(final_hash_key, signed=False)) def to_dict(self, field_map={}) -> Dict: @@ -160,7 +157,7 @@ def to_dict(self, field_map={}) -> Dict: return _doc @classmethod - def from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None): + def from_dict(cls, dict, field_map={}, id_hash_keys=None): """ Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the input dict. 
This way you can work with standardized Document objects in Haystack, but adjust the format that @@ -192,7 +189,6 @@ def from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None): if _doc.get("id") is None: _new_doc["id_hash_keys"]=id_hash_keys - _new_doc["id_hash_from"]=id_hash_from # Convert list of rows to pd.DataFrame if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list): diff --git a/test/test_schema.py b/test/test_schema.py index 9faedbfbec..bbba3429c9 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -1,4 +1,5 @@ from haystack.schema import Document, Label, Answer, Span +import pytest import numpy as np LABELS = [ @@ -152,9 +153,18 @@ def test_generate_doc_id_using_custom_list(): text1 = "text1" text2 = "text2" - doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["key1", text1]) - doc2_text1 = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["key1", text1]) - doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["key1", text2]) + doc1_meta1_id_by_content = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"]) + doc1_meta2_id_by_content = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content"]) + assert doc1_meta1_id_by_content.id == doc1_meta2_id_by_content.id - assert doc1_text1.id == doc2_text1.id + doc1_meta1_id_by_content_and_meta = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content","meta"]) + doc1_meta2_id_by_content_and_meta = Document(content=text1, meta={"name": "doc2"}, id_hash_keys=["content", "meta"]) + assert doc1_meta1_id_by_content_and_meta.id != doc1_meta2_id_by_content_and_meta.id + + doc1_text1 = Document(content=text1, meta={"name": "doc1"}, id_hash_keys=["content"]) + doc3_text2 = Document(content=text2, meta={"name": "doc3"}, id_hash_keys=["content"]) assert doc1_text1.id != doc3_text2.id + + + with pytest.raises(ValueError): + _ = Document(content=text1, 
meta={"name": "doc1"}, id_hash_keys=["content","non_existing_field"]) \ No newline at end of file From a619fae923b941ae6e176979b9511fcc7be2de52 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 21 Dec 2021 11:29:13 +0000 Subject: [PATCH 04/15] Add latest docstring and tutorial changes --- docs/_src/api/api/document_store.md | 20 ++++++++++++++++++++ docs/_src/api/api/primitives.md | 9 +++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 0543fb65b8..2416b8103a 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -137,6 +137,26 @@ from disk and also indexed batchwise to the DocumentStore in order to prevent ou - `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts. + +#### run + +```python + | run(documents: List[dict], index: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) +``` + +Run requests of document stores + +Comment: We will gradually introduce the primitives. The document stores also accept dicts and parse them to documents. +In the future, however, only documents themselves will be accepted. Parsing the dictionaries in the run function +is therefore only an interim solution until the run function also accepts documents. + +**Arguments**: + +- `documents`: A list of dicts that are documents. +- `index`: Optional name of index where the documents shall be written to. + If None, the DocumentStore's default index (self.index) will be used. +- `id_hash_keys`: List of the fields that the hashes of the ids are generated from. 
+ #### get\_batches\_from\_generator diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md index 96bce5dc9e..9e2a3e3ed0 100644 --- a/docs/_src/api/api/primitives.md +++ b/docs/_src/api/api/primitives.md @@ -13,7 +13,7 @@ class Document() #### \_\_init\_\_ ```python - | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None, id_hash_from: Optional[List[Literal["content", "meta"]]] = None) + | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None) ``` One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. @@ -39,9 +39,10 @@ There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`. In the range of [0,1], where 1 means extremely relevant. - `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed). - `embedding`: Vector encoding of the text -- `id_hash_keys`: Generate the document id from a custom list of strings. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes. If you want ensure you don't have duplicate documents in your DocumentStore but texts are - not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]. + not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]). + In this case the id will be generated by using the content and the defined metadata. 
#### to\_dict @@ -71,7 +72,7 @@ dict with content of the Document ```python | @classmethod - | from_dict(cls, dict, field_map={}, id_hash_keys=None, id_hash_from=None) + | from_dict(cls, dict, field_map={}, id_hash_keys=None) ``` Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the From dafa6a33ea968dce0aaf246450eeacbd56b49467 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 15:55:26 +0100 Subject: [PATCH 05/15] fixed tests --- test/test_document_store.py | 51 ++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index 5aa5c6c34c..56939609de 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -37,56 +37,57 @@ def test_init_elastic_client(): def test_write_with_duplicate_doc_ids(document_store): - documents = [ + duplicate_documents = [ Document( content="Doc1", - id_hash_keys=["key1"] + id_hash_keys=["content"] ), Document( - content="Doc2", - id_hash_keys=["key1"] + content="Doc1", + id_hash_keys=["content"] ) ] - document_store.write_documents(documents, duplicate_documents="skip") + document_store.write_documents(duplicate_documents, duplicate_documents="skip") assert len(document_store.get_all_documents()) == 1 with pytest.raises(Exception): - document_store.write_documents(documents, duplicate_documents="fail") + document_store.write_documents(duplicate_documents, duplicate_documents="fail") @pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True) def test_write_with_duplicate_doc_ids_custom_index(document_store): - documents = [ + duplicate_documents = [ Document( content="Doc1", - id_hash_keys=["key1"] + id_hash_keys=["content"] ), Document( - content="Doc2", - id_hash_keys=["key1"] + content="Doc1", + id_hash_keys=["content"] ) ] document_store.delete_documents(index="haystack_custom_test") - 
document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="skip") + document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="skip") + assert len(document_store.get_all_documents()) == 1 with pytest.raises(DuplicateDocumentError): - document_store.write_documents(documents, index="haystack_custom_test", duplicate_documents="fail") + document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="fail") # Weaviate manipulates document objects in-place when writing them to an index. # It generates a uuid based on the provided id and the index name where the document is added to. # We need to get rid of these generated uuids for this test and therefore reset the document objects. # As a result, the documents will receive a fresh uuid based on their id_hash_keys and a different index name. if isinstance(document_store, WeaviateDocumentStore): - documents = [ + duplicate_documents = [ Document( content="Doc1", - id_hash_keys=["key1"] + id_hash_keys=["content"] ), Document( - content="Doc2", - id_hash_keys=["key1"] + content="Doc1", + id_hash_keys=["content"] ) ] # writing to the default, empty index should still work - document_store.write_documents(documents, duplicate_documents="fail") + document_store.write_documents(duplicate_documents, duplicate_documents="fail") def test_get_all_documents_without_filters(document_store_with_docs): @@ -102,17 +103,17 @@ def test_get_all_document_filter_duplicate_text_value(document_store): Document( content="Doc1", meta={"f1": "0"}, - id_hash_keys=["Doc1", "1"] + id_hash_keys=["meta"] ), Document( content="Doc1", meta={"f1": "1", "meta_id": "0"}, - id_hash_keys=["Doc1", "2"] + id_hash_keys=["meta"] ), Document( content="Doc2", meta={"f3": "0"}, - id_hash_keys=["Doc2", "3"] + id_hash_keys=["meta"] ) ] document_store.write_documents(documents) @@ -121,6 +122,16 @@ def 
test_get_all_document_filter_duplicate_text_value(document_store): assert len(documents) == 1 assert {d.meta["meta_id"] for d in documents} == {"0"} + documents = document_store.get_all_documents(filters={"f1": ["0"]}) + assert documents[0].content == "Doc1" + assert len(documents) == 1 + assert {d.meta["meta_id"] for d in documents} == {} + + documents = document_store.get_all_documents(filters={"f3": ["0"]}) + assert documents[0].content == "Doc2" + assert len(documents) == 1 + assert {d.meta["meta_id"] for d in documents} == {} + def test_get_all_documents_with_correct_filters(document_store_with_docs): documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]}) From 66d0950d78853b0656f517c294a44984a5147366 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 15:58:19 +0100 Subject: [PATCH 06/15] fix mypy error --- haystack/schema.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 1f79539863..65240d8c0c 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -92,11 +92,12 @@ def __init__( self.content_type = content_type self.score = score self.meta = meta or {} - self.id_hash_keys = id_hash_keys allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] - if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): - raise ValueError(f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") + self.id_hash_keys = id_hash_keys + if self.id_hash_keys != None: + if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): + raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. 
Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") if embedding is not None: From 11695258eddbb7690ac7954fe8d291ae6102194e Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 16:51:45 +0100 Subject: [PATCH 07/15] fix mypy issue --- haystack/schema.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 65240d8c0c..1b2d2b77b3 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -95,9 +95,8 @@ def __init__( allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] self.id_hash_keys = id_hash_keys - if self.id_hash_keys != None: - if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): - raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") + if not set(self.id_hash_keys or []) <= set(allowed_hash_key_attributes): + raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). 
See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") if embedding is not None: From 1c103ac0f1b1ef829f7877722c9243936e9853dc Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 16:54:38 +0100 Subject: [PATCH 08/15] ignore typing --- haystack/schema.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 1b2d2b77b3..7e80181fcf 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -95,8 +95,10 @@ def __init__( allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] self.id_hash_keys = id_hash_keys - if not set(self.id_hash_keys or []) <= set(allowed_hash_key_attributes): - raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") + + if self.id_hash_keys != None: + if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): #type: ignore + raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). 
See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") if embedding is not None: From 6e11cb704d6962cceb94c84642fc13fd1881dc65 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 16:55:52 +0100 Subject: [PATCH 09/15] fixed correct check --- haystack/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/schema.py b/haystack/schema.py index 7e80181fcf..3d6f0e9ed0 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -96,7 +96,7 @@ def __init__( allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] self.id_hash_keys = id_hash_keys - if self.id_hash_keys != None: + if self.id_hash_keys is not None: if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): #type: ignore raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). 
See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") From 7bed279430ebfa33c3320e6f2693288c09e1b797 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 18:02:33 +0100 Subject: [PATCH 10/15] fixed tests --- test/test_document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index 56939609de..0fd8791f11 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -67,7 +67,7 @@ def test_write_with_duplicate_doc_ids_custom_index(document_store): ] document_store.delete_documents(index="haystack_custom_test") document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="skip") - assert len(document_store.get_all_documents()) == 1 + assert len(document_store.get_all_documents(index="haystack_custom_test")) == 1 with pytest.raises(DuplicateDocumentError): document_store.write_documents(duplicate_documents, index="haystack_custom_test", duplicate_documents="fail") From 9665d779b0ab4623c2f8dd3e1a2ce6d8a44c604f Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Tue, 21 Dec 2021 18:34:39 +0100 Subject: [PATCH 11/15] try fixing the tests --- test/test_document_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index 0fd8791f11..d02ccf1fd2 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -125,12 +125,12 @@ def test_get_all_document_filter_duplicate_text_value(document_store): documents = document_store.get_all_documents(filters={"f1": ["0"]}) assert documents[0].content == "Doc1" assert len(documents) == 1 - assert {d.meta["meta_id"] for d in documents} == {} + assert len({d.meta["meta_id"] for d in documents}) == 0 documents = document_store.get_all_documents(filters={"f3": ["0"]}) assert documents[0].content == "Doc2" assert len(documents) == 1 - assert {d.meta["meta_id"] for d in documents} 
== {} + assert len({d.meta["meta_id"] for d in documents}) == 0 def test_get_all_documents_with_correct_filters(document_store_with_docs): From 4302dbaab4a53b124914e6bc9739048704552ce0 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Wed, 22 Dec 2021 10:35:28 +0100 Subject: [PATCH 12/15] set id hash keys only if not none --- haystack/schema.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 3d6f0e9ed0..82808e1c75 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -94,9 +94,10 @@ def __init__( self.meta = meta or {} allowed_hash_key_attributes = ["content", "content_type", "score", "meta", "embedding" ] - self.id_hash_keys = id_hash_keys + - if self.id_hash_keys is not None: + if id_hash_keys is not None: + self.id_hash_keys = id_hash_keys if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): #type: ignore raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") From 1f162ad80f149515df760333d52f8679e3dda12c Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Wed, 22 Dec 2021 11:49:07 +0100 Subject: [PATCH 13/15] dont store id_hash_keys --- haystack/schema.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/haystack/schema.py b/haystack/schema.py index 82808e1c75..1eed1b82ba 100644 --- a/haystack/schema.py +++ b/haystack/schema.py @@ -97,9 +97,8 @@ def __init__( if id_hash_keys is not None: - self.id_hash_keys = id_hash_keys - if not set(self.id_hash_keys) <= set(allowed_hash_key_attributes): #type: ignore - raise ValueError(f"You passed custom strings {self.id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. 
{allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") + if not set(id_hash_keys) <= set(allowed_hash_key_attributes): #type: ignore + raise ValueError(f"You passed custom strings {id_hash_keys} to id_hash_keys which is deprecated. Supply instead a list of Document's attribute names that the id should be based on (e.g. {allowed_hash_key_attributes}). See /~https://github.com/deepset-ai/haystack/pull/1910 for details)") if embedding is not None: From 99b4f5795458ad5a0a39a3c77767f0dcfd50cd52 Mon Sep 17 00:00:00 2001 From: ArzelaAscoIi Date: Wed, 22 Dec 2021 12:04:14 +0100 Subject: [PATCH 14/15] fix tests --- test/test_document_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index d02ccf1fd2..fbcfbf7d1b 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -125,12 +125,12 @@ def test_get_all_document_filter_duplicate_text_value(document_store): documents = document_store.get_all_documents(filters={"f1": ["0"]}) assert documents[0].content == "Doc1" assert len(documents) == 1 - assert len({d.meta["meta_id"] for d in documents}) == 0 + assert documents[0].meta.get("meta_id") is None documents = document_store.get_all_documents(filters={"f3": ["0"]}) assert documents[0].content == "Doc2" assert len(documents) == 1 - assert len({d.meta["meta_id"] for d in documents}) == 0 + assert documents[0].meta.get("meta_id") is None def test_get_all_documents_with_correct_filters(document_store_with_docs): From b61634740066c10b1df0104c109e3d243fb3f82d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 3 Jan 2022 12:59:39 +0000 Subject: [PATCH 15/15] Add latest docstring and tutorial changes --- docs/_src/api/api/document_store.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/_src/api/api/document_store.md 
b/docs/_src/api/api/document_store.md index f375623669..9d0009c0be 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -168,7 +168,7 @@ from disk and also indexed batchwise to the DocumentStore in order to prevent ou #### run ```python - | run(documents: List[dict], index: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) + | run(documents: List[dict], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None) ``` Run requests of document stores @@ -180,6 +180,7 @@ is therefore only an interim solution until the run function also accepts docume **Arguments**: - `documents`: A list of dicts that are documents. +- `headers`: Optional dictionary of headers, mapping string keys to string values. - `index`: Optional name of index where the documents shall be written to. If None, the DocumentStore's default index (self.index) will be used. - `id_hash_keys`: List of the fields that the hashes of the ids are generated from.