Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filters for delete_all_documents() #591

Merged
merged 5 commits into from
Nov 16, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion haystack/document_store/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def get_label_count(self, index: Optional[str] = None) -> int:
def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
pass

def delete_all_documents(self, index: str):
@abstractmethod
def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
pass

18 changes: 15 additions & 3 deletions haystack/document_store/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,14 +614,26 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)

def delete_all_documents(self, index: str):
def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents in an index.
Delete documents in an index. All documents are deleted if no filters are passed.

:param index: index name
:return: None
"""
self.client.delete_by_query(index=index, body={"query": {"match_all": {}}}, ignore=[404])
query: Dict[str, Any] = {"query": {}}
if filters:
filter_clause = []
for key, values in filters.items():
filter_clause.append(
{
"terms": {key: values}
}
)
query["query"]["bool"] = {"filter": filter_clause}
else:
query["query"] = {"match_all": {}}
self.client.delete_by_query(index=index, body=query, ignore=[404])
# We want to be sure that all docs are deleted before continuing (delete_by_query doesn't support wait_for)
time.sleep(1)

Expand Down
4 changes: 3 additions & 1 deletion haystack/document_store/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,13 +201,15 @@ def add_eval_data(self, filename: str, doc_index: Optional[str] = None, label_in
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)

def delete_all_documents(self, index: Optional[str] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents in an index.

:param index: index name
:return: None
"""

if filters:
raise NotImplementedError("Delete by filters is not implemented for InMemoryDocumentStore.")
index = index or self.index
self.indexes[index] = {}
4 changes: 3 additions & 1 deletion haystack/document_store/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,16 @@ def query_by_embedding(self,
"Change the query type (e.g. by choosing a different retriever) "
"or change the DocumentStore (e.g. to ElasticsearchDocumentStore)")

def delete_all_documents(self, index=None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents in an index.

:param index: index name
:return: None
"""

if filters:
raise NotImplementedError("Delete by filters is not implemented for SQLDocumentStore.")
index = index or self.index
documents = self.session.query(DocumentORM).filter_by(index=index)
documents.delete(synchronize_session=False)
Expand Down
15 changes: 15 additions & 0 deletions test/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,21 @@ def test_write_document_with_embeddings(document_store):
assert len(document_store.get_all_documents(index="haystack_test_1")) == 4


@pytest.mark.elasticsearch
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
def test_delete_documents(document_store_with_docs):
assert len(document_store_with_docs.get_all_documents()) == 3

document_store_with_docs.delete_all_documents(index="haystack_test", filters={"meta_field": ["test1", "test2"]})
documents = document_store_with_docs.get_all_documents()
assert len(documents) == 1
assert documents[0].meta["meta_field"] == "test3"

document_store_with_docs.delete_all_documents(index="haystack_test")
documents = document_store_with_docs.get_all_documents()
assert len(documents) == 0


@pytest.mark.elasticsearch
def test_labels(document_store):
label = Label(
Expand Down