Skip to content

Commit

Permalink
[Weaviate] Exit the while loop when we query less documents than avai…
Browse files Browse the repository at this point in the history
…lable (#2537)

* exit the while loop when we query fewer documents than are available in Weaviate

* use monkeypatch fixture, remove unused markers

* we know key is there, use brackets to get the value

* use custom exception

* add warning message when we hit the QUERY_MAXIMUM_RESULTS problem

* restore pytest marker

* removed unused import

* make the warning message more clear
  • Loading branch information
masci authored May 20, 2022
1 parent fd2ca35 commit a9a4156
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 10 deletions.
43 changes: 34 additions & 9 deletions haystack/document_stores/weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,17 @@
from haystack.document_stores.base import get_batches_from_generator
from haystack.document_stores.filter_utils import LogicalFilterClause
from haystack.document_stores.utils import convert_date_to_rfc3339
from haystack.errors import DocumentStoreError


logger = logging.getLogger(__name__)
UUID_PATTERN = re.compile(r"^[\da-f]{8}-([\da-f]{4}-){3}[\da-f]{12}$", re.IGNORECASE)


class WeaviateDocumentStoreError(DocumentStoreError):
    """
    Exception type specific to the Weaviate document store.

    Raised, for example, when a Weaviate query itself throws or when the
    query response contains an "errors" entry.
    """


class WeaviateDocumentStore(BaseDocumentStore):
"""
Expand Down Expand Up @@ -697,17 +702,37 @@ def _get_all_documents_in_index(
query = query.with_where(filter_dict)

if all_docs:
# .with_limit() must be used with .with_offset, of the latter won't work properly
# https://weaviate-python-client.readthedocs.io/en/latest/weaviate.gql.html?highlight=offset#weaviate.gql.get.GetBuilder.with_offset
# Passing offset:0 raises an error, so we pass it only after the first round
# `.with_limit()` must be used with `.with_offset`, or the latter won't work properly
# https://weaviate-python-client.readthedocs.io/en/latest/weaviate.gql.html?highlight=offset#weaviate.gql.get.GetBuilder.with_offset
query = query.with_limit(100).with_offset(offset=len(all_docs))

result = query.do()
try:
result = query.do()
except Exception as e:
raise WeaviateDocumentStoreError(f"Weaviate raised an exception: {e}")

if "errors" in result:
raise WeaviateDocumentStoreError(f"Query results contain errors: {result['errors']}")

# If `query.do` didn't raise and `result` doesn't contain errors,
# we are good accessing data
docs = result.get("data").get("Get").get(index)

# `docs` can be empty if the query returned fewer documents than the actual
# number. This can happen when the number of documents stored is greater
# than QUERY_MAXIMUM_RESULTS.
# See: https://weaviate.io/developers/weaviate/current/graphql-references/filters.html#offset-argument-pagination
if not docs:
logger.warning(
"The query returned less documents than expected: this can happen when "
"the value of the QUERY_MAXIMUM_RESULTS environment variable is lower than "
"the total number of documents stored. See Weaviate documentation for "
"more details."
)
break

if result and "data" in result and "Get" in result.get("data"):
if result.get("data").get("Get").get(index):
all_docs += result.get("data").get("Get").get(index)
else:
raise ValueError(f"Weaviate returned ad exception: {result}")
all_docs += docs

yield from all_docs

Expand Down Expand Up @@ -1156,7 +1181,7 @@ def delete_all_documents(
raise NotImplementedError("WeaviateDocumentStore does not support headers.")

logger.warning(
"""DEPRECATION WARNINGS:
"""DEPRECATION WARNINGS:
1. delete_all_documents() method is deprecated, please use delete_documents method
For more details, please refer to the issue: /~https://github.com/deepset-ai/haystack/issues/1045
"""
Expand Down
16 changes: 15 additions & 1 deletion test/document_stores/test_weaviate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import uuid

import numpy as np
import pytest

from haystack.schema import Document
from ..conftest import get_document_store
import uuid


embedding_dim = 768

Expand Down Expand Up @@ -105,3 +108,14 @@ def test_query(document_store_with_docs):

docs = document_store_with_docs.query(filters={"content": ["live"]})
assert len(docs) == 3


@pytest.mark.weaviate
def test_get_all_documents_unaffected_by_QUERY_MAXIMUM_RESULTS(document_store_with_docs, monkeypatch):
    """
    Verify that `get_all_documents` still returns the stored documents when the
    reported document count is far larger than what the queries actually yield,
    which is the situation a low QUERY_MAXIMUM_RESULTS setting produces.

    See https://github.com/deepset-ai/haystack/issues/2517
    """

    def inflated_document_count(**kwargs):
        # Pretend the store holds many more documents than the fixture provides.
        return 13_000

    monkeypatch.setattr(document_store_with_docs, "get_document_count", inflated_document_count)

    retrieved = document_store_with_docs.get_all_documents()

    # The fixture is expected to hold exactly three documents.
    assert len(retrieved) == 3

0 comments on commit a9a4156

Please sign in to comment.