From fbbca6e3dc348b201ee82ab57de05be0c44af9bb Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 10 Jan 2022 18:28:49 +0100 Subject: [PATCH 01/21] Add unique contraint to avoid PSQL crash and debug hanging tests --- haystack/document_stores/sql.py | 2 +- test/test_faiss_on_postgres.py | 163 ++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 test/test_faiss_on_postgres.py diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index 666870238a..a175a07a14 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -20,7 +20,7 @@ class ORMBase(Base): __abstract__ = True - id = Column(String(100), default=lambda: str(uuid4()), primary_key=True) + id = Column(String(100), default=lambda: str(uuid4()), unique=True, primary_key=True) created_at = Column(DateTime, server_default=func.now()) updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now()) diff --git a/test/test_faiss_on_postgres.py b/test/test_faiss_on_postgres.py new file mode 100644 index 0000000000..301c53ffe1 --- /dev/null +++ b/test/test_faiss_on_postgres.py @@ -0,0 +1,163 @@ +import uuid +import faiss +import math +import numpy as np +import pytest +import sys +import subprocess +import logging +from time import sleep +from sqlalchemy import create_engine, text +import psycopg + +from haystack.schema import Document +from haystack.pipelines import DocumentSearchPipeline +from haystack.document_stores.faiss import FAISSDocumentStore +from haystack.document_stores.weaviate import WeaviateDocumentStore + +from haystack.pipelines import Pipeline +from haystack.nodes.retriever.dense import EmbeddingRetriever + + +DOCUMENTS = [ + {"meta": {"name": "name_1", "year": "2020", "month": "01"}, "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_2", "year": "2020", "month": "02"}, "content": "text_2", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_3", "year": "2020", "month": "03"}, "content": "text_3", "embedding": np.random.rand(768).astype(np.float64)}, + {"meta": {"name": "name_4", "year": "2021", "month": "01"}, "content": "text_4", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_5", "year": "2021", "month": "02"}, "content": "text_5", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_6", "year": "2021", "month": "03"}, "content": "text_6", "embedding": np.random.rand(768).astype(np.float64)}, +] + + +# @pytest.fixture +# def sql_url(): + +# # status = subprocess.run(["docker run --name postgres_test -d -e POSTGRES_HOST_AUTH_METHOD=trust -p 5432:5432 postgres"], shell=True) +# # if status.returncode: +# # logging.warning("Tried to start PostgreSQL through Docker but this failed. 
It is likely that there is already an existing instance running.") +# # else: +# # sleep(5) + +# engine = create_engine( +# 'postgresql://postgres:postgres@127.0.0.1/postgres', +# isolation_level='AUTOCOMMIT') + +# # with engine.connect() as connection: +# connection = engine.connect() + +# try: +# connection.execute('DROP SCHEMA public CASCADE;') +# except Exception as e: +# logging.error(e) +# try: +# connection.execute('CREATE SCHEMA public;') +# connection.execute('SET SESSION idle_in_transaction_session_timeout = "1s";') + +# yield "postgresql://postgres:postgres@127.0.0.1/postgres" + +# finally: +# connection.execute('DROP SCHEMA public CASCADE;') +# connection.close() + +# logging.error(" -----------------------> Done") + + # sleep(1) + + # status = subprocess.run(["docker stop postgres_test"], shell=True) + # if status.returncode: + # logging.warning("Tried to start PostgreSQL through Docker but this failed. It is likely that there is already an existing instance running.") + + # status = subprocess.run(["docker rm postgres_test"], shell=True) + # if status.returncode: + # logging.warning("Tried to start PostgreSQL through Docker but this failed. It is likely that there is already an existing instance running.") + + + +# @pytest.fixture +# def sql_url(tmp_path): +# return f"sqlite:////{tmp_path/'haystack_test.db'}" + + + +@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner") +def test_faiss_index_save_and_load(tmp_path): #, sql_url): + + engine = create_engine( + 'postgresql://postgres:postgres@127.0.0.1/postgres', + isolation_level='AUTOCOMMIT', + future=True) + + with engine.connect() as connection: + #connection = engine.connect() + + try: + connection.execute(text('DROP SCHEMA public CASCADE')) + connection.commit() + except psycopg.errors.ProgrammingError as pe: + logging.error(pe) + except Exception as e: + logging.error(e) + + connection.execute(text('CREATE SCHEMA public')) + connection.commit() + #connection.execute('SET SESSION idle_in_transaction_session_timeout = "1s";') + + sql_url = "postgresql://postgres:postgres@127.0.0.1/postgres" + + + + + document_store = FAISSDocumentStore( + sql_url=sql_url, + index="haystack_test", + progress_bar=False # Just to check if the init parameters are kept + ) + document_store.write_documents(DOCUMENTS) + + # test saving the index + document_store.save(tmp_path / "haystack_test_faiss") + + # clear existing faiss_index + document_store.faiss_indexes[document_store.index].reset() + + # test faiss index is cleared + assert document_store.faiss_indexes[document_store.index].ntotal == 0 + + # test loading the index + new_document_store = FAISSDocumentStore.load(tmp_path / "haystack_test_faiss") + + # check faiss index is restored + assert new_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS) + # check if documents are restored + assert len(new_document_store.get_all_documents()) == len(DOCUMENTS) + # Check if the init parameters are kept + assert not new_document_store.progress_bar + + # test saving and loading the loaded faiss index + new_document_store.save(tmp_path / "haystack_test_faiss") + reloaded_document_store = FAISSDocumentStore.load(tmp_path / "haystack_test_faiss") + + # check faiss index is restored + assert reloaded_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS) + # check if documents are restored + assert len(reloaded_document_store.get_all_documents()) == len(DOCUMENTS) + # Check if the init parameters are 
kept + assert not reloaded_document_store.progress_bar + + # test loading the index via init + new_document_store = FAISSDocumentStore(faiss_index_path=tmp_path / "haystack_test_faiss") + + # check faiss index is restored + assert new_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS) + # check if documents are restored + assert len(new_document_store.get_all_documents()) == len(DOCUMENTS) + # Check if the init parameters are kept + assert not new_document_store.progress_bar + + + + connection.execute(text('DROP SCHEMA public CASCADE')) + connection.commit() + # connection.close() + + logging.error(" -----------------------> Done") From 7b7a27135f9879f433b8f5a95f477328fe0433dc Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 11 Jan 2022 15:04:55 +0100 Subject: [PATCH 02/21] Tests seems to be runnable on PG now, still to verify --- haystack/document_stores/faiss.py | 4 +- haystack/document_stores/milvus.py | 4 +- haystack/document_stores/milvus2x.py | 2 + haystack/document_stores/sql.py | 11 +- test/conftest.py | 115 ++++++++++++++----- test/test_faiss_and_milvus.py | 31 +++-- test/test_faiss_on_postgres.py | 163 --------------------------- 7 files changed, 125 insertions(+), 205 deletions(-) delete mode 100644 test/test_faiss_on_postgres.py diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index e1fb2ca8a0..663a06a5f1 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -48,6 +48,7 @@ def __init__( duplicate_documents: str = 'overwrite', faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, + isolation_level: str = None, **kwargs, ): """ @@ -145,7 +146,8 @@ def __init__( super().__init__( url=sql_url, index=index, - duplicate_documents=duplicate_documents + duplicate_documents=duplicate_documents, + isolation_level=isolation_level ) self._validate_index_sync() diff --git a/haystack/document_stores/milvus.py b/haystack/document_stores/milvus.py index 80c19c333f..5ab0fb9a4e 100644 --- a/haystack/document_stores/milvus.py +++ b/haystack/document_stores/milvus.py @@ -51,6 +51,7 @@ def __init__( embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = 'overwrite', + isolation_level: str = None, **kwargs, ): """ @@ -129,7 +130,8 @@ def __init__( super().__init__( url=sql_url, index=index, - duplicate_documents=duplicate_documents + duplicate_documents=duplicate_documents, + isolation_level=isolation_level, ) def __del__(self): diff --git a/haystack/document_stores/milvus2x.py b/haystack/document_stores/milvus2x.py index dc3eecfffe..85405138fa 100644 --- a/haystack/document_stores/milvus2x.py +++ b/haystack/document_stores/milvus2x.py @@ -71,6 +71,7 @@ def __init__( custom_fields: Optional[List[Any]] = None, progress_bar: bool = True, duplicate_documents: str = 'overwrite', + isolation_level: str = None, ): """ :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. 
For large scale @@ -165,6 +166,7 @@ def __init__( url=sql_url, index=index, duplicate_documents=duplicate_documents + isolation_level=isolation_level, ) def _create_collection_and_index_if_not_exist( diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index a175a07a14..4f44f526a0 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -96,7 +96,8 @@ def __init__( index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", - check_same_thread: bool = False + check_same_thread: bool = False, + isolation_level: str = None ): """ An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL backends. @@ -112,6 +113,7 @@ def __init__( fail: an error is raised if the document ID of the document being added already exists. :param check_same_thread: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior) + :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` """ # save init parameters to enable export of component config as YAML @@ -119,10 +121,13 @@ def __init__( url=url, index=index, label_index=label_index, duplicate_documents=duplicate_documents, check_same_thread=check_same_thread ) + create_engine_params = {} + if isolation_level: + create_engine_params["isolation_level"] = isolation_level if "sqlite" in url: - engine = create_engine(url, connect_args={'check_same_thread': check_same_thread}) + engine = create_engine(url, connect_args={'check_same_thread': check_same_thread}, **create_engine_params) else: - engine = create_engine(url) + engine = create_engine(url, **create_engine_params) ORMBase.metadata.create_all(engine) Session = sessionmaker(bind=engine) self.session = Session() diff --git a/test/conftest.py b/test/conftest.py index eccce83b7f..31bbbb2cb3 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,6 +3,8 @@ from subprocess import run from sys import platform import gc +import logging +from sqlalchemy import create_engine, text import numpy as np import psutil @@ -478,41 +480,94 @@ def get_retriever(retriever_type, document_store): @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"]) -def document_store_with_docs(request, test_docs_xs): - document_store = get_document_store(request.param) - document_store.write_documents(test_docs_xs) - yield document_store - document_store.delete_documents() - +def document_store_with_docs(request, test_docs_xs, tmp_path): + for document_store in get_document_store(request.param, tmp_path=tmp_path): + document_store.write_documents(test_docs_xs) + yield document_store + document_store.delete_documents() @pytest.fixture -def document_store(request, test_docs_xs): +def document_store(request, tmp_path): vector_dim = request.node.get_closest_marker("vector_dim", pytest.mark.vector_dim(768)) - document_store = get_document_store(request.param, vector_dim.args[0]) - yield document_store - document_store.delete_documents() + for document_store in get_document_store(request.param, embedding_dim=vector_dim.args[0], tmp_path=tmp_path): + yield document_store + document_store.delete_documents() @pytest.fixture(params=["faiss", "milvus", "weaviate"]) -def document_store_cosine(request, test_docs_xs): +def document_store_cosine(request, tmp_path): vector_dim = request.node.get_closest_marker("vector_dim", pytest.mark.vector_dim(768)) - document_store 
= get_document_store(request.param, vector_dim.args[0], similarity="cosine") - yield document_store - document_store.delete_documents() + for document_store in get_document_store(request.param, embedding_dim=vector_dim.args[0], similarity="cosine", tmp_path=tmp_path): + yield document_store + document_store.delete_documents() @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"]) -def document_store_cosine_small(request, test_docs_xs): +def document_store_cosine_small(request, tmp_path): vector_dim = request.node.get_closest_marker("vector_dim", pytest.mark.vector_dim(3)) - document_store = get_document_store(request.param, vector_dim.args[0], similarity="cosine") - yield document_store - document_store.delete_documents() + for document_store in get_document_store(request.param, embedding_dim=vector_dim.args[0], similarity="cosine", tmp_path=tmp_path): + yield document_store + document_store.delete_documents() + + + +SQL_TYPE = "sqlite" +# SQL_TYPE = "postgres" + + +@pytest.fixture +def sql_url(tmp_path): + if SQL_TYPE == "postgres": + try: + setup_postgres() + yield get_sql_url(tmp_path) + finally: + teardown_postgres() + else: + yield get_sql_url(tmp_path) + -def get_document_store(document_store_type, embedding_dim=768, embedding_field="embedding", index="haystack_test", similarity:str="dot_product"): +def get_sql_url(tmp_path): + if SQL_TYPE == "postgres": + return "postgresql://postgres:postgres@127.0.0.1/postgres" + else: + return f"sqlite:///{tmp_path}/haystack_test.db" + + +def setup_postgres(): + # status = subprocess.run(["docker run --name postgres_test -d -e POSTGRES_HOST_AUTH_METHOD=trust -p 5432:5432 postgres"], shell=True) + # if status.returncode: + # logging.warning("Tried to start PostgreSQL through Docker but this failed. 
It is likely that there is already an existing instance running.") + # else: + # sleep(5) + engine = create_engine('postgresql://postgres:postgres@127.0.0.1/postgres', isolation_level='AUTOCOMMIT') + + with engine.connect() as connection: + try: + connection.execute(text('DROP SCHEMA public CASCADE')) + except Exception as e: + logging.error(e) + connection.execute(text('CREATE SCHEMA public;')) + connection.execute(text('SET SESSION idle_in_transaction_session_timeout = "1s";')) + + +def teardown_postgres(): + engine = create_engine('postgresql://postgres:postgres@127.0.0.1/postgres', isolation_level='AUTOCOMMIT') + with engine.connect() as connection: + connection.execute(text('DROP SCHEMA public CASCADE')) + connection.close() + + +def get_document_store(document_store_type, tmp_path, embedding_dim=768, embedding_field="embedding", index="haystack_test", similarity:str="dot_product"): + + if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]: + setup_postgres() + if document_store_type == "sql": document_store = SQLDocumentStore(url="sqlite://", index=index) + elif document_store_type == "memory": document_store = InMemoryDocumentStore( - return_embedding=True, embedding_dim=embedding_dim, embedding_field=embedding_field, index=index, similarity=similarity - ) + return_embedding=True, embedding_dim=embedding_dim, embedding_field=embedding_field, index=index, similarity=similarity) + elif document_store_type == "elasticsearch": # make sure we start from a fresh index client = Elasticsearch() @@ -520,28 +575,33 @@ def get_document_store(document_store_type, embedding_dim=768, embedding_field=" document_store = ElasticsearchDocumentStore( index=index, return_embedding=True, embedding_dim=embedding_dim, embedding_field=embedding_field, similarity=similarity ) + elif document_store_type == "faiss": document_store = FAISSDocumentStore( vector_dim=embedding_dim, - sql_url="sqlite://", + sql_url=get_sql_url(tmp_path), return_embedding=True, embedding_field=embedding_field, index=index, - similarity=similarity + similarity=similarity, + isolation_level="AUTOCOMMIT" if SQL_TYPE == "postgres" else None ) + elif document_store_type == "milvus": document_store = MilvusDocumentStore( vector_dim=embedding_dim, - sql_url="sqlite://", + sql_url=get_sql_url(tmp_path), return_embedding=True, embedding_field=embedding_field, index=index, - similarity=similarity + similarity=similarity, + isolation_level="AUTOCOMMIT" if SQL_TYPE == "postgres" else None ) _, collections = document_store.milvus_server.list_collections() for collection in collections: if collection.startswith(index): document_store.milvus_server.drop_collection(collection) + elif document_store_type == "weaviate": document_store = WeaviateDocumentStore( weaviate_url="http://localhost:8080", @@ -554,7 +614,10 @@ def get_document_store(document_store_type, embedding_dim=768, embedding_field=" else: raise Exception(f"No document store fixture for '{document_store_type}'") - return document_store + yield document_store + + if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]: + teardown_postgres() @pytest.fixture(scope="function") diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py index b2ca1c7c2c..8e7ec6023e 100644 --- a/test/test_faiss_and_milvus.py +++ b/test/test_faiss_and_milvus.py @@ -13,6 +13,7 @@ from haystack.pipelines import Pipeline from haystack.nodes.retriever.dense import EmbeddingRetriever + DOCUMENTS = [ {"meta": {"name": "name_1", "year": "2020", "month": "01"}, 
"content": "text_1", "embedding": np.random.rand(768).astype(np.float32)}, {"meta": {"name": "name_2", "year": "2020", "month": "02"}, "content": "text_2", "embedding": np.random.rand(768).astype(np.float32)}, @@ -23,12 +24,14 @@ ] + @pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner") -def test_faiss_index_save_and_load(tmp_path): +def test_faiss_index_save_and_load(tmp_path, sql_url): document_store = FAISSDocumentStore( - sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}", + sql_url=sql_url, index="haystack_test", - progress_bar=False # Just to check if the init parameters are kept + progress_bar=False, # Just to check if the init parameters are kept + isolation_level="AUTOCOMMIT" ) document_store.write_documents(DOCUMENTS) @@ -74,11 +77,12 @@ def test_faiss_index_save_and_load(tmp_path): @pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner") -def test_faiss_index_save_and_load_custom_path(tmp_path): +def test_faiss_index_save_and_load_custom_path(tmp_path, sql_url): document_store = FAISSDocumentStore( - sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}", + sql_url=sql_url, index="haystack_test", - progress_bar=False # Just to check if the init parameters are kept + progress_bar=False, # Just to check if the init parameters are kept + isolation_level="AUTOCOMMIT" ) document_store.write_documents(DOCUMENTS) @@ -128,13 +132,15 @@ def test_faiss_index_mutual_exclusive_args(tmp_path): with pytest.raises(ValueError): FAISSDocumentStore( sql_url=f"sqlite:////{tmp_path/'haystack_test.db'}", - faiss_index_path=f"{tmp_path/'haystack_test'}" + faiss_index_path=f"{tmp_path/'haystack_test'}", + isolation_level="AUTOCOMMIT" ) with pytest.raises(ValueError): FAISSDocumentStore( f"sqlite:////{tmp_path/'haystack_test.db'}", - faiss_index_path=f"{tmp_path/'haystack_test'}" + faiss_index_path=f"{tmp_path/'haystack_test'}", + isolation_level="AUTOCOMMIT" ) @@ -225,7 +231,9 @@ def test_update_with_empty_store(document_store, retriever): @pytest.mark.parametrize("index_factory", ["Flat", "HNSW", "IVF1,Flat"]) def test_faiss_retrieving(index_factory, tmp_path): document_store = FAISSDocumentStore( - sql_url=f"sqlite:////{tmp_path/'test_faiss_retrieving.db'}", faiss_index_factory_str=index_factory + sql_url=f"sqlite:////{tmp_path/'test_faiss_retrieving.db'}", + faiss_index_factory_str=index_factory, + isolation_level="AUTOCOMMIT" ) document_store.delete_all_documents(index="document") @@ -394,7 +402,6 @@ def test_get_docs_with_many_filters(document_store, retriever): assert "2020" == documents[0].meta["year"] - @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) @pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True) def test_pipeline(document_store, retriever): @@ -421,7 +428,9 @@ def test_faiss_passing_index_from_outside(tmp_path): faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable) faiss_index.nprobe = 2 document_store = FAISSDocumentStore( - sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", faiss_index=faiss_index, index=index + sql_url=f"sqlite:////{tmp_path/'haystack_test_faiss.db'}", + faiss_index=faiss_index, index=index, + isolation_level="AUTOCOMMIT" ) document_store.delete_documents() diff --git a/test/test_faiss_on_postgres.py b/test/test_faiss_on_postgres.py deleted file mode 100644 index 301c53ffe1..0000000000 --- a/test/test_faiss_on_postgres.py +++ /dev/null @@ -1,163 +0,0 @@ -import uuid -import faiss -import math 
-import numpy as np -import pytest -import sys -import subprocess -import logging -from time import sleep -from sqlalchemy import create_engine, text -import psycopg - -from haystack.schema import Document -from haystack.pipelines import DocumentSearchPipeline -from haystack.document_stores.faiss import FAISSDocumentStore -from haystack.document_stores.weaviate import WeaviateDocumentStore - -from haystack.pipelines import Pipeline -from haystack.nodes.retriever.dense import EmbeddingRetriever - - -DOCUMENTS = [ - {"meta": {"name": "name_1", "year": "2020", "month": "01"}, "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)}, - {"meta": {"name": "name_2", "year": "2020", "month": "02"}, "content": "text_2", "embedding": np.random.rand(768).astype(np.float32)}, - {"meta": {"name": "name_3", "year": "2020", "month": "03"}, "content": "text_3", "embedding": np.random.rand(768).astype(np.float64)}, - {"meta": {"name": "name_4", "year": "2021", "month": "01"}, "content": "text_4", "embedding": np.random.rand(768).astype(np.float32)}, - {"meta": {"name": "name_5", "year": "2021", "month": "02"}, "content": "text_5", "embedding": np.random.rand(768).astype(np.float32)}, - {"meta": {"name": "name_6", "year": "2021", "month": "03"}, "content": "text_6", "embedding": np.random.rand(768).astype(np.float64)}, -] - - -# @pytest.fixture -# def sql_url(): - -# # status = subprocess.run(["docker run --name postgres_test -d -e POSTGRES_HOST_AUTH_METHOD=trust -p 5432:5432 postgres"], shell=True) -# # if status.returncode: -# # logging.warning("Tried to start PostgreSQL through Docker but this failed. It is likely that there is already an existing instance running.") -# # else: -# # sleep(5) - -# engine = create_engine( -# 'postgresql://postgres:postgres@127.0.0.1/postgres', -# isolation_level='AUTOCOMMIT') - -# # with engine.connect() as connection: -# connection = engine.connect() - -# try: -# connection.execute('DROP SCHEMA public CASCADE;') -# except Exception as e: -# logging.error(e) -# try: -# connection.execute('CREATE SCHEMA public;') -# connection.execute('SET SESSION idle_in_transaction_session_timeout = "1s";') - -# yield "postgresql://postgres:postgres@127.0.0.1/postgres" - -# finally: -# connection.execute('DROP SCHEMA public CASCADE;') -# connection.close() - -# logging.error(" -----------------------> Done") - - # sleep(1) - - # status = subprocess.run(["docker stop postgres_test"], shell=True) - # if status.returncode: - # logging.warning("Tried to start PostgreSQL through Docker but this failed. It is likely that there is already an existing instance running.") - - # status = subprocess.run(["docker rm postgres_test"], shell=True) - # if status.returncode: - # logging.warning("Tried to start PostgreSQL through Docker but this failed. 
It is likely that there is already an existing instance running.") - - - -# @pytest.fixture -# def sql_url(tmp_path): -# return f"sqlite:////{tmp_path/'haystack_test.db'}" - - - -@pytest.mark.skipif(sys.platform in ['win32', 'cygwin'], reason="Test with tmp_path not working on windows runner") -def test_faiss_index_save_and_load(tmp_path): #, sql_url): - - engine = create_engine( - 'postgresql://postgres:postgres@127.0.0.1/postgres', - isolation_level='AUTOCOMMIT', - future=True) - - with engine.connect() as connection: - #connection = engine.connect() - - try: - connection.execute(text('DROP SCHEMA public CASCADE')) - connection.commit() - except psycopg.errors.ProgrammingError as pe: - logging.error(pe) - except Exception as e: - logging.error(e) - - connection.execute(text('CREATE SCHEMA public')) - connection.commit() - #connection.execute('SET SESSION idle_in_transaction_session_timeout = "1s";') - - sql_url = "postgresql://postgres:postgres@127.0.0.1/postgres" - - - - - document_store = FAISSDocumentStore( - sql_url=sql_url, - index="haystack_test", - progress_bar=False # Just to check if the init parameters are kept - ) - document_store.write_documents(DOCUMENTS) - - # test saving the index - document_store.save(tmp_path / "haystack_test_faiss") - - # clear existing faiss_index - document_store.faiss_indexes[document_store.index].reset() - - # test faiss index is cleared - assert document_store.faiss_indexes[document_store.index].ntotal == 0 - - # test loading the index - new_document_store = FAISSDocumentStore.load(tmp_path / "haystack_test_faiss") - - # check faiss index is restored - assert new_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS) - # check if documents are restored - assert len(new_document_store.get_all_documents()) == len(DOCUMENTS) - # Check if the init parameters are kept - assert not new_document_store.progress_bar - - # test saving and loading the loaded faiss index - new_document_store.save(tmp_path / "haystack_test_faiss") - reloaded_document_store = FAISSDocumentStore.load(tmp_path / "haystack_test_faiss") - - # check faiss index is restored - assert reloaded_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS) - # check if documents are restored - assert len(reloaded_document_store.get_all_documents()) == len(DOCUMENTS) - # Check if the init parameters are kept - assert not reloaded_document_store.progress_bar - - # test loading the index via init - new_document_store = FAISSDocumentStore(faiss_index_path=tmp_path / "haystack_test_faiss") - - # check faiss index is restored - assert new_document_store.faiss_indexes[document_store.index].ntotal == len(DOCUMENTS) - # check if documents are restored - assert len(new_document_store.get_all_documents()) == len(DOCUMENTS) - # Check if the init parameters are kept - assert not new_document_store.progress_bar - - - - connection.execute(text('DROP SCHEMA public CASCADE')) - connection.commit() - # connection.close() - - logging.error(" -----------------------> Done") From caadc971d65508260ea133ed7b060402ff5470a7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 11 Jan 2022 14:06:01 +0000 Subject: [PATCH 03/21] Add latest docstring and tutorial changes --- docs/_src/api/api/document_store.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 8464bc47b9..b4705ddf5a 100644 --- 
a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -952,7 +952,7 @@ class SQLDocumentStore(BaseDocumentStore) #### \_\_init\_\_ ```python - | __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", check_same_thread: bool = False) + | __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", check_same_thread: bool = False, isolation_level: str = None) ``` An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL backends. @@ -970,6 +970,7 @@ An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL bac fail: an error is raised if the document ID of the document being added already exists. - `check_same_thread`: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior) +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` #### get\_document\_by\_id @@ -1202,7 +1203,7 @@ the vector embeddings are indexed in a FAISS Index. #### \_\_init\_\_ ```python - | __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional["faiss.swigfaiss.Index"] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = 'overwrite', faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, **kwargs, ,) + | __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional["faiss.swigfaiss.Index"] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = 'overwrite', faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, **kwargs, ,) ``` **Arguments**: @@ -1478,7 +1479,7 @@ Usage: #### \_\_init\_\_ ```python - | __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = 'overwrite', **kwargs, ,) + | __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = 'overwrite', isolation_level: str = None, **kwargs, ,) ``` **Arguments**: From 250d44daad76a66e0354f2fd912eb7f81715069d Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 11 Jan 2022 15:35:18 +0100 Subject: [PATCH 04/21] Forgot comma --- 
haystack/document_stores/milvus2x.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/document_stores/milvus2x.py b/haystack/document_stores/milvus2x.py index 85405138fa..5feb5f842d 100644 --- a/haystack/document_stores/milvus2x.py +++ b/haystack/document_stores/milvus2x.py @@ -165,7 +165,7 @@ def __init__( super().__init__( url=sql_url, index=index, - duplicate_documents=duplicate_documents + duplicate_documents=duplicate_documents, isolation_level=isolation_level, ) From 1addbd7ccf4d06731c3a37d27e533b7bed283e84 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 11 Jan 2022 16:35:26 +0100 Subject: [PATCH 05/21] Fix a few more issues left and add isolation_level to set.config() --- haystack/document_stores/faiss.py | 3 +- haystack/document_stores/milvus.py | 1 + haystack/document_stores/milvus2x.py | 1 + haystack/document_stores/sql.py | 2 + test/conftest.py | 18 ++--- test/test_document_store.py | 101 +++++++++++++++++++++++++-- test/test_faiss_and_milvus.py | 77 -------------------- 7 files changed, 113 insertions(+), 90 deletions(-) diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py index 66c24e88b0..676f49f84b 100644 --- a/haystack/document_stores/faiss.py +++ b/haystack/document_stores/faiss.py @@ -116,7 +116,8 @@ def __init__( index=index, similarity=similarity, embedding_field=embedding_field, - progress_bar=progress_bar + progress_bar=progress_bar, + isolation_level=isolation_level ) if similarity in ("dot_product", "cosine"): diff --git a/haystack/document_stores/milvus.py b/haystack/document_stores/milvus.py index 6bf7982e42..538392c1bc 100644 --- a/haystack/document_stores/milvus.py +++ b/haystack/document_stores/milvus.py @@ -105,6 +105,7 @@ def __init__( embedding_dim=embedding_dim, index_file_size=index_file_size, similarity=similarity, index_type=index_type, index_param=index_param, search_param=search_param, duplicate_documents=duplicate_documents, return_embedding=return_embedding, embedding_field=embedding_field, progress_bar=progress_bar, + isolation_level=isolation_level ) self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool) diff --git a/haystack/document_stores/milvus2x.py b/haystack/document_stores/milvus2x.py index 0acd50f863..026c09bcea 100644 --- a/haystack/document_stores/milvus2x.py +++ b/haystack/document_stores/milvus2x.py @@ -128,6 +128,7 @@ def __init__( search_param=search_param, duplicate_documents=duplicate_documents, id_field=id_field, return_embedding=return_embedding, embedding_field=embedding_field, progress_bar=progress_bar, custom_fields=custom_fields, + isolation_level=isolation_level ) logger.warning("Milvus2DocumentStore is in experimental state until Milvus 2.0 is released") diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index 4f44f526a0..a7ef00f460 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -124,6 +124,8 @@ def __init__( create_engine_params = {} if isolation_level: create_engine_params["isolation_level"] = isolation_level + + logging.warning(f"create engine params: {create_engine_params}") if "sqlite" in url: engine = create_engine(url, connect_args={'check_same_thread': check_same_thread}, **create_engine_params) else: diff --git a/test/conftest.py b/test/conftest.py index 29387e6d61..b2bf0536cb 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -42,6 +42,11 @@ from haystack.nodes.question_generator import QuestionGenerator +# To manually run the tests with default PostgreSQL instead of SQLite, 
switch the lines below +SQL_TYPE = "sqlite" +# SQL_TYPE = "postgres" + + def pytest_addoption(parser): parser.addoption("--document_store_type", action="store", default="elasticsearch, faiss, memory, milvus, weaviate") @@ -508,11 +513,6 @@ def document_store_cosine_small(request, tmp_path): document_store.delete_documents() - -SQL_TYPE = "sqlite" -# SQL_TYPE = "postgres" - - @pytest.fixture def sql_url(tmp_path): if SQL_TYPE == "postgres": @@ -559,10 +559,11 @@ def teardown_postgres(): def get_document_store(document_store_type, tmp_path, embedding_dim=768, embedding_field="embedding", index="haystack_test", similarity:str="dot_product"): if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]: + logging.warning("Setting up PostgreSQL...") setup_postgres() if document_store_type == "sql": - document_store = SQLDocumentStore(url="sqlite://", index=index) + document_store = SQLDocumentStore(url=get_sql_url(tmp_path), index=index, isolation_level="AUTOCOMMIT") elif document_store_type == "memory": document_store = InMemoryDocumentStore( @@ -584,7 +585,7 @@ def get_document_store(document_store_type, tmp_path, embedding_dim=768, embeddi embedding_field=embedding_field, index=index, similarity=similarity, - isolation_level="AUTOCOMMIT" if SQL_TYPE == "postgres" else None + isolation_level="AUTOCOMMIT"# if SQL_TYPE == "postgres" else None ) elif document_store_type == "milvus": @@ -595,7 +596,7 @@ def get_document_store(document_store_type, tmp_path, embedding_dim=768, embeddi embedding_field=embedding_field, index=index, similarity=similarity, - isolation_level="AUTOCOMMIT" if SQL_TYPE == "postgres" else None + isolation_level="AUTOCOMMIT"# if SQL_TYPE == "postgres" else None ) _, collections = document_store.milvus_server.list_collections() for collection in collections: @@ -617,6 +618,7 @@ def get_document_store(document_store_type, tmp_path, embedding_dim=768, embeddi yield document_store if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]: + logging.warning("Tearing down PostgreSQL...") teardown_postgres() diff --git a/test/test_document_store.py b/test/test_document_store.py index 39bc5caf51..581edb7bb7 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -1,4 +1,6 @@ from unittest import mock +import uuid +import math import numpy as np import pandas as pd import pytest @@ -16,6 +18,18 @@ from haystack.document_stores.faiss import FAISSDocumentStore + +DOCUMENTS = [ + {"meta": {"name": "name_1", "year": "2020", "month": "01"}, "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_2", "year": "2020", "month": "02"}, "content": "text_2", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_3", "year": "2020", "month": "03"}, "content": "text_3", "embedding": np.random.rand(768).astype(np.float64)}, + {"meta": {"name": "name_4", "year": "2021", "month": "01"}, "content": "text_4", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_5", "year": "2021", "month": "02"}, "content": "text_5", "embedding": np.random.rand(768).astype(np.float32)}, + {"meta": {"name": "name_6", "year": "2021", "month": "03"}, "content": "text_6", "embedding": np.random.rand(768).astype(np.float64)}, +] + + + @pytest.mark.elasticsearch def test_init_elastic_client(): # defaults @@ -147,8 +161,8 @@ def test_get_all_documents_with_correct_filters(document_store_with_docs): assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"} -def 
test_get_all_documents_with_correct_filters_legacy_sqlite(test_docs_xs): - document_store_with_docs = get_document_store("sql") +def test_get_all_documents_with_correct_filters_legacy_sqlite(test_docs_xs, tmp_path): + document_store_with_docs = get_document_store("sql", tmp_path) document_store_with_docs.write_documents(test_docs_xs) document_store_with_docs.use_windowed_query = False @@ -819,7 +833,7 @@ def test_update_meta(document_store): @pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"]) def test_custom_embedding_field(document_store_type): document_store = get_document_store( - document_store_type=document_store_type, embedding_field="custom_embedding_field" + document_store_type=document_store_type, tmp_path=tmp_path, embedding_field="custom_embedding_field" ) doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)} document_store.write_documents([doc_to_write]) @@ -973,4 +987,83 @@ def test_custom_headers(document_store_with_docs: BaseDocumentStore): args, kwargs = mock_client.search.call_args assert "headers" in kwargs assert kwargs["headers"] == custom_headers - assert len(documents) > 0 \ No newline at end of file + assert len(documents) > 0 + + +def ensure_ids_are_correct_uuids(docs:list,document_store:object)->None: + # Weaviate currently only supports UUIDs + if type(document_store)==WeaviateDocumentStore: + for d in docs: + d["id"] = str(uuid.uuid4()) + + +def test_cosine_similarity(document_store_cosine): + # below we will write documents to the store and then query it to see if vectors were normalized + + ensure_ids_are_correct_uuids(docs=DOCUMENTS, document_store=document_store_cosine) + document_store_cosine.write_documents(documents=DOCUMENTS) + + # note that the same query will be used later when querying after updating the embeddings + query = np.random.rand(768).astype(np.float32) + + query_results = document_store_cosine.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True) + + # check if search with cosine similarity returns the correct number of results + assert len(query_results) == len(DOCUMENTS) + indexed_docs = {} + for doc in DOCUMENTS: + indexed_docs[doc["content"]] = doc["embedding"] + + for doc in query_results: + result_emb = doc.embedding + original_emb = np.array([indexed_docs[doc.content]], dtype="float32") + document_store_cosine.normalize_embedding(original_emb[0]) + + # check if the stored embedding was normalized + assert np.allclose(original_emb[0], result_emb, rtol=0.01) + + # check if the score is plausible for cosine similarity + assert 0 <= doc.score <= 1.0 + + # now check if vectors are normalized when updating embeddings + class MockRetriever(): + def embed_documents(self, docs): + return [np.random.rand(768).astype(np.float32) for doc in docs] + + retriever = MockRetriever() + document_store_cosine.update_embeddings(retriever=retriever) + query_results = document_store_cosine.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True) + + for doc in query_results: + original_emb = np.array([indexed_docs[doc.content]], dtype="float32") + document_store_cosine.normalize_embedding(original_emb[0]) + # check if the original embedding has changed after updating the embeddings + assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01) + + +def test_normalize_embeddings_diff_shapes(document_store_cosine_small): + VEC_1 = np.array([.1, .2, .3], dtype="float32") + document_store_cosine_small.normalize_embedding(VEC_1) + 
assert np.linalg.norm(VEC_1) - 1 < 0.01 + + VEC_1 = np.array([.1, .2, .3], dtype="float32").reshape(1, -1) + document_store_cosine_small.normalize_embedding(VEC_1) + assert np.linalg.norm(VEC_1) - 1 < 0.01 + + +def test_cosine_sanity_check(document_store_cosine_small): + VEC_1 = np.array([.1, .2, .3], dtype="float32") + VEC_2 = np.array([.4, .5, .6], dtype="float32") + + # This is the cosine similarity of VEC_1 and VEC_2 calculated using sklearn.metrics.pairwise.cosine_similarity + # The score is normalized to yield a value between 0 and 1. + KNOWN_COSINE = (0.9746317 + 1) / 2 + + docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}] + ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_cosine_small) + document_store_cosine_small.write_documents(documents=docs) + + query_results = document_store_cosine_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True) + + # check if faiss returns the same cosine similarity. Manual testing with faiss yielded 0.9746318 + assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.00002) diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py index 8e7ec6023e..3c45b26851 100644 --- a/test/test_faiss_and_milvus.py +++ b/test/test_faiss_and_milvus.py @@ -443,80 +443,3 @@ def test_faiss_passing_index_from_outside(tmp_path): # test if vectors ids are associated with docs for doc in documents_indexed: assert 0 <= int(doc.meta["vector_id"]) <= 7 - -def ensure_ids_are_correct_uuids(docs:list,document_store:object)->None: - # Weaviate currently only supports UUIDs - if type(document_store)==WeaviateDocumentStore: - for d in docs: - d["id"] = str(uuid.uuid4()) - -def test_cosine_similarity(document_store_cosine): - # below we will write documents to the store and then query it to see if vectors were normalized - - ensure_ids_are_correct_uuids(docs=DOCUMENTS,document_store=document_store_cosine) - document_store_cosine.write_documents(documents=DOCUMENTS) - - # note that the same query will be used later when querying after updating the embeddings - query = np.random.rand(768).astype(np.float32) - - query_results = document_store_cosine.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True) - - # check if search with cosine similarity returns the correct number of results - assert len(query_results) == len(DOCUMENTS) - indexed_docs = {} - for doc in DOCUMENTS: - indexed_docs[doc["content"]] = doc["embedding"] - - for doc in query_results: - result_emb = doc.embedding - original_emb = np.array([indexed_docs[doc.content]], dtype="float32") - document_store_cosine.normalize_embedding(original_emb[0]) - - # check if the stored embedding was normalized - assert np.allclose(original_emb[0], result_emb, rtol=0.01) - - # check if the score is plausible for cosine similarity - assert 0 <= doc.score <= 1.0 - - # now check if vectors are normalized when updating embeddings - class MockRetriever(): - def embed_documents(self, docs): - return [np.random.rand(768).astype(np.float32) for doc in docs] - - retriever = MockRetriever() - document_store_cosine.update_embeddings(retriever=retriever) - query_results = document_store_cosine.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True) - - for doc in query_results: - original_emb = np.array([indexed_docs[doc.content]], dtype="float32") - document_store_cosine.normalize_embedding(original_emb[0]) - # check if the original embedding has changed after updating the embeddings - assert not 
np.allclose(original_emb[0], doc.embedding, rtol=0.01) - - -def test_normalize_embeddings_diff_shapes(document_store_cosine_small): - VEC_1 = np.array([.1, .2, .3], dtype="float32") - document_store_cosine_small.normalize_embedding(VEC_1) - assert np.linalg.norm(VEC_1) - 1 < 0.01 - - VEC_1 = np.array([.1, .2, .3], dtype="float32").reshape(1, -1) - document_store_cosine_small.normalize_embedding(VEC_1) - assert np.linalg.norm(VEC_1) - 1 < 0.01 - - -def test_cosine_sanity_check(document_store_cosine_small): - VEC_1 = np.array([.1, .2, .3], dtype="float32") - VEC_2 = np.array([.4, .5, .6], dtype="float32") - - # This is the cosine similarity of VEC_1 and VEC_2 calculated using sklearn.metrics.pairwise.cosine_similarity - # The score is normalized to yield a value between 0 and 1. - KNOWN_COSINE = (0.9746317 + 1) / 2 - - docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}] - ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_cosine_small) - document_store_cosine_small.write_documents(documents=docs) - - query_results = document_store_cosine_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True) - - # check if faiss returns the same cosine similarity. Manual testing with faiss yielded 0.9746318 - assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.00002) From 463ba445c55c7b6732f7a1336d9373ce81d52d39 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 11 Jan 2022 18:18:21 +0100 Subject: [PATCH 06/21] Properly fix MetaDocumentORM and MetaLabelORM with composite foreign key constraints --- haystack/document_stores/sql.py | 53 ++++++++++++++------------------- test/test_document_store.py | 22 +++++++------- 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index a7ef00f460..3d218e9a43 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -4,7 +4,7 @@ import itertools import numpy as np from uuid import uuid4 -from sqlalchemy import and_, func, create_engine, Column, String, DateTime, ForeignKey, Boolean, Text, text, JSON +from sqlalchemy import and_, func, create_engine, Column, String, DateTime, ForeignKey, Boolean, Text, text, JSON, ForeignKeyConstraint from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import relationship, sessionmaker from sqlalchemy.sql import case, null @@ -20,7 +20,7 @@ class ORMBase(Base): __abstract__ = True - id = Column(String(100), default=lambda: str(uuid4()), unique=True, primary_key=True) + id = Column(String(100), default=lambda: str(uuid4()), primary_key=True) created_at = Column(DateTime, server_default=func.now()) updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now()) @@ -33,9 +33,6 @@ class DocumentORM(ORMBase): # primary key in combination with id to allow the same doc in different indices index = Column(String(100), nullable=False, primary_key=True) vector_id = Column(String(100), unique=True, nullable=True) - - # labels = relationship("LabelORM", back_populates="document") - # speeds up queries for get_documents_by_vector_ids() by having a single query that returns joined metadata meta = relationship("MetaDocumentORM", back_populates="documents", lazy="joined") @@ -45,36 +42,18 @@ class MetaDocumentORM(ORMBase): name = Column(String(100), index=True) value = Column(String(1000), index=True) - document_id = Column( - String(100), - ForeignKey("document.id", ondelete="CASCADE", onupdate="CASCADE"), - nullable=False, 
- index=True - ) - documents = relationship("DocumentORM", back_populates="meta") - -class MetaLabelORM(ORMBase): - __tablename__ = "meta_label" - - name = Column(String(100), index=True) - value = Column(String(1000), index=True) - label_id = Column( - String(100), - ForeignKey("label.id", ondelete="CASCADE", onupdate="CASCADE"), - nullable=False, - index=True - ) - - labels = relationship("LabelORM", back_populates="meta") + document_id = Column(String(100), nullable=False, index=True) + document_index = Column(String(100), nullable=False, index=True) + __table_args__ = (ForeignKeyConstraint([document_id, document_index], + [DocumentORM.id, DocumentORM.index], + ondelete="CASCADE", onupdate="CASCADE"), {}) class LabelORM(ORMBase): __tablename__ = "label" - # document_id = Column(String(100), ForeignKey("document.id", ondelete="CASCADE", onupdate="CASCADE"), nullable=False) - index = Column(String(100), nullable=False, primary_key=True) query = Column(Text, nullable=False) answer = Column(JSON, nullable=True) @@ -86,7 +65,21 @@ class LabelORM(ORMBase): pipeline_id = Column(String(500), nullable=True) meta = relationship("MetaLabelORM", back_populates="labels", lazy="joined") - # document = relationship("DocumentORM", back_populates="labels") + + +class MetaLabelORM(ORMBase): + __tablename__ = "meta_label" + + name = Column(String(100), index=True) + value = Column(String(1000), index=True) + labels = relationship("LabelORM", back_populates="meta") + + label_id = Column(String(100), nullable=False, index=True) + label_index = Column(String(100), nullable=False, index=True) + __table_args__ = (ForeignKeyConstraint([label_id, label_index], + [LabelORM.id, LabelORM.index], + ondelete="CASCADE", onupdate="CASCADE"), {}) + class SQLDocumentStore(BaseDocumentStore): @@ -130,7 +123,7 @@ def __init__( engine = create_engine(url, connect_args={'check_same_thread': check_same_thread}, **create_engine_params) else: engine = create_engine(url, **create_engine_params) - ORMBase.metadata.create_all(engine) + Base.metadata.create_all(engine) Session = sessionmaker(bind=engine) self.session = Session() self.index: str = index diff --git a/test/test_document_store.py b/test/test_document_store.py index 581edb7bb7..9b05716e10 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -162,18 +162,18 @@ def test_get_all_documents_with_correct_filters(document_store_with_docs): def test_get_all_documents_with_correct_filters_legacy_sqlite(test_docs_xs, tmp_path): - document_store_with_docs = get_document_store("sql", tmp_path) - document_store_with_docs.write_documents(test_docs_xs) + for document_store_with_docs in get_document_store("sql", tmp_path): + document_store_with_docs.write_documents(test_docs_xs) - document_store_with_docs.use_windowed_query = False - documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]}) - assert len(documents) == 1 - assert documents[0].meta["name"] == "filename2" + document_store_with_docs.use_windowed_query = False + documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]}) + assert len(documents) == 1 + assert documents[0].meta["name"] == "filename2" - documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]}) - assert len(documents) == 2 - assert {d.meta["name"] for d in documents} == {"filename1", "filename3"} - assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"} + documents = 
document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]}) + assert len(documents) == 2 + assert {d.meta["name"] for d in documents} == {"filename1", "filename3"} + assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"} def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs): @@ -831,7 +831,7 @@ def test_update_meta(document_store): @pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"]) -def test_custom_embedding_field(document_store_type): +def test_custom_embedding_field(document_store_type, tmp_path): document_store = get_document_store( document_store_type=document_store_type, tmp_path=tmp_path, embedding_field="custom_embedding_field" ) From bb534eee209fbc7a43d0a81e0d92fe2fa2d353c6 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 11 Jan 2022 18:39:14 +0100 Subject: [PATCH 07/21] Fix mypy --- haystack/document_stores/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index 3d218e9a43..a29c90c04d 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -48,7 +48,7 @@ class MetaDocumentORM(ORMBase): document_index = Column(String(100), nullable=False, index=True) __table_args__ = (ForeignKeyConstraint([document_id, document_index], [DocumentORM.id, DocumentORM.index], - ondelete="CASCADE", onupdate="CASCADE"), {}) + ondelete="CASCADE", onupdate="CASCADE"), {}) #type: ignore class LabelORM(ORMBase): @@ -78,7 +78,7 @@ class MetaLabelORM(ORMBase): label_index = Column(String(100), nullable=False, index=True) __table_args__ = (ForeignKeyConstraint([label_id, label_index], [LabelORM.id, LabelORM.index], - ondelete="CASCADE", onupdate="CASCADE"), {}) + ondelete="CASCADE", onupdate="CASCADE"), {}) #type: ignore From 0e2ebfba0e6c7a235005755263ca3209111b0928 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 12 Jan 2022 10:38:34 +0100 Subject: [PATCH 08/21] update_document_meta() was not using index properly --- haystack/document_stores/elasticsearch.py | 4 +++- haystack/document_stores/sql.py | 11 +++++------ haystack/document_stores/weaviate.py | 4 +++- test/conftest.py | 2 +- test/test_document_store.py | 18 ++++++++---------- test/test_weaviate.py | 8 ++++---- 6 files changed, 24 insertions(+), 23 deletions(-) diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 6e3951bba8..10e0046adf 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -551,10 +551,12 @@ def write_labels( if labels_to_index: bulk(self.client, labels_to_index, request_timeout=300, refresh=self.refresh_type, headers=headers) - def update_document_meta(self, id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None): + def update_document_meta(self, id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None, index: str = None): """ Update the metadata dictionary of a document by specifying its string id """ + if not index: + index = self.index body = {"doc": meta} self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type, headers=headers) diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py index a29c90c04d..a5c3c6d493 100644 --- a/haystack/document_stores/sql.py +++ b/haystack/document_stores/sql.py @@ -113,12 +113,9 @@ def __init__( self.set_config( url=url, index=index, label_index=label_index, duplicate_documents=duplicate_documents, 
From 0e2ebfba0e6c7a235005755263ca3209111b0928 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 12 Jan 2022 10:38:34 +0100
Subject: [PATCH 08/21] update_document_meta() was not using index properly

---
 haystack/document_stores/elasticsearch.py |  4 +++-
 haystack/document_stores/sql.py           | 11 +++++------
 haystack/document_stores/weaviate.py      |  4 +++-
 test/conftest.py                          |  2 +-
 test/test_document_store.py               | 18 ++++++++----------
 test/test_weaviate.py                     |  8 ++++----
 6 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index 6e3951bba8..10e0046adf 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -551,10 +551,12 @@ def write_labels(
         if labels_to_index:
             bulk(self.client, labels_to_index, request_timeout=300, refresh=self.refresh_type, headers=headers)
 
-    def update_document_meta(self, id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None):
+    def update_document_meta(self, id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None, index: str = None):
         """
         Update the metadata dictionary of a document by specifying its string id
         """
+        if not index:
+            index = self.index
         body = {"doc": meta}
         self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type, headers=headers)
 
diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py
index a29c90c04d..a5c3c6d493 100644
--- a/haystack/document_stores/sql.py
+++ b/haystack/document_stores/sql.py
@@ -113,12 +113,9 @@ def __init__(
         self.set_config(
             url=url, index=index, label_index=label_index, duplicate_documents=duplicate_documents, check_same_thread=check_same_thread
         )
-
         create_engine_params = {}
         if isolation_level:
             create_engine_params["isolation_level"] = isolation_level
-
-        logging.warning(f"create engine params: {create_engine_params}")
         if "sqlite" in url:
             engine = create_engine(url, connect_args={'check_same_thread': check_same_thread}, **create_engine_params)
         else:
@@ -461,12 +458,14 @@ def reset_vector_ids(self, index: Optional[str] = None):
         self.session.query(DocumentORM).filter_by(index=index).update({DocumentORM.vector_id: null()})
         self.session.commit()
 
-    def update_document_meta(self, id: str, meta: Dict[str, str]):
+    def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None):
         """
         Update the metadata dictionary of a document by specifying its string id
         """
-        self.session.query(MetaDocumentORM).filter_by(document_id=id).delete()
-        meta_orms = [MetaDocumentORM(name=key, value=value, document_id=id) for key, value in meta.items()]
+        if not index:
+            index = self.index
+        self.session.query(MetaDocumentORM).filter_by(document_id=id, document_index=index).delete()
+        meta_orms = [MetaDocumentORM(name=key, value=value, document_id=id, document_index=index) for key, value in meta.items()]
         for m in meta_orms:
             self.session.add(m)
         self.session.commit()
diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py
index e17ff98fbe..2f4561cb94 100644
--- a/haystack/document_stores/weaviate.py
+++ b/haystack/document_stores/weaviate.py
@@ -483,10 +483,12 @@ def write_documents(
             progress_bar.update(batch_size)
         progress_bar.close()
 
-    def update_document_meta(self, id: str, meta: Dict[str, str]):
+    def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None):
         """
         Update the metadata dictionary of a document by specifying its string id.
         """
+        if not index:
+            index = self.index
         self.weaviate_client.data_object.update(meta, class_name=self.index, uuid=id)
 
     def get_embedding_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
diff --git a/test/conftest.py b/test/conftest.py
index b2bf0536cb..aadde5368e 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -44,7 +44,7 @@
 # To manually run the tests with default PostgreSQL instead of SQLite, switch the lines below
 SQL_TYPE = "sqlite"
-# SQL_TYPE = "postgres"
+#SQL_TYPE = "postgres"
 
 
 def pytest_addoption(parser):
diff --git a/test/test_document_store.py b/test/test_document_store.py
index 9b05716e10..44029aa797 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -804,7 +804,7 @@ def test_multilabel_no_answer(document_store):
     assert len(multi_labels[0].answers) == 1
 
 
-@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss"], indirect=True)
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "milvus", "weaviate"], indirect=True)
 # Currently update_document_meta() is not implemented for Memory doc store
 def test_update_meta(document_store):
     documents = [
@@ -832,15 +832,13 @@ def test_update_meta(document_store):
 
 @pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"])
 def test_custom_embedding_field(document_store_type, tmp_path):
-    document_store = get_document_store(
-        document_store_type=document_store_type, tmp_path=tmp_path, embedding_field="custom_embedding_field"
-    )
-    doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
-    document_store.write_documents([doc_to_write])
-    documents = document_store.get_all_documents(return_embedding=True)
-    assert len(documents) == 1
-    assert documents[0].content == "test"
-    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
+    for document_store in get_document_store(document_store_type=document_store_type, tmp_path=tmp_path, embedding_field="custom_embedding_field"):
+        doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
+        document_store.write_documents([doc_to_write])
+        documents = document_store.get_all_documents(return_embedding=True)
+        assert len(documents) == 1
+        assert documents[0].content == "test"
+        np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
 
 
 @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
diff --git a/test/test_weaviate.py b/test/test_weaviate.py
index b6a3de9775..49ece2f52c 100644
--- a/test/test_weaviate.py
+++ b/test/test_weaviate.py
@@ -30,16 +30,16 @@ def get_uuid():
 
 
 @pytest.fixture(params=["weaviate"])
-def document_store_with_docs(request):
-    document_store = get_document_store(request.param)
+def document_store_with_docs(request, tmp_path):
+    document_store = get_document_store(request.param, tmp_path=tmp_path)
     document_store.write_documents(DOCUMENTS_XS)
     yield document_store
     document_store.delete_documents()
 
 
 @pytest.fixture(params=["weaviate"])
-def document_store(request):
-    document_store = get_document_store(request.param)
+def document_store(request, tmp_path):
+    document_store = get_document_store(request.param, tmp_path=tmp_path)
     yield document_store
     document_store.delete_documents()
From c1589a898c8cd3560869ff60192e82e45f817bb0 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 12 Jan 2022 09:39:27 +0000
Subject: [PATCH 09/21] Add latest docstring and tutorial changes

---
 docs/_src/api/api/document_store.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index cb14d575b3..eaf9747e21 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -380,7 +380,7 @@ Write annotation labels into document store.
 #### update\_document\_meta
 
 ```python
- | update_document_meta(id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None)
+ | update_document_meta(id: str, meta: Dict[str, str], headers: Optional[Dict[str, str]] = None, index: str = None)
 ```
 
 Update the metadata dictionary of a document by specifying its string id
@@ -1095,7 +1095,7 @@ Set vector IDs for all documents as None
 #### update\_document\_meta
 
 ```python
- | update_document_meta(id: str, meta: Dict[str, str])
+ | update_document_meta(id: str, meta: Dict[str, str], index: str = None)
 ```
 
 Update the metadata dictionary of a document by specifying its string id
@@ -1863,7 +1863,7 @@ None
 #### update\_document\_meta
 
 ```python
- | update_document_meta(id: str, meta: Dict[str, str])
+ | update_document_meta(id: str, meta: Dict[str, str], index: str = None)
 ```
 
 Update the metadata dictionary of a document by specifying its string id.

From 681b0365f91b09435d6f2a0553f73d3779c3508d Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 12 Jan 2022 12:25:17 +0100
Subject: [PATCH 10/21] Another small fix for test_weaviate.py

---
 test/conftest.py      |  2 +-
 test/test_weaviate.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index aadde5368e..05773bb773 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -558,7 +558,7 @@ def teardown_postgres():
 
 def get_document_store(document_store_type, tmp_path, embedding_dim=768, embedding_field="embedding", index="haystack_test", similarity:str="dot_product"):
-    if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]:
+    if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus", "sql"]:
         logging.warning("Setting up PostgreSQL...")
         setup_postgres()
 
diff --git a/test/test_weaviate.py b/test/test_weaviate.py
index 49ece2f52c..60ddb8ca1c 100644
--- a/test/test_weaviate.py
+++ b/test/test_weaviate.py
@@ -31,17 +31,17 @@ def get_uuid():
 
 @pytest.fixture(params=["weaviate"])
 def document_store_with_docs(request, tmp_path):
-    document_store = get_document_store(request.param, tmp_path=tmp_path)
-    document_store.write_documents(DOCUMENTS_XS)
-    yield document_store
-    document_store.delete_documents()
+    for document_store in get_document_store(request.param, tmp_path=tmp_path):
+        document_store.write_documents(DOCUMENTS_XS)
+        yield document_store
+        document_store.delete_documents()
 
 
 @pytest.fixture(params=["weaviate"])
 def document_store(request, tmp_path):
-    document_store = get_document_store(request.param, tmp_path=tmp_path)
-    yield document_store
-    document_store.delete_documents()
+    for document_store in get_document_store(request.param, tmp_path=tmp_path):
+        yield document_store
+        document_store.delete_documents()
 
 
 @pytest.mark.weaviate
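At this point in the series `get_document_store()` is a generator (it yields the store, then runs PostgreSQL teardown after the yield), so fixtures must iterate it rather than call it directly; otherwise the code after the yield never runs. A minimal standalone sketch of the pattern, with hypothetical resource names:

```python
import pytest

def make_resource():
    # Generator-style factory: code before the yield is setup,
    # code after the yield is teardown.
    resource = {"connection": "open"}
    yield resource
    resource["connection"] = "closed"  # runs when iteration resumes

@pytest.fixture
def resource():
    # Iterating the generator keeps it alive for the test's duration, so the
    # factory's teardown runs when pytest resumes the fixture after the test.
    for res in make_resource():
        yield res
```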
From 65e7bf3f11a0b4c30f9807c040b380b5fb60f0f6 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Wed, 12 Jan 2022 13:52:12 +0100
Subject: [PATCH 11/21] Improve docstrings for 'isolation_level'

---
 haystack/document_stores/faiss.py    | 1 +
 haystack/document_stores/milvus.py   | 1 +
 haystack/document_stores/milvus2x.py | 3 ++-
 haystack/document_stores/sql.py      | 2 +-
 4 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py
index 676f49f84b..f79568b7c5 100644
--- a/haystack/document_stores/faiss.py
+++ b/haystack/document_stores/faiss.py
@@ -95,6 +95,7 @@ def __init__(
                                   If specified no other params besides faiss_config_path must be specified.
         :param faiss_config_path: Stored FAISS initial configuration parameters.
                                   Can be created via calling `save()`
+        :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
         """
         # special case if we want to load an existing index from disk
         # load init params from disk and run init again
diff --git a/haystack/document_stores/milvus.py b/haystack/document_stores/milvus.py
index 538392c1bc..4eeec700bf 100644
--- a/haystack/document_stores/milvus.py
+++ b/haystack/document_stores/milvus.py
@@ -98,6 +98,7 @@ def __init__(
                                     overwrite: Update any existing documents with the same ID when adding documents.
                                     fail: an error is raised if the document ID of the document being added already exists.
+        :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
         """
         # save init parameters to enable export of component config as YAML
         self.set_config(
diff --git a/haystack/document_stores/milvus2x.py b/haystack/document_stores/milvus2x.py
index 026c09bcea..4ea6626612 100644
--- a/haystack/document_stores/milvus2x.py
+++ b/haystack/document_stores/milvus2x.py
@@ -73,7 +73,7 @@ def __init__(
         custom_fields: Optional[List[Any]] = None,
         progress_bar: bool = True,
         duplicate_documents: str = 'overwrite',
-        isolation_level: str = None,
+        isolation_level: str = None
     ):
         """
         :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale
@@ -119,6 +119,7 @@ def __init__(
                                     overwrite: Update any existing documents with the same ID when adding documents.
                                     fail: an error is raised if the document ID of the document being added already exists.
+        :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
         """
 
         # save init parameters to enable export of component config as YAML
diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py
index a5c3c6d493..7ef5db7912 100644
--- a/haystack/document_stores/sql.py
+++ b/haystack/document_stores/sql.py
@@ -106,7 +106,7 @@ def __init__(
                                     fail: an error is raised if the document ID of the document being added already exists.
         :param check_same_thread: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior)
-        :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()`
+        :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
         """
 
         # save init parameters to enable export of component config as YAML
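The parameter being documented here is forwarded unchanged to SQLAlchemy's `create_engine()`, and matters mostly for the PostgreSQL-backed tests in this series. A hedged sketch of both ways a caller might set it; the connection URL is a placeholder for a locally running instance:

```python
# Sketch: how isolation_level flows into SQLAlchemy.
from sqlalchemy import create_engine
from haystack.document_stores.faiss import FAISSDocumentStore

# Direct SQLAlchemy equivalent of what the store builds internally:
engine = create_engine(
    "postgresql://postgres:postgres@127.0.0.1/postgres",
    isolation_level="AUTOCOMMIT",  # each statement commits on its own
)

# Through the document store, the same value is passed to create_engine():
document_store = FAISSDocumentStore(
    sql_url="postgresql://postgres:postgres@127.0.0.1/postgres",
    isolation_level="AUTOCOMMIT",
)
```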
From 9c3d8fa4cc4c93b360d604eb339c4b101a070bc2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 12 Jan 2022 12:53:05 +0000
Subject: [PATCH 12/21] Add latest docstring and tutorial changes

---
 docs/_src/api/api/document_store.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index eaf9747e21..bd37d120b5 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -970,7 +970,7 @@ An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL bac
                               fail: an error is raised if the document ID of the document being added already exists.
 - `check_same_thread`: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior)
-- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()`
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
 
 #### get\_document\_by\_id
 
@@ -1249,6 +1249,7 @@ the vector embeddings are indexed in a FAISS Index.
                            If specified no other params besides faiss_config_path must be specified.
 - `faiss_config_path`: Stored FAISS initial configuration parameters.
                        Can be created via calling `save()`
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
 
 #### write\_documents
 
@@ -1526,6 +1527,7 @@ Note that an overly large index_file_size value may cause failure to load a segm
                               overwrite: Update any existing documents with the same ID when adding documents.
                               fail: an error is raised if the document ID of the document being added already exists.
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
 
 #### write\_documents

From 115f072b61a7495564aa36f0aada81d0ff2d3f99 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Thu, 13 Jan 2022 09:50:44 +0100
Subject: [PATCH 13/21] Implement Thomas' feedback

---
 haystack/document_stores/elasticsearch.py |  2 +-
 haystack/document_stores/weaviate.py      |  2 +-
 test/conftest.py                          | 72 ++++++++++++++---------
 test/test_document_store.py               | 34 +++++------
 test/test_weaviate.py                     | 14 ++---
 5 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py
index 10e0046adf..4a4072004d 100644
--- a/haystack/document_stores/elasticsearch.py
+++ b/haystack/document_stores/elasticsearch.py
@@ -558,7 +558,7 @@ def update_document_meta(self, id: str, meta: Dict[str, str], headers: Optional[
         if not index:
             index = self.index
         body = {"doc": meta}
-        self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type, headers=headers)
+        self.client.update(index=index, id=id, body=body, refresh=self.refresh_type, headers=headers)
 
     def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None,
                            only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int:
diff --git a/haystack/document_stores/weaviate.py b/haystack/document_stores/weaviate.py
index 2f4561cb94..508e33631c 100644
--- a/haystack/document_stores/weaviate.py
+++ b/haystack/document_stores/weaviate.py
@@ -489,7 +489,7 @@ def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None)
         """
         if not index:
             index = self.index
-        self.weaviate_client.data_object.update(meta, class_name=self.index, uuid=id)
+        self.weaviate_client.data_object.update(meta, class_name=index, uuid=id)
 
     def get_embedding_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
         """
diff --git a/test/conftest.py b/test/conftest.py
index 05773bb773..44e23d295b 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -44,7 +44,7 @@
 # To manually run the tests with default PostgreSQL instead of SQLite, switch the lines below
 SQL_TYPE = "sqlite"
-#SQL_TYPE = "postgres"
+# SQL_TYPE = "postgres"
 
 
 def pytest_addoption(parser):
@@ -486,43 +486,57 @@ def get_retriever(retriever_type, document_store):
 
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"])
 def document_store_with_docs(request, test_docs_xs, tmp_path):
-    for document_store in get_document_store(request.param, tmp_path=tmp_path):
-        document_store.write_documents(test_docs_xs)
-        yield document_store
-        document_store.delete_documents()
+    document_store = get_document_store(request.param, tmp_path=tmp_path)
+    document_store.write_documents(test_docs_xs)
+    yield document_store
+    document_store.delete_documents()
 
 
 @pytest.fixture
 def document_store(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    for document_store in get_document_store(request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path):
-        yield document_store
-        document_store.delete_documents()
+    document_store = get_document_store(request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path)
+    yield document_store
+    document_store.delete_documents()
 
 
 @pytest.fixture(params=["faiss", "milvus", "weaviate"])
 def document_store_cosine(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    for document_store in get_document_store(request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path):
-        yield document_store
-        document_store.delete_documents()
+    document_store = get_document_store(request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path)
+    yield document_store
+    document_store.delete_documents()
 
 
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"])
 def document_store_cosine_small(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(3))
-    for document_store in get_document_store(request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path):
-        yield document_store
-        document_store.delete_documents()
+    document_store = get_document_store(request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path)
+    yield document_store
+    document_store.delete_documents()
 
 
-@pytest.fixture
-def sql_url(tmp_path):
+@pytest.fixture(scope="function", autouse=True)
+def postgres_fixture():
     if SQL_TYPE == "postgres":
-        try:
-            setup_postgres()
-            yield get_sql_url(tmp_path)
-        finally:
-            teardown_postgres()
+        setup_postgres()
+        yield
+        teardown_postgres()
     else:
-        yield get_sql_url(tmp_path)
+        yield
+
+
+# @pytest.fixture
+# def sql_url(tmp_path):
+#     if SQL_TYPE == "postgres":
+#         try:
+#             setup_postgres()
+#             yield get_sql_url(tmp_path)
+#         finally:
+#             teardown_postgres()
+#     else:
+#         yield get_sql_url(tmp_path)
+
+
+@pytest.fixture
+def sql_url(tmp_path):
+    return get_sql_url(tmp_path)
 
 
 def get_sql_url(tmp_path):
@@ -558,9 +572,9 @@ def teardown_postgres():
 
 def get_document_store(document_store_type, tmp_path, embedding_dim=768, embedding_field="embedding", index="haystack_test", similarity:str="dot_product"):
-    if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus", "sql"]:
-        logging.warning("Setting up PostgreSQL...")
-        setup_postgres()
+    # if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus", "sql"]:
+    #     logging.warning("Setting up PostgreSQL...")
+    #     setup_postgres()
 
     if document_store_type == "sql":
         document_store = SQLDocumentStore(url=get_sql_url(tmp_path), index=index, isolation_level="AUTOCOMMIT")
@@ -615,11 +629,11 @@ def get_document_store(document_store_type, tmp_path, embedding_dim=768, embeddi
     else:
         raise Exception(f"No document store fixture for '{document_store_type}'")
 
-    yield document_store
+    return document_store
 
-    if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]:
-        logging.warning("Tearing down PostgreSQL...")
-        teardown_postgres()
+    # if SQL_TYPE == "postgres" and document_store_type in ["faiss", "milvus"]:
+    #     logging.warning("Tearing down PostgreSQL...")
+    #     teardown_postgres()
 
 
 @pytest.fixture(scope="function")
diff --git a/test/test_document_store.py b/test/test_document_store.py
index 44029aa797..eb5b79cc64 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -162,18 +162,18 @@ def test_get_all_documents_with_correct_filters(document_store_with_docs):
 
 
 def test_get_all_documents_with_correct_filters_legacy_sqlite(test_docs_xs, tmp_path):
-    for document_store_with_docs in get_document_store("sql", tmp_path):
-        document_store_with_docs.write_documents(test_docs_xs)
+    document_store_with_docs = get_document_store("sql", tmp_path)
+    document_store_with_docs.write_documents(test_docs_xs)
 
-        document_store_with_docs.use_windowed_query = False
-        documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
-        assert len(documents) == 1
-        assert documents[0].meta["name"] == "filename2"
+    document_store_with_docs.use_windowed_query = False
+    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test2"]})
+    assert len(documents) == 1
+    assert documents[0].meta["name"] == "filename2"
 
-        documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
-        assert len(documents) == 2
-        assert {d.meta["name"] for d in documents} == {"filename1", "filename3"}
-        assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"}
+    documents = document_store_with_docs.get_all_documents(filters={"meta_field": ["test1", "test3"]})
+    assert len(documents) == 2
+    assert {d.meta["name"] for d in documents} == {"filename1", "filename3"}
+    assert {d.meta["meta_field"] for d in documents} == {"test1", "test3"}
 
 
 def test_get_all_documents_with_incorrect_filter_name(document_store_with_docs):
@@ -832,13 +832,13 @@ def test_update_meta(document_store):
 
 @pytest.mark.parametrize("document_store_type", ["elasticsearch", "memory"])
 def test_custom_embedding_field(document_store_type, tmp_path):
-    for document_store in get_document_store(document_store_type=document_store_type, tmp_path=tmp_path, embedding_field="custom_embedding_field"):
-        doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
-        document_store.write_documents([doc_to_write])
-        documents = document_store.get_all_documents(return_embedding=True)
-        assert len(documents) == 1
-        assert documents[0].content == "test"
-        np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
+    document_store = get_document_store(document_store_type=document_store_type, tmp_path=tmp_path, embedding_field="custom_embedding_field")
+    doc_to_write = {"content": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)}
+    document_store.write_documents([doc_to_write])
+    documents = document_store.get_all_documents(return_embedding=True)
+    assert len(documents) == 1
+    assert documents[0].content == "test"
+    np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding)
 
 
 @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True)
diff --git a/test/test_weaviate.py b/test/test_weaviate.py
index 60ddb8ca1c..49ece2f52c 100644
--- a/test/test_weaviate.py
+++ b/test/test_weaviate.py
@@ -31,17 +31,17 @@ def get_uuid():
 
 @pytest.fixture(params=["weaviate"])
 def document_store_with_docs(request, tmp_path):
-    for document_store in get_document_store(request.param, tmp_path=tmp_path):
-        document_store.write_documents(DOCUMENTS_XS)
-        yield document_store
-        document_store.delete_documents()
+    document_store = get_document_store(request.param, tmp_path=tmp_path)
+    document_store.write_documents(DOCUMENTS_XS)
+    yield document_store
+    document_store.delete_documents()
 
 
 @pytest.fixture(params=["weaviate"])
 def document_store(request, tmp_path):
-    for document_store in get_document_store(request.param, tmp_path=tmp_path):
-        yield document_store
-        document_store.delete_documents()
+    document_store = get_document_store(request.param, tmp_path=tmp_path)
+    yield document_store
+    document_store.delete_documents()
 
 
 @pytest.mark.weaviate
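The new `postgres_fixture` wraps every test automatically (`autouse=True`), which is what lets `get_document_store()` become a plain function again (`return` instead of `yield`). A reduced sketch of that shape; the setup/teardown helpers and `BACKEND` switch are hypothetical stand-ins for the conftest's PostgreSQL helpers and `SQL_TYPE`:

```python
import pytest

BACKEND = "sqlite"  # stand-in for conftest's SQL_TYPE switch

def setup_db():       # hypothetical helper
    print("setting up database")

def teardown_db():    # hypothetical helper
    print("tearing down database")

@pytest.fixture(scope="function", autouse=True)
def db_fixture():
    # autouse=True: runs around every test without being requested by name.
    if BACKEND == "postgres":
        setup_db()
        yield
        teardown_db()
    else:
        yield
```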
From d2a6ad5d88ade265fe9419505cd36dbf188e4bfb Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Thu, 13 Jan 2022 11:36:49 +0100
Subject: [PATCH 14/21] Fix bug introduced in merge

---
 test/conftest.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index cf0043e100..d0c3e0f1af 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -487,7 +487,7 @@ def get_retriever(retriever_type, document_store):
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"])
 def document_store_with_docs(request, test_docs_xs, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    document_store = get_document_store(request.param, embedding_dim.args[0], tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path)
     document_store.write_documents(test_docs_xs)
     yield document_store
     document_store.delete_documents()
@@ -495,21 +495,21 @@ def document_store_with_docs(request, test_docs_xs, tmp_path):
 @pytest.fixture
 def document_store(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    document_store = get_document_store(request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim=embedding_dim.args[0], tmp_path=tmp_path)
     yield document_store
     document_store.delete_documents()
 
 @pytest.fixture(params=["memory", "faiss", "milvus", "elasticsearch"])
 def document_store_dot_product(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    document_store = get_document_store(request.param, embedding_dim.args[0], similarity="dot_product", tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], similarity="dot_product", tmp_path=tmp_path)
     yield document_store
     document_store.delete_documents()
 
 @pytest.fixture(params=["memory", "faiss", "milvus", "elasticsearch"])
 def document_store_dot_product_with_docs(request, test_docs_xs, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    document_store = get_document_store(request.param, embedding_dim.args[0], similarity="dot_product", tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], similarity="dot_product", tmp_path=tmp_path)
     document_store.write_documents(test_docs_xs)
     yield document_store
     document_store.delete_documents()
@@ -517,14 +517,14 @@ def document_store_dot_product_with_docs(request, test_docs_xs, tmp_path):
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus"])
 def document_store_dot_product_small(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(3))
-    document_store = get_document_store(request.param, embedding_dim.args[0], similarity="dot_product", tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], similarity="dot_product", tmp_path=tmp_path)
     yield document_store
     document_store.delete_documents()
 
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"])
 def document_store_cosine_small(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(3))
-    document_store = get_document_store(request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path)
     yield document_store
     document_store.delete_documents()

From b96f83bb99a2beace8b862238572a4c7cf91cdb5 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Thu, 13 Jan 2022 13:42:54 +0100
Subject: [PATCH 15/21] Typo

---
 test/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/conftest.py b/test/conftest.py
index d0c3e0f1af..426193e438 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -495,7 +495,7 @@ def document_store_with_docs(request, test_docs_xs, tmp_path):
 @pytest.fixture
 def document_store(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
-    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim=embedding_dim.args[0], tmp_path=tmp_path)
+    document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path)
     yield document_store
     document_store.delete_documents()
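These two cleanup commits show why the merge fix switched every call to keyword arguments: `get_document_store(request.param, embedding_dim.args[0], tmp_path=tmp_path)` silently binds the embedding dimension to the positional `tmp_path` slot. A self-contained sketch of that failure mode; the function below is an illustrative stand-in, not the conftest helper itself:

```python
# Sketch of the merge bug: a positional argument landing in the wrong slot.
def get_store(store_type, tmp_path, embedding_dim=768):
    return {"type": store_type, "path": tmp_path, "dim": embedding_dim}

# Buggy: 768 is silently bound to tmp_path; embedding_dim only stays 768
# because of the default, so nothing fails loudly.
broken = get_store("faiss", 768)
assert broken["path"] == 768  # not a path at all

# Fixed: keyword arguments cannot mis-bind, which is what these commits enforce.
fixed = get_store(store_type="faiss", tmp_path="/tmp/haystack", embedding_dim=768)
assert fixed["path"] == "/tmp/haystack"
```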
From 0cb19f4f5a82f6d7ae025068d4e7ad7c634b3c84 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Thu, 13 Jan 2022 14:05:52 +0100
Subject: [PATCH 16/21] Another typo from the merge

---
 test/conftest.py              |  2 +-
 test/test_document_store.py   | 32 +++++++-------
 test/test_faiss_and_milvus.py | 79 -----------------------------------
 3 files changed, 17 insertions(+), 96 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index 426193e438..06abc204d0 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -522,7 +522,7 @@ def document_store_dot_product_small(request, tmp_path):
     document_store.delete_documents()
 
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"])
-def document_store_cosine_small(request, tmp_path):
+def document_store_small(request, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(3))
     document_store = get_document_store(document_store_type=request.param, embedding_dim=embedding_dim.args[0], similarity="cosine", tmp_path=tmp_path)
     yield document_store
diff --git a/test/test_document_store.py b/test/test_document_store.py
index 0e7cb69198..3038026fb1 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1014,17 +1014,17 @@ def ensure_ids_are_correct_uuids(docs:list,document_store:object)->None:
         for d in docs:
             d["id"] = str(uuid.uuid4())
 
-
-def test_cosine_similarity(document_store_cosine):
+
+def test_cosine_similarity(document_store):
     # below we will write documents to the store and then query it to see if vectors were normalized
 
-    ensure_ids_are_correct_uuids(docs=DOCUMENTS, document_store=document_store_cosine)
-    document_store_cosine.write_documents(documents=DOCUMENTS)
+    ensure_ids_are_correct_uuids(docs=DOCUMENTS,document_store=document_store)
+    document_store.write_documents(documents=DOCUMENTS)
 
     # note that the same query will be used later when querying after updating the embeddings
     query = np.random.rand(768).astype(np.float32)
 
-    query_results = document_store_cosine.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
+    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
 
     # check if search with cosine similarity returns the correct number of results
     assert len(query_results) == len(DOCUMENTS)
@@ -1035,7 +1035,7 @@ def test_cosine_similarity(document_store_cosine):
     for doc in query_results:
         result_emb = doc.embedding
         original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
-        document_store_cosine.normalize_embedding(original_emb[0])
+        document_store.normalize_embedding(original_emb[0])
 
         # check if the stored embedding was normalized
         assert np.allclose(original_emb[0], result_emb, rtol=0.01)
@@ -1049,27 +1049,27 @@ def embed_documents(self, docs):
             return [np.random.rand(768).astype(np.float32) for doc in docs]
 
     retriever = MockRetriever()
-    document_store_cosine.update_embeddings(retriever=retriever)
-    query_results = document_store_cosine.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
+    document_store.update_embeddings(retriever=retriever)
+    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
 
     for doc in query_results:
         original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
-        document_store_cosine.normalize_embedding(original_emb[0])
+        document_store.normalize_embedding(original_emb[0])
         # check if the original embedding has changed after updating the embeddings
         assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
 
 
-def test_normalize_embeddings_diff_shapes(document_store_cosine_small):
+def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
     VEC_1 = np.array([.1, .2, .3], dtype="float32")
-    document_store_cosine_small.normalize_embedding(VEC_1)
+    document_store_dot_product_small.normalize_embedding(VEC_1)
     assert np.linalg.norm(VEC_1) - 1 < 0.01
 
     VEC_1 = np.array([.1, .2, .3], dtype="float32").reshape(1, -1)
-    document_store_cosine_small.normalize_embedding(VEC_1)
+    document_store_dot_product_small.normalize_embedding(VEC_1)
     assert np.linalg.norm(VEC_1) - 1 < 0.01
 
 
-def test_cosine_sanity_check(document_store_cosine_small):
+def test_cosine_sanity_check(document_store_small):
     VEC_1 = np.array([.1, .2, .3], dtype="float32")
     VEC_2 = np.array([.4, .5, .6], dtype="float32")
 
@@ -1078,10 +1078,10 @@ def test_cosine_sanity_check(document_store_cosine_small):
     KNOWN_COSINE = (0.9746317 + 1) / 2
 
     docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}]
-    ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_cosine_small)
-    document_store_cosine_small.write_documents(documents=docs)
+    ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_small)
+    document_store_small.write_documents(documents=docs)
 
-    query_results = document_store_cosine_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True)
+    query_results = document_store_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True)
 
     # check if faiss returns the same cosine similarity. Manual testing with faiss yielded 0.9746318
     assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.00002)
diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py
index 7e4fc01b23..965c28f507 100644
--- a/test/test_faiss_and_milvus.py
+++ b/test/test_faiss_and_milvus.py
@@ -445,82 +445,3 @@ def test_faiss_passing_index_from_outside(tmp_path):
     # test if vectors ids are associated with docs
     for doc in documents_indexed:
         assert 0 <= int(doc.meta["vector_id"]) <= 7
-
-
-def ensure_ids_are_correct_uuids(docs:list,document_store:object)->None:
-    # Weaviate currently only supports UUIDs
-    if type(document_store)==WeaviateDocumentStore:
-        for d in docs:
-            d["id"] = str(uuid.uuid4())
-
-def test_cosine_similarity(document_store):
-    # below we will write documents to the store and then query it to see if vectors were normalized
-
-    ensure_ids_are_correct_uuids(docs=DOCUMENTS,document_store=document_store)
-    document_store.write_documents(documents=DOCUMENTS)
-
-    # note that the same query will be used later when querying after updating the embeddings
-    query = np.random.rand(768).astype(np.float32)
-
-    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
-
-    # check if search with cosine similarity returns the correct number of results
-    assert len(query_results) == len(DOCUMENTS)
-    indexed_docs = {}
-    for doc in DOCUMENTS:
-        indexed_docs[doc["content"]] = doc["embedding"]
-
-    for doc in query_results:
-        result_emb = doc.embedding
-        original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
-        document_store.normalize_embedding(original_emb[0])
-
-        # check if the stored embedding was normalized
-        assert np.allclose(original_emb[0], result_emb, rtol=0.01)
-
-        # check if the score is plausible for cosine similarity
-        assert 0 <= doc.score <= 1.0
-
-    # now check if vectors are normalized when updating embeddings
-    class MockRetriever():
-        def embed_documents(self, docs):
-            return [np.random.rand(768).astype(np.float32) for doc in docs]
-
-    retriever = MockRetriever()
-    document_store.update_embeddings(retriever=retriever)
-    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
-
-    for doc in query_results:
-        original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
-        document_store.normalize_embedding(original_emb[0])
-        # check if the original embedding has changed after updating the embeddings
-        assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
-
-
-def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
-    VEC_1 = np.array([.1, .2, .3], dtype="float32")
-    document_store_dot_product_small.normalize_embedding(VEC_1)
-    assert np.linalg.norm(VEC_1) - 1 < 0.01
-
-    VEC_1 = np.array([.1, .2, .3], dtype="float32").reshape(1, -1)
-    document_store_dot_product_small.normalize_embedding(VEC_1)
-    assert np.linalg.norm(VEC_1) - 1 < 0.01
-
-
-def test_cosine_sanity_check(document_store_small):
-    VEC_1 = np.array([.1, .2, .3], dtype="float32")
-    VEC_2 = np.array([.4, .5, .6], dtype="float32")
-
-    # This is the cosine similarity of VEC_1 and VEC_2 calculated using sklearn.metrics.pairwise.cosine_similarity
-    # The score is normalized to yield a value between 0 and 1.
-    KNOWN_COSINE = (0.9746317 + 1) / 2
-
-    docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}]
-    ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_small)
-    document_store_small.write_documents(documents=docs)
-
-    query_results = document_store_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True)
-
-    # check if faiss returns the same cosine similarity. Manual testing with faiss yielded 0.9746318
-    assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.00002)
-
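The constant these tests check is easy to re-derive: the cosine of VEC_1 and VEC_2 is about 0.9746317, and the stores rescale cosine scores from [-1, 1] into [0, 1] via (cos + 1) / 2. A quick standalone numpy verification, no document store needed:

```python
import numpy as np

v1 = np.array([0.1, 0.2, 0.3], dtype="float32")
v2 = np.array([0.4, 0.5, 0.6], dtype="float32")

# Plain cosine similarity: dot product over the product of the norms.
cos = float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
print(cos)    # ~0.9746318, matching the value quoted in the test comment

# Rescaled into [0, 1], as the stores report it.
score = (cos + 1) / 2
print(score)  # ~0.9873159, the KNOWN_COSINE constant above
```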
From afe8483475729922b42ce9d8a3224f7af8ed3724 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Fri, 14 Jan 2022 09:51:07 +0100
Subject: [PATCH 17/21] Exclude ES and Memory from the cosine_sanity_check test

---
 test/test_document_store.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_document_store.py b/test/test_document_store.py
index 3038026fb1..2e6b6b46b4 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1069,6 +1069,7 @@ def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
     assert np.linalg.norm(VEC_1) - 1 < 0.01
 
 
+@pytest.mark.parametrize("document_store", ["faiss", "milvus", "weaviate"], indirect=True)
 def test_cosine_sanity_check(document_store_small):
     VEC_1 = np.array([.1, .2, .3], dtype="float32")
     VEC_2 = np.array([.4, .5, .6], dtype="float32")

From 3b669b38f51581e4b3a26158ec2390f5024f9769 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Fri, 14 Jan 2022 10:20:01 +0100
Subject: [PATCH 18/21] Bug in fixture name

---
 test/test_document_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_document_store.py b/test/test_document_store.py
index 2e6b6b46b4..f960baa895 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1069,7 +1069,7 @@ def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
     assert np.linalg.norm(VEC_1) - 1 < 0.01
 
 
-@pytest.mark.parametrize("document_store", ["faiss", "milvus", "weaviate"], indirect=True)
+@pytest.mark.parametrize("document_store_small", ["faiss", "milvus", "weaviate"], indirect=True)
 def test_cosine_sanity_check(document_store_small):
     VEC_1 = np.array([.1, .2, .3], dtype="float32")
     VEC_2 = np.array([.4, .5, .6], dtype="float32")

From d9e14e013a2eb5d5c106de0660555d9d5bf44c8e Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Fri, 14 Jan 2022 10:45:43 +0100
Subject: [PATCH 19/21] move ensure_ids_are_correct_uuids in conftest and move
 one test back to faiss & milvus suite

---
 test/conftest.py              |  8 ++++++++
 test/test_document_store.py   | 27 +--------------------------
 test/test_faiss_and_milvus.py | 21 +++++++++++++++++++++
 3 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index 06abc204d0..a9a530e924 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -3,6 +3,7 @@
 from subprocess import run
 from sys import platform
 import gc
+import uuid
 import logging
 
 from sqlalchemy import create_engine, text
@@ -484,6 +485,13 @@ def get_retriever(retriever_type, document_store):
     return retriever
 
 
+def ensure_ids_are_correct_uuids(docs:list,document_store:object)->None:
+    # Weaviate currently only supports UUIDs
+    if type(document_store)==WeaviateDocumentStore:
+        for d in docs:
+            d["id"] = str(uuid.uuid4())
+
+
 @pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate"])
 def document_store_with_docs(request, test_docs_xs, tmp_path):
     embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768))
diff --git a/test/test_document_store.py b/test/test_document_store.py
index f960baa895..50053fe624 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -9,7 +9,7 @@
 
 from elasticsearch.exceptions import RequestError
 
-from conftest import get_document_store
+from conftest import get_document_store, ensure_ids_are_correct_uuids
 from haystack.document_stores import WeaviateDocumentStore
 from haystack.document_stores.base import BaseDocumentStore
 from haystack.errors import DuplicateDocumentError
@@ -1008,13 +1008,6 @@ def test_custom_headers(document_store_with_docs: BaseDocumentStore):
     assert len(documents) > 0
 
 
-def ensure_ids_are_correct_uuids(docs:list,document_store:object)->None:
-    # Weaviate currently only supports UUIDs
-    if type(document_store)==WeaviateDocumentStore:
-        for d in docs:
-            d["id"] = str(uuid.uuid4())
-
-
 def test_cosine_similarity(document_store):
     # below we will write documents to the store and then query it to see if vectors were normalized
 
@@ -1068,21 +1061,3 @@ def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
     document_store_dot_product_small.normalize_embedding(VEC_1)
     assert np.linalg.norm(VEC_1) - 1 < 0.01
 
-
-@pytest.mark.parametrize("document_store_small", ["faiss", "milvus", "weaviate"], indirect=True)
-def test_cosine_sanity_check(document_store_small):
-    VEC_1 = np.array([.1, .2, .3], dtype="float32")
-    VEC_2 = np.array([.4, .5, .6], dtype="float32")
-
-    # This is the cosine similarity of VEC_1 and VEC_2 calculated using sklearn.metrics.pairwise.cosine_similarity
-    # The score is normalized to yield a value between 0 and 1.
-    KNOWN_COSINE = (0.9746317 + 1) / 2
-
-    docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}]
-    ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_small)
-    document_store_small.write_documents(documents=docs)
-
-    query_results = document_store_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True)
-
-    # check if faiss returns the same cosine similarity. Manual testing with faiss yielded 0.9746318
-    assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.00002)
diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py
index 965c28f507..24b07fa5c4 100644
--- a/test/test_faiss_and_milvus.py
+++ b/test/test_faiss_and_milvus.py
@@ -13,6 +13,8 @@
 from haystack.pipelines import Pipeline
 from haystack.nodes.retriever.dense import EmbeddingRetriever
 
+from conftest import ensure_ids_are_correct_uuids
+
 
 DOCUMENTS = [
     {"meta": {"name": "name_1", "year": "2020", "month": "01"}, "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)},
@@ -445,3 +447,22 @@ def test_faiss_passing_index_from_outside(tmp_path):
     # test if vectors ids are associated with docs
     for doc in documents_indexed:
         assert 0 <= int(doc.meta["vector_id"]) <= 7
+
+
+@pytest.mark.parametrize("document_store_small", ["faiss", "milvus", "weaviate"], indirect=True)
+def test_cosine_sanity_check(document_store_small):
+    VEC_1 = np.array([.1, .2, .3], dtype="float32")
+    VEC_2 = np.array([.4, .5, .6], dtype="float32")
+
+    # This is the cosine similarity of VEC_1 and VEC_2 calculated using sklearn.metrics.pairwise.cosine_similarity
+    # The score is normalized to yield a value between 0 and 1.
+    KNOWN_COSINE = (0.9746317 + 1) / 2
+
+    docs = [{"name": "vec_1", "text": "vec_1", "content": "vec_1", "embedding": VEC_1}]
+    ensure_ids_are_correct_uuids(docs=docs,document_store=document_store_small)
+    document_store_small.write_documents(documents=docs)
+
+    query_results = document_store_small.query_by_embedding(query_emb=VEC_2, top_k=1, return_embedding=True)
+
+    # check if faiss returns the same cosine similarity. Manual testing with faiss yielded 0.9746318
+    assert math.isclose(query_results[0].score, KNOWN_COSINE, abs_tol=0.00002)
\ No newline at end of file
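The helper promoted into `conftest.py` exists because Weaviate only accepts UUIDs as document ids, while the other stores take arbitrary strings. A standalone sketch of the idea, with a simplified signature:

```python
import uuid

def ensure_uuid_ids(docs: list) -> None:
    # Sketch of the conftest helper: replace whatever id a test document
    # carries with a fresh UUID4 string so Weaviate will accept it.
    for d in docs:
        d["id"] = str(uuid.uuid4())

docs = [{"id": "doc-1", "content": "text_1"}]
ensure_uuid_ids(docs)
print(docs[0]["id"])  # a random UUID4 string instead of "doc-1"
```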
From c1913dfcd6e4323c139cf5c1bf39ae4202cf77a2 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Fri, 14 Jan 2022 11:32:07 +0100
Subject: [PATCH 20/21] Move back tests to faiss&milvus suite, cannot
 parametrize them properly

---
 test/test_document_store.py   | 54 ---------------------------------
 test/test_faiss_and_milvus.py | 56 +++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 54 deletions(-)

diff --git a/test/test_document_store.py b/test/test_document_store.py
index 50053fe624..30d55f4bac 100644
--- a/test/test_document_store.py
+++ b/test/test_document_store.py
@@ -1007,57 +1007,3 @@ def test_custom_headers(document_store_with_docs: BaseDocumentStore):
     assert kwargs["headers"] == custom_headers
     assert len(documents) > 0
 
-
-def test_cosine_similarity(document_store):
-    # below we will write documents to the store and then query it to see if vectors were normalized
-
-    ensure_ids_are_correct_uuids(docs=DOCUMENTS,document_store=document_store)
-    document_store.write_documents(documents=DOCUMENTS)
-
-    # note that the same query will be used later when querying after updating the embeddings
-    query = np.random.rand(768).astype(np.float32)
-
-    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
-
-    # check if search with cosine similarity returns the correct number of results
-    assert len(query_results) == len(DOCUMENTS)
-    indexed_docs = {}
-    for doc in DOCUMENTS:
-        indexed_docs[doc["content"]] = doc["embedding"]
-
-    for doc in query_results:
-        result_emb = doc.embedding
-        original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
-        document_store.normalize_embedding(original_emb[0])
-
-        # check if the stored embedding was normalized
-        assert np.allclose(original_emb[0], result_emb, rtol=0.01)
-
-        # check if the score is plausible for cosine similarity
-        assert 0 <= doc.score <= 1.0
-
-    # now check if vectors are normalized when updating embeddings
-    class MockRetriever():
-        def embed_documents(self, docs):
-            return [np.random.rand(768).astype(np.float32) for doc in docs]
-
-    retriever = MockRetriever()
-    document_store.update_embeddings(retriever=retriever)
-    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
-
-    for doc in query_results:
-        original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
-        document_store.normalize_embedding(original_emb[0])
-        # check if the original embedding has changed after updating the embeddings
-        assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
-
-
-def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
-    VEC_1 = np.array([.1, .2, .3], dtype="float32")
-    document_store_dot_product_small.normalize_embedding(VEC_1)
-    assert np.linalg.norm(VEC_1) - 1 < 0.01
-
-    VEC_1 = np.array([.1, .2, .3], dtype="float32").reshape(1, -1)
-    document_store_dot_product_small.normalize_embedding(VEC_1)
-    assert np.linalg.norm(VEC_1) - 1 < 0.01
-
diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py
index 24b07fa5c4..7dedc0ad31 100644
--- a/test/test_faiss_and_milvus.py
+++ b/test/test_faiss_and_milvus.py
@@ -449,6 +449,62 @@ def test_faiss_passing_index_from_outside(tmp_path):
         assert 0 <= int(doc.meta["vector_id"]) <= 7
 
 
+@pytest.mark.parametrize("document_store", ["faiss", "milvus", "weaviate"], indirect=True)
+def test_cosine_similarity(document_store):
+    # below we will write documents to the store and then query it to see if vectors were normalized
+
+    ensure_ids_are_correct_uuids(docs=DOCUMENTS,document_store=document_store)
+    document_store.write_documents(documents=DOCUMENTS)
+
+    # note that the same query will be used later when querying after updating the embeddings
+    query = np.random.rand(768).astype(np.float32)
+
+    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
+
+    # check if search with cosine similarity returns the correct number of results
+    assert len(query_results) == len(DOCUMENTS)
+    indexed_docs = {}
+    for doc in DOCUMENTS:
+        indexed_docs[doc["content"]] = doc["embedding"]
+
+    for doc in query_results:
+        result_emb = doc.embedding
+        original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
+        document_store.normalize_embedding(original_emb[0])
+
+        # check if the stored embedding was normalized
+        assert np.allclose(original_emb[0], result_emb, rtol=0.01)
+
+        # check if the score is plausible for cosine similarity
+        assert 0 <= doc.score <= 1.0
+
+    # now check if vectors are normalized when updating embeddings
+    class MockRetriever():
+        def embed_documents(self, docs):
+            return [np.random.rand(768).astype(np.float32) for doc in docs]
+
+    retriever = MockRetriever()
+    document_store.update_embeddings(retriever=retriever)
+    query_results = document_store.query_by_embedding(query_emb=query, top_k=len(DOCUMENTS), return_embedding=True)
+
+    for doc in query_results:
+        original_emb = np.array([indexed_docs[doc.content]], dtype="float32")
+        document_store.normalize_embedding(original_emb[0])
+        # check if the original embedding has changed after updating the embeddings
+        assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
+
+
+@pytest.mark.parametrize("document_store_dot_product_small", ["faiss", "milvus", "weaviate"], indirect=True)
+def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
+    VEC_1 = np.array([.1, .2, .3], dtype="float32")
+    document_store_dot_product_small.normalize_embedding(VEC_1)
+    assert np.linalg.norm(VEC_1) - 1 < 0.01
+
+    VEC_1 = np.array([.1, .2, .3], dtype="float32").reshape(1, -1)
+    document_store_dot_product_small.normalize_embedding(VEC_1)
+    assert np.linalg.norm(VEC_1) - 1 < 0.01
+
+
 @pytest.mark.parametrize("document_store_small", ["faiss", "milvus", "weaviate"], indirect=True)
 def test_cosine_sanity_check(document_store_small):
     VEC_1 = np.array([.1, .2, .3], dtype="float32")
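These last commits lean on `indirect=True`, which routes each parametrize value into the named fixture (here selecting the backing store) instead of into the test function directly. A minimal self-contained sketch of the mechanism, with an illustrative fixture:

```python
import pytest

@pytest.fixture
def store(request):
    # With indirect=True, request.param carries the parametrize value,
    # so the fixture decides how to build the object under test.
    return {"backend": request.param}

@pytest.mark.parametrize("store", ["faiss", "milvus"], indirect=True)
def test_backend(store):
    # Runs once per backend; dropping an entry from the list (as PATCH 21
    # does for weaviate) simply removes that test case.
    assert store["backend"] in ("faiss", "milvus")
```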
From 70bde27fb1306bb0784a6fe6faec2f2e57539fd2 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Fri, 14 Jan 2022 12:21:24 +0100
Subject: [PATCH 21/21] Remove one failing test for Weaviate

---
 test/test_faiss_and_milvus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_faiss_and_milvus.py b/test/test_faiss_and_milvus.py
index 7dedc0ad31..221ce6e4d7 100644
--- a/test/test_faiss_and_milvus.py
+++ b/test/test_faiss_and_milvus.py
@@ -494,7 +494,7 @@ def embed_documents(self, docs):
         assert not np.allclose(original_emb[0], doc.embedding, rtol=0.01)
 
 
-@pytest.mark.parametrize("document_store_dot_product_small", ["faiss", "milvus", "weaviate"], indirect=True)
+@pytest.mark.parametrize("document_store_dot_product_small", ["faiss", "milvus"], indirect=True)
 def test_normalize_embeddings_diff_shapes(document_store_dot_product_small):
     VEC_1 = np.array([.1, .2, .3], dtype="float32")
     document_store_dot_product_small.normalize_embedding(VEC_1)