From 827d6f824b7eadf24d2d94032efef9db41bf27f5 Mon Sep 17 00:00:00 2001 From: James Briggs Date: Sat, 5 Feb 2022 09:56:21 +0700 Subject: [PATCH 01/58] added core install and functionality of pinecone doc store (init, upsert, query, delete) --- haystack/document_stores/__init__.py | 2 ++ setup.cfg | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/haystack/document_stores/__init__.py b/haystack/document_stores/__init__.py index 9a580261b7..7f2c4499ed 100644 --- a/haystack/document_stores/__init__.py +++ b/haystack/document_stores/__init__.py @@ -15,6 +15,7 @@ SQLDocumentStore = safe_import("haystack.document_stores.sql", "SQLDocumentStore", "sql") FAISSDocumentStore = safe_import("haystack.document_stores.faiss", "FAISSDocumentStore", "faiss") +PineconeDocumentStore = safe_import("haystack.document_stores.pinecone", "PineconeDocumentStore", "pinecone") if os.getenv("MILVUS2_ENABLED"): MilvusDocumentStore = safe_import("haystack.document_stores.milvus2x", "MilvusDocumentStore", "milvus2") else: @@ -24,4 +25,5 @@ from haystack.document_stores.memory import InMemoryDocumentStore from haystack.document_stores.deepsetcloud import DeepsetCloudDocumentStore +#from haystack.document_stores.pinecone import PineconeDocumentStore from haystack.document_stores.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl diff --git a/setup.cfg b/setup.cfg index 971414ddb4..6bc294b14b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -124,6 +124,10 @@ milvus = farm-haystack[sql,only-milvus] weaviate = weaviate-client==2.5.0 +only-pinecone = + pinecone-client +pinecone = + farm-haystack[sql,only-pinecone] graphdb = SPARQLWrapper docstores = From c4d21a919b1bc73a004587007ec78ae9af0b79b6 Mon Sep 17 00:00:00 2001 From: James Briggs Date: Sun, 13 Feb 2022 20:08:18 +0700 Subject: [PATCH 02/58] implemented core functionality of Pinecone doc store --- haystack/document_stores/pinecone.py | 660 +++++++++++++++++++++++++++ 1 file changed, 660 insertions(+) create mode 100644 haystack/document_stores/pinecone.py diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py new file mode 100644 index 0000000000..6d6c3e168e --- /dev/null +++ b/haystack/document_stores/pinecone.py @@ -0,0 +1,660 @@ +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from haystack.nodes.retriever import BaseRetriever + +import json +import logging +from pathlib import Path +from typing import Union, List, Optional, Dict, Generator +from tqdm.auto import tqdm + +import pinecone +import faiss +import numpy as np + +from haystack.schema import Document +from haystack.document_stores.sql import SQLDocumentStore +from haystack.document_stores.base import get_batches_from_generator +from inspect import Signature, signature + +logger = logging.getLogger(__name__) + + +class PineconeDocumentStore(SQLDocumentStore): + """ + Document store for very large scale embedding based dense retrievers like the DPR. + + It implements the Pinecone vector database (https://www.pinecone.io) + to perform similarity search on vectors. + + The document text is stored using the SQLDocumentStore, while + the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. 
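
    A minimal usage sketch (illustrative only -- the API key, document contents, and
    retriever below are placeholders rather than values defined by this class; the
    `pinecone` extra added to setup.cfg above can be installed with
    `pip install farm-haystack[pinecone]`):

        from haystack.document_stores import PineconeDocumentStore
        from haystack.schema import Document

        document_store = PineconeDocumentStore(
            api_key="YOUR_PINECONE_API_KEY",  # placeholder
            environment="us-west1-gcp",
            index="document",
            vector_dim=768,
            similarity="cosine",
        )
        document_store.write_documents(
            [Document(content="Berlin is the capital of Germany.", meta={"genre": "documentary"})]
        )
        # embeddings can be generated and indexed later with a dense retriever:
        # document_store.update_embeddings(retriever)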
+ """ + top_k_limit = 10_000 + top_k_limit_vectors = 1_000 + def __init__( + self, + api_key: str, + environment: str = "us-west1-gcp", + sql_url: str = "sqlite:///pinecone_document_store.db", + pinecone_index: Optional["pinecone.Index"] = None, + vector_dim: int = 768, + return_embedding: bool = False, + index: str = "document", + similarity: str = "cosine", + replicas: int = 1, + shards: int = 1, + embedding_field: str = "embedding", + progress_bar: bool = True, + duplicate_documents: str = 'overwrite', + **kwargs, + ): + """ + :param api_key: Pinecone vector database API key (https://app.pinecone.io) + :param environment: Pinecone cloud environment uses "us-west1-gcp" by default. Other GCP and AWS regions are supported, + contact Pinecone if required. + :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale + deployment, Postgres is recommended. + :param pinecone_index: pinecone-client Index object, an index will be initialized or loaded if not specified. + :param vector_dim: the embedding vector size. + :param return_embedding: To return document embedding + :param index: Name of index in document store to use. + :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is + more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence-Transformer model. + In both cases, the returned values in Document.score are normalized to be in range [0,1]: + For `dot_product`: expit(np.asarray(raw_score / 100)) + For `cosine`: (raw_score + 1) / 2 + :param replicas: The number of replicas. Replicas duplicate your index. They provide higher availability and + throughput. + :param shards: The number of shards to be used in the index. We recommend you use 1 shard per 1GB of data. + :param embedding_field: Name of field containing an embedding vector. + :param progress_bar: Whether to show a tqdm progress bar or not. + Can be helpful to disable in production deployments to keep the logs clean. + :param duplicate_documents: Handle duplicates document based on parameter options. + Parameter options : ( 'skip','overwrite','fail') + skip: Ignore the duplicates documents + overwrite: Update any existing documents with the same ID when adding documents. + fail: an error is raised if the document ID of the document being added already + exists. + """ + # Connect to Pinecone server using python client binding + pinecone.init(api_key=api_key, environment=environment) + + # formal similarity string + if similarity in ("dot_product", "cosine"): + self.metric_type = similarity + elif similarity in ("l2", "euclidean"): + self.metric_type = "euclidean" + else: + raise ValueError("The Pinecone document store can currently only support dot_product, cosine and euclidean metrics. 
" + "Please set similarity to one of the above.") + + self.index = index + self.vector_dim = vector_dim + self.return_embedding = return_embedding + self.embedding_field = embedding_field + self.progress_bar = progress_bar + self.duplicate_documents = duplicate_documents + + # Pinecone index params + self.replicas = replicas + self.shards = shards + + # initialize dictionary of index connections + self.pinecone_indexes: Dict[str, pinecone.Index] = {} + clean_index = self._sanitize_index_name(index) + if pinecone_index: + self.pinecone_indexes[clean_index] = pinecone_index + else: + self.pinecone_indexes[clean_index] = self._create_index_if_not_exist( + vector_dim=self.vector_dim, + index=clean_index, + metric_type=self.metric_type, + replicas=self.replicas, + shards=self.shards + ) + + self.return_embedding = return_embedding + self.embedding_field = embedding_field + + self.progress_bar = progress_bar + + super().__init__( + url=sql_url, + index=index, # no sanitation for SQL index name + duplicate_documents=duplicate_documents + ) + + self._validate_index_sync() + + def _sanitize_index_name(self, index: Optional[str]) -> Optional[str]: + if index is None: + return None + elif "_" in index: + return index.replace('_', '-').lower() + else: + return index.lower() + + def _create_index_if_not_exist( + self, + vector_dim: int, + index: Optional[str] = None, + metric_type: Optional[str] = "cosine", + replicas: Optional[int] = 1, + shards: Optional[int] = 1 + ): + """ + Create a new index for storing documents in case if an + index with the name doesn't exist already. + """ + index = index or self.index + index = self._sanitize_index_name(index) + + # if index already loaded can skip + if index in self.pinecone_indexes.keys(): + index_conn = self.pinecone_indexes[index] + else: + # search pinecone hosted indexes and create if it does not exist + if index not in pinecone.list_indexes(): + pinecone.create_index( + name=index, + dimension=vector_dim, + metric=metric_type, + replicas = replicas, + shards = shards + ) + index_conn = pinecone.Index(index) + + # get index statistics + stats = index_conn.describe_index_stats() + dims = stats['dimension'] + count = stats['namespaces']['']['vector_count'] if stats['namespaces'].get('') else 0 + logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") + # return index connection + return index_conn + + def _convert_pinecone_result_to_document( + self, + result: dict, + return_embedding: bool + ) -> Document: + """ + Convert Pinecone result dict into haystack document object. This is more involved because + weaviate search result dict varies between get and query interfaces. + Weaviate get methods return the data items in properties key, whereas the query doesn't. 
+ """ + score = None + content = "" + + id = result.get("id") + score = result.get("score") + embedding = result.get("values") + meta = result.get("metadata") + + content_type = None + if meta.get("contenttype") is not None: + content_type = str(meta.pop("contenttype")) + + if return_embedding and embedding: + embedding = np.asarray(embedding, dtype=np.float32) + + document = Document.from_dict({ + "id": id, + "content": content, + "content_type": content_type, + "meta": meta, + "score": score, + "embedding": embedding, + }) + return document + + def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: dict): + # TODO probably not needed + raise NotImplementedError("_validate_params_load_from_disk not implemented for PineconeDocumentStore") + allowed_params = ["faiss_index_path", "faiss_config_path", "self", "kwargs"] + invalid_param_set = False + + for param in sig.parameters.values(): + if param.name not in allowed_params and param.default != locals[param.name]: + invalid_param_set = True + break + + if invalid_param_set or len(kwargs) > 0: + raise ValueError("if faiss_index_path is passed no other params besides faiss_config_path are allowed.") + + def _validate_index_sync(self): + # This check ensures the correct document database was loaded. + # If it fails, make sure you provided the path to the database + # used when creating the original Pinecone index + if not self.get_document_count() == self.get_embedding_count(): + raise ValueError("The number of documents present in the SQL database does not " + "match the number of embeddings in Pinecone. Make sure your Pinecone " + "index aligns to the same database that was used when creating the " + "original index.") + + def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None, + batch_size: int = 10_000, duplicate_documents: Optional[str] = None, + headers: Optional[Dict[str, str]] = None) -> None: + """ + Add new documents to the DocumentStore. + + :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index + them right away in Pinecone. If not, you can later call update_embeddings() to create & index them. + :param index: (SQL) index name for storing the docs and metadata + :param batch_size: When working with large number of documents, batching can help reduce memory footprint. + :param duplicate_documents: Handle duplicates document based on parameter options. + Parameter options : ( 'skip','overwrite','fail') + skip: Ignore the duplicates documents + overwrite: Update any existing documents with the same ID when adding documents. + fail: an error is raised if the document ID of the document being added already + exists. 
+ :raises DuplicateDocumentError: Exception trigger on duplicate document + :return: None + """ + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + + index = index or self.index + index = self._sanitize_index_name(index) + duplicate_documents = duplicate_documents or self.duplicate_documents + assert duplicate_documents in self.duplicate_documents_options, \ + f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" + + if not self.pinecone_indexes.get(index): + self.pinecone_indexes[index] = self._create_index_if_not_exist( + vector_dim=self.vector_dim, + index=index, + metric_type=self.metric, + replicas=self.replicas, + shards=self.shards + ) + + field_map = self._create_document_field_map() + document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents] + document_objects = self._handle_duplicate_documents(documents=document_objects, + index=index, + duplicate_documents=duplicate_documents) + if len(document_objects) > 0: + add_vectors = False if document_objects[0].embedding is None else True + # I don't think below is required + """ + if self.duplicate_documents == "overwrite" and add_vectors: + logger.warning("You have to provide `duplicate_documents = 'overwrite'` arg and " + "`FAISSDocumentStore` does not support update in existing `faiss_index`.\n" + "Please call `update_embeddings` method to repopulate `faiss_index`") + """ + + with tqdm(total = len(document_objects), disable =not self.progress_bar, position=0, + desc="Writing Documents") as progress_bar: + for i in range(0, len(document_objects), batch_size): + ids = [doc.id for doc in document_objects[i: i + batch_size]] + # TODO find way to identify long metadata fields and split these to be stored in SQL + metadata = [doc.meta for doc in document_objects[i: i + batch_size]] + if add_vectors: + embeddings = [doc.embedding for doc in document_objects[i: i + batch_size]] + embeddings_to_index = np.array(embeddings, dtype="float32") + + if self.similarity=="cosine": self.normalize_embedding(embeddings_to_index) + # TODO not sure if required to convert to list objects (maybe already are) + embeddings = [embed.tolist() for embed in embeddings] + vectors = zip(ids, embeddings, metadata) + self.pinecone_indexes[index].upsert(vectors=vectors) + + docs_to_write_in_sql = [] + for doc in document_objects[i: i + batch_size]: + # TODO I think this is not necessary as we have doc.id, before was required + # to map from doc.id to the integer 'vector_id' values used by faiss - but + # we do need to use vector_id as this is used by the sql doc store + #if add_vectors: + doc.meta["vector_id"] = doc.id + docs_to_write_in_sql.append(doc) + super(PineconeDocumentStore, self).write_documents(docs_to_write_in_sql, index=index, + duplicate_documents=duplicate_documents) + progress_bar.update(batch_size) + progress_bar.close() + + def _create_document_field_map(self) -> Dict: + return { + self.index: self.embedding_field, + } + + def update_embeddings( + self, + retriever: 'BaseRetriever', + index: Optional[str] = None, + update_existing_embeddings: bool = True, + filters: Optional[Dict] = None, + batch_size: int = 10_000 + ): + """ + Updates the embeddings in the the document store using the encoding model specified in the retriever. + This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config). 
+ + :param retriever: Retriever to use to get embeddings for text + :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used. + :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False, + only documents without embeddings are processed. This mode can be used for + incremental updating of embeddings, wherein, only newly indexed documents + get processed. + :param filters: Optional filters to narrow down the documents for which embeddings are to be updated. + Example: {"genre": {"$in": ["documentary", "action"]}}, + more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ + :param batch_size: When working with large number of documents, batching can help reduce memory footprint. + :return: None + """ + if filters: + raise Exception("update_embeddings does not support filtering.") + + index = index or self.index + index = self._sanitize_index_name(index) + + if not self.pinecone_indexes.get(index): + raise ValueError("Couldn't find a Pinecone index. Try to init the PineconeDocumentStore() again ...") + + document_count = self.get_document_count(index=index) + if document_count == 0: + logger.warning("Calling DocumentStore.update_embeddings() on an empty index") + return + + logger.info(f"Updating embeddings for {document_count} docs...") + + result = self._query( + index=index, + vector_ids=None, + batch_size=batch_size, + filters=filters, + only_documents_without_embedding=not update_existing_embeddings + ) + batched_documents = get_batches_from_generator(result, batch_size) + with tqdm(total=document_count, disable=not self.progress_bar, position=0, unit=" docs", + desc="Updating Embedding") as progress_bar: + for document_batch in batched_documents: + embeddings = retriever.embed_documents(document_batch) # type: ignore + assert len(document_batch) == len(embeddings) + + embeddings_to_index = np.array(embeddings, dtype="float32") + + if self.similarity=="cosine": self.normalize_embedding(embeddings_to_index) + + embeddings = embeddings.tolist() + + metadata = [] + ids = [] + for doc in document_batch: + # TODO if vector_id unecessary then rewrite below (maybe it is needed) + metadata.append({key: value for key, value in doc.meta.items() if key != "vector_id"}) + ids.append(doc.id) + # update existing vectors in pinecone index + self.pinecone_indexes[index].upsert(vectors=zip(ids, embeddings, metadata)) + + progress_bar.set_description_str("Documents Processed") + progress_bar.update(batch_size) + + def get_all_documents( + self, + index: Optional[str] = None, + filters: Optional[Dict] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None + ) -> List[Document]: + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + if filters: + raise Exception("get_all_documents does not support filters.") + self._limit_check(batch_size) + + result = self.get_all_documents_generator( + index=index, filters=filters, return_embedding=return_embedding, batch_size=batch_size + ) + documents = list(result) + return documents + + def get_all_documents_generator( + self, + index: Optional[str] = None, + filters: Optional[Dict] = None, + return_embedding: Optional[bool] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None + ) -> Generator[Document, None, None]: + """ + Get all documents from the document store. 
Under-the-hood, documents are fetched in batches from the + document store and yielded as individual documents. This method can be used to iteratively process + a large number of documents without having to load all documents in memory. + + :param index: Name of the index to get the documents from. If None, the + DocumentStore's default index (self.index) will be used. + :param filters: Optional filters to narrow down the documents to return. + Example: {"genre": {"$in": ["documentary", "action"]}}, + more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ + :param return_embedding: Whether to return the document embeddings. + :param batch_size: When working with large number of documents, batching can help reduce memory footprint. + """ + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + if filters: + raise Exception("get_all_documents_generator does not support filters.") + self._limit_check(batch_size) + + index = index or self.index + index = self._sanitize_index_name(index) + documents = super(PineconeDocumentStore, self).get_all_documents_generator( + index=index, filters=filters, batch_size=batch_size, return_embedding=False + ) + if return_embedding is None: + return_embedding = self.return_embedding + + for doc in documents: + if return_embedding: + if doc.meta and doc.meta.get("vector_id") is not None: + res = self.pinecone_indexes[index].fetch(ids=[doc.id]) + if res['vectors'].get(doc.id): + doc.embedding = self._convert_pinecone_result_to_document( + result=res['vectors'][doc.id], + return_embedding=return_embedding + ).embedding + yield doc + + def get_documents_by_id( + self, ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None + ) -> List[Document]: + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + self._limit_check(batch_size) + + index = index or self.index + index = self._sanitize_index_name(index) + # TODO could put this repetative chunk in a _method? + if not self.pinecone_indexes.get(index): + self.pinecone_indexes[index] = self._create_index_if_not_exist( + vector_dim=self.vector_dim, + index=index, + metric_type=self.metric, + replicas=self.replicas, + shards=self.shards + ) + # check there are vectors + count = self.get_embedding_count(index) + if count == 0: + raise Exception("No documents exist, try creating documents with write_embeddings first.") + res = self.pinecone_indexes[index].fetch(ids=ids) + # convert Pinecone responses to documents + documents = [] + for id_val in ids: + # check exists + if res['vectors'].get(id_val): + documents.append( + self._convert_pinecone_result_to_document( + result=res['vectors'][id_val], + return_embedding=self.return_embedding + ) + ) + # get content from SQL + content = super().get_documents_by_id([doc.id for doc in documents]) + for i, doc in enumerate(documents): + doc.content = content[i].content + return documents + + def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict] = None) -> int: + """ + Return the count of embeddings in the document store. 
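
        For example (illustrative; `document_store` is a PineconeDocumentStore as in the
        class docstring above, with the counts read from Pinecone's describe_index_stats()
        and from the SQL database respectively):

            num_embeddings = document_store.get_embedding_count()
            num_documents = document_store.get_document_count()
            # the store expects these two counts to match, see _validate_index_sync()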
+ """ + if filters: + raise Exception("Filters are not supported for get_embedding_count in PineconeDocumentStore") + index = index or self.index + index = self._sanitize_index_name(index) + if not self.pinecone_indexes.get(index): + self.pinecone_indexes[index] = self._create_index_if_not_exist( + vector_dim=self.vector_dim, + index=self.index, + metric_type=self.metric, + replicas=self.replicas, + shards=self.shards + ) + + stats = self.pinecone_indexes[index].describe_index_stats() + # if no namespace return zero + count = stats['namespaces']['']['vector_count'] if stats['namespaces'].get('') else 0 + return count + + def train_index( + self, + documents: Optional[Union[List[dict], List[Document]]], + embeddings: Optional[np.ndarray] = None, + index: Optional[str] = None, + ): + """ + Not applicable to PineconeDocumentStore. + """ + raise NotImplementedError("PineconeDocumentStore does not require training") + + def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict] = None, headers: Optional[Dict[str, str]] = None): + """ + Delete documents from the document store. + + :param index: Index name to delete the documents from. If None, the + DocumentStore's default index (self.index) will be used. + :param ids: Optional list of IDs to narrow down the documents to be deleted. + :param filters: Optional filters to narrow down the documents to be deleted (not supported by PineconeDocumentStore). + Example: {"genre": {"$in": ["documentary", "action"]}}, + more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ + :return: None + """ + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + if filters: + raise NotImplementedError("PineconeDocumentStore does not support filtering during document deletion.") + + index = index or self.index + index = self._sanitize_index_name(index) + if not self.pinecone_indexes.get(index): + self.pinecone_indexes[index] = self._create_index_if_not_exist( + vector_dim=self.vector_dim, + index=self.index, + metric_type=self.metric, + replicas=self.replicas, + shards=self.shards + ) + _ = self.pinecone_indexes[index].delete(ids=ids) + # delete from SQL + super().delete_documents(index=index, ids=ids, filters=filters) + + def query_by_embedding( + self, + query_emb: np.ndarray, + filters: Optional[Dict] = None, + top_k: int = 10, + index: Optional[str] = None, + return_embedding: Optional[bool] = None, + headers: Optional[Dict[str, str]] = None + ) -> List[Document]: + """ + Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + + :param query_emb: Embedding of the query (e.g. gathered from DPR) + :param filters: Optional filters to narrow down the search space. + Example: {"genre": {"$in": ["documentary", "action"]}}, + more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ + :param top_k: How many documents to return + :param index: Index name to query the document from. + :param return_embedding: To return document embedding + :return: + """ + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + self._limit_check(top_k, include_values=return_embedding) + + index = index or self.index + index = self._sanitize_index_name(index) + + if not self.pinecone_indexes.get(index): + raise Exception(f"Index named '{index}' does not exist. 
Try reinitializing PineconeDocumentStore() and running 'update_embeddings()' to create and populate an index.") + + if return_embedding is None: + return_embedding = self.return_embedding + + query_emb = query_emb.reshape(1, -1).astype(np.float32) + + if self.similarity=="cosine": self.normalize_embedding(query_emb) + + res = self.pinecone_indexes[index].query( + query_emb.tolist(), + top_k=top_k, + include_values=True, + filter=filters + ) + + score_matrix = [] + vector_id_matrix = [] + for match in res['results'][0]['matches']: + score_matrix.append(match['score']) + vector_id_matrix.append(match['id']) + + documents = self.get_documents_by_vector_ids(vector_id_matrix, index=index) + + #assign query score to each document + scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)} + for i, doc in enumerate(documents): + raw_score = scores_for_vector_ids[doc.id] + doc.score = self.finalize_raw_score(raw_score,self.similarity) + + if return_embedding is True: + # get embedding from Pinecone response + doc.embedding = self.pinecone_indexes[index].reconstruct(int(doc.id)) + + return documents + + def save(self): + """ + Save index to the specified file, not implemented for PineconeDocumentStore. + """ + raise NotImplementedError("save method not implemented for PineconeDocumentStore") + + def _load_init_params_from_config(self, index_path: Optional[Union[str, Path]] = None, config_path: Optional[Union[str, Path]] = None): + raise NotImplementedError("Load init params from config not implemented for Pinecone") + + def _limit_check(self, top_k: str, include_values: Optional[bool] = None): + """ + Confirms the top_k value does not exceed Pinecone vector database limits. + """ + if include_values: + if top_k > self.top_k_limit_vectors: + raise Exception( + f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records ", + f"when returning embedding values. This request is attempting to return {top_k} records." + ) + else: + if top_k > self.top_k_limit: + raise Exception( + f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. ", + f"This request is attempting to return {top_k} records." + ) + + @classmethod + def load(): + """ + Default class method used for loading indexes. Not applicable to the PineconeDocumentStore. + """ + raise NotImplementedError("load method not supported for PineconeDocumentStore") From 4eff62b91c6bac18fa39e201fe26ef9165f52414 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 24 Feb 2022 11:12:41 +0000 Subject: [PATCH 03/58] Update Documentation & Code Style --- haystack/document_stores/pinecone.py | 227 ++++++++++++++------------- 1 file changed, 122 insertions(+), 105 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 6d6c3e168e..03c74e95bc 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -31,8 +31,10 @@ class PineconeDocumentStore(SQLDocumentStore): The document text is stored using the SQLDocumentStore, while the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. 
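
    A query sketch (illustrative only; the embedding and the metadata filter below are
    placeholders -- any vector matching `vector_dim` and any metadata fields you have
    indexed can be used):

        import numpy as np

        query_emb = np.random.rand(768).astype("float32")  # stand-in for a real query embedding
        docs = document_store.query_by_embedding(
            query_emb,
            top_k=10,
            filters={"genre": {"$in": ["documentary", "action"]}},
        )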
""" + top_k_limit = 10_000 top_k_limit_vectors = 1_000 + def __init__( self, api_key: str, @@ -47,12 +49,12 @@ def __init__( shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, - duplicate_documents: str = 'overwrite', + duplicate_documents: str = "overwrite", **kwargs, ): """ :param api_key: Pinecone vector database API key (https://app.pinecone.io) - :param environment: Pinecone cloud environment uses "us-west1-gcp" by default. Other GCP and AWS regions are supported, + :param environment: Pinecone cloud environment uses "us-west1-gcp" by default. Other GCP and AWS regions are supported, contact Pinecone if required. :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale deployment, Postgres is recommended. @@ -62,7 +64,7 @@ def __init__( :param index: Name of index in document store to use. :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence-Transformer model. - In both cases, the returned values in Document.score are normalized to be in range [0,1]: + In both cases, the returned values in Document.score are normalized to be in range [0,1]: For `dot_product`: expit(np.asarray(raw_score / 100)) For `cosine`: (raw_score + 1) / 2 :param replicas: The number of replicas. Replicas duplicate your index. They provide higher availability and @@ -87,8 +89,10 @@ def __init__( elif similarity in ("l2", "euclidean"): self.metric_type = "euclidean" else: - raise ValueError("The Pinecone document store can currently only support dot_product, cosine and euclidean metrics. " - "Please set similarity to one of the above.") + raise ValueError( + "The Pinecone document store can currently only support dot_product, cosine and euclidean metrics. " + "Please set similarity to one of the above." + ) self.index = index self.vector_dim = vector_dim @@ -112,7 +116,7 @@ def __init__( index=clean_index, metric_type=self.metric_type, replicas=self.replicas, - shards=self.shards + shards=self.shards, ) self.return_embedding = return_embedding @@ -121,18 +125,16 @@ def __init__( self.progress_bar = progress_bar super().__init__( - url=sql_url, - index=index, # no sanitation for SQL index name - duplicate_documents=duplicate_documents + url=sql_url, index=index, duplicate_documents=duplicate_documents # no sanitation for SQL index name ) self._validate_index_sync() - + def _sanitize_index_name(self, index: Optional[str]) -> Optional[str]: if index is None: return None elif "_" in index: - return index.replace('_', '-').lower() + return index.replace("_", "-").lower() else: return index.lower() @@ -142,10 +144,10 @@ def _create_index_if_not_exist( index: Optional[str] = None, metric_type: Optional[str] = "cosine", replicas: Optional[int] = 1, - shards: Optional[int] = 1 + shards: Optional[int] = 1, ): """ - Create a new index for storing documents in case if an + Create a new index for storing documents in case if an index with the name doesn't exist already. 
""" index = index or self.index @@ -158,27 +160,19 @@ def _create_index_if_not_exist( # search pinecone hosted indexes and create if it does not exist if index not in pinecone.list_indexes(): pinecone.create_index( - name=index, - dimension=vector_dim, - metric=metric_type, - replicas = replicas, - shards = shards + name=index, dimension=vector_dim, metric=metric_type, replicas=replicas, shards=shards ) index_conn = pinecone.Index(index) # get index statistics stats = index_conn.describe_index_stats() - dims = stats['dimension'] - count = stats['namespaces']['']['vector_count'] if stats['namespaces'].get('') else 0 + dims = stats["dimension"] + count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") # return index connection return index_conn - def _convert_pinecone_result_to_document( - self, - result: dict, - return_embedding: bool - ) -> Document: + def _convert_pinecone_result_to_document(self, result: dict, return_embedding: bool) -> Document: """ Convert Pinecone result dict into haystack document object. This is more involved because weaviate search result dict varies between get and query interfaces. @@ -198,15 +192,17 @@ def _convert_pinecone_result_to_document( if return_embedding and embedding: embedding = np.asarray(embedding, dtype=np.float32) - - document = Document.from_dict({ - "id": id, - "content": content, - "content_type": content_type, - "meta": meta, - "score": score, - "embedding": embedding, - }) + + document = Document.from_dict( + { + "id": id, + "content": content, + "content_type": content_type, + "meta": meta, + "score": score, + "embedding": embedding, + } + ) return document def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: dict): @@ -217,25 +213,32 @@ def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: for param in sig.parameters.values(): if param.name not in allowed_params and param.default != locals[param.name]: - invalid_param_set = True - break - + invalid_param_set = True + break + if invalid_param_set or len(kwargs) > 0: raise ValueError("if faiss_index_path is passed no other params besides faiss_config_path are allowed.") - def _validate_index_sync(self): + def _validate_index_sync(self): # This check ensures the correct document database was loaded. # If it fails, make sure you provided the path to the database # used when creating the original Pinecone index if not self.get_document_count() == self.get_embedding_count(): - raise ValueError("The number of documents present in the SQL database does not " - "match the number of embeddings in Pinecone. Make sure your Pinecone " - "index aligns to the same database that was used when creating the " - "original index.") - - def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None, - batch_size: int = 10_000, duplicate_documents: Optional[str] = None, - headers: Optional[Dict[str, str]] = None) -> None: + raise ValueError( + "The number of documents present in the SQL database does not " + "match the number of embeddings in Pinecone. Make sure your Pinecone " + "index aligns to the same database that was used when creating the " + "original index." 
+ ) + + def write_documents( + self, + documents: Union[List[dict], List[Document]], + index: Optional[str] = None, + batch_size: int = 10_000, + duplicate_documents: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + ) -> None: """ Add new documents to the DocumentStore. @@ -254,12 +257,13 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O """ if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") - + index = index or self.index index = self._sanitize_index_name(index) duplicate_documents = duplicate_documents or self.duplicate_documents - assert duplicate_documents in self.duplicate_documents_options, \ - f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" + assert ( + duplicate_documents in self.duplicate_documents_options + ), f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" if not self.pinecone_indexes.get(index): self.pinecone_indexes[index] = self._create_index_if_not_exist( @@ -267,14 +271,14 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O index=index, metric_type=self.metric, replicas=self.replicas, - shards=self.shards + shards=self.shards, ) field_map = self._create_document_field_map() document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents] - document_objects = self._handle_duplicate_documents(documents=document_objects, - index=index, - duplicate_documents=duplicate_documents) + document_objects = self._handle_duplicate_documents( + documents=document_objects, index=index, duplicate_documents=duplicate_documents + ) if len(document_objects) > 0: add_vectors = False if document_objects[0].embedding is None else True # I don't think below is required @@ -285,32 +289,35 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O "Please call `update_embeddings` method to repopulate `faiss_index`") """ - with tqdm(total = len(document_objects), disable =not self.progress_bar, position=0, - desc="Writing Documents") as progress_bar: + with tqdm( + total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents" + ) as progress_bar: for i in range(0, len(document_objects), batch_size): - ids = [doc.id for doc in document_objects[i: i + batch_size]] + ids = [doc.id for doc in document_objects[i : i + batch_size]] # TODO find way to identify long metadata fields and split these to be stored in SQL - metadata = [doc.meta for doc in document_objects[i: i + batch_size]] + metadata = [doc.meta for doc in document_objects[i : i + batch_size]] if add_vectors: - embeddings = [doc.embedding for doc in document_objects[i: i + batch_size]] + embeddings = [doc.embedding for doc in document_objects[i : i + batch_size]] embeddings_to_index = np.array(embeddings, dtype="float32") - if self.similarity=="cosine": self.normalize_embedding(embeddings_to_index) + if self.similarity == "cosine": + self.normalize_embedding(embeddings_to_index) # TODO not sure if required to convert to list objects (maybe already are) embeddings = [embed.tolist() for embed in embeddings] vectors = zip(ids, embeddings, metadata) self.pinecone_indexes[index].upsert(vectors=vectors) docs_to_write_in_sql = [] - for doc in document_objects[i: i + batch_size]: + for doc in document_objects[i : i + batch_size]: # TODO I think this is not necessary as we have doc.id, before was required # to map from doc.id to the integer 
'vector_id' values used by faiss - but # we do need to use vector_id as this is used by the sql doc store - #if add_vectors: + # if add_vectors: doc.meta["vector_id"] = doc.id docs_to_write_in_sql.append(doc) - super(PineconeDocumentStore, self).write_documents(docs_to_write_in_sql, index=index, - duplicate_documents=duplicate_documents) + super(PineconeDocumentStore, self).write_documents( + docs_to_write_in_sql, index=index, duplicate_documents=duplicate_documents + ) progress_bar.update(batch_size) progress_bar.close() @@ -321,11 +328,11 @@ def _create_document_field_map(self) -> Dict: def update_embeddings( self, - retriever: 'BaseRetriever', + retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict] = None, - batch_size: int = 10_000 + batch_size: int = 10_000, ): """ Updates the embeddings in the the document store using the encoding model specified in the retriever. @@ -358,24 +365,26 @@ def update_embeddings( return logger.info(f"Updating embeddings for {document_count} docs...") - + result = self._query( index=index, vector_ids=None, batch_size=batch_size, filters=filters, - only_documents_without_embedding=not update_existing_embeddings + only_documents_without_embedding=not update_existing_embeddings, ) batched_documents = get_batches_from_generator(result, batch_size) - with tqdm(total=document_count, disable=not self.progress_bar, position=0, unit=" docs", - desc="Updating Embedding") as progress_bar: + with tqdm( + total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding" + ) as progress_bar: for document_batch in batched_documents: embeddings = retriever.embed_documents(document_batch) # type: ignore assert len(document_batch) == len(embeddings) embeddings_to_index = np.array(embeddings, dtype="float32") - if self.similarity=="cosine": self.normalize_embedding(embeddings_to_index) + if self.similarity == "cosine": + self.normalize_embedding(embeddings_to_index) embeddings = embeddings.tolist() @@ -397,7 +406,7 @@ def get_all_documents( filters: Optional[Dict] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, - headers: Optional[Dict[str, str]] = None + headers: Optional[Dict[str, str]] = None, ) -> List[Document]: if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -417,7 +426,7 @@ def get_all_documents_generator( filters: Optional[Dict] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, - headers: Optional[Dict[str, str]] = None + headers: Optional[Dict[str, str]] = None, ) -> Generator[Document, None, None]: """ Get all documents from the document store. 
Under-the-hood, documents are fetched in batches from the @@ -437,7 +446,7 @@ def get_all_documents_generator( if filters: raise Exception("get_all_documents_generator does not support filters.") self._limit_check(batch_size) - + index = index or self.index index = self._sanitize_index_name(index) documents = super(PineconeDocumentStore, self).get_all_documents_generator( @@ -450,20 +459,23 @@ def get_all_documents_generator( if return_embedding: if doc.meta and doc.meta.get("vector_id") is not None: res = self.pinecone_indexes[index].fetch(ids=[doc.id]) - if res['vectors'].get(doc.id): + if res["vectors"].get(doc.id): doc.embedding = self._convert_pinecone_result_to_document( - result=res['vectors'][doc.id], - return_embedding=return_embedding + result=res["vectors"][doc.id], return_embedding=return_embedding ).embedding yield doc def get_documents_by_id( - self, ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None + self, + ids: List[str], + index: Optional[str] = None, + batch_size: int = 10_000, + headers: Optional[Dict[str, str]] = None, ) -> List[Document]: if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") self._limit_check(batch_size) - + index = index or self.index index = self._sanitize_index_name(index) # TODO could put this repetative chunk in a _method? @@ -473,7 +485,7 @@ def get_documents_by_id( index=index, metric_type=self.metric, replicas=self.replicas, - shards=self.shards + shards=self.shards, ) # check there are vectors count = self.get_embedding_count(index) @@ -484,11 +496,10 @@ def get_documents_by_id( documents = [] for id_val in ids: # check exists - if res['vectors'].get(id_val): + if res["vectors"].get(id_val): documents.append( self._convert_pinecone_result_to_document( - result=res['vectors'][id_val], - return_embedding=self.return_embedding + result=res["vectors"][id_val], return_embedding=self.return_embedding ) ) # get content from SQL @@ -511,12 +522,12 @@ def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dic index=self.index, metric_type=self.metric, replicas=self.replicas, - shards=self.shards + shards=self.shards, ) stats = self.pinecone_indexes[index].describe_index_stats() # if no namespace return zero - count = stats['namespaces']['']['vector_count'] if stats['namespaces'].get('') else 0 + count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 return count def train_index( @@ -530,7 +541,13 @@ def train_index( """ raise NotImplementedError("PineconeDocumentStore does not require training") - def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict] = None, headers: Optional[Dict[str, str]] = None): + def delete_documents( + self, + index: Optional[str] = None, + ids: Optional[List[str]] = None, + filters: Optional[Dict] = None, + headers: Optional[Dict[str, str]] = None, + ): """ Delete documents from the document store. 
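
        For example (illustrative; the IDs are placeholders):

            document_store.delete_documents(ids=["doc-1", "doc-2"])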
@@ -546,7 +563,7 @@ def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]] raise NotImplementedError("PineconeDocumentStore does not support headers.") if filters: raise NotImplementedError("PineconeDocumentStore does not support filtering during document deletion.") - + index = index or self.index index = self._sanitize_index_name(index) if not self.pinecone_indexes.get(index): @@ -555,7 +572,7 @@ def delete_documents(self, index: Optional[str] = None, ids: Optional[List[str]] index=self.index, metric_type=self.metric, replicas=self.replicas, - shards=self.shards + shards=self.shards, ) _ = self.pinecone_indexes[index].delete(ids=ids) # delete from SQL @@ -568,7 +585,7 @@ def query_by_embedding( top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, - headers: Optional[Dict[str, str]] = None + headers: Optional[Dict[str, str]] = None, ) -> List[Document]: """ Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. @@ -590,35 +607,33 @@ def query_by_embedding( index = self._sanitize_index_name(index) if not self.pinecone_indexes.get(index): - raise Exception(f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running 'update_embeddings()' to create and populate an index.") + raise Exception( + f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running 'update_embeddings()' to create and populate an index." + ) if return_embedding is None: return_embedding = self.return_embedding query_emb = query_emb.reshape(1, -1).astype(np.float32) - if self.similarity=="cosine": self.normalize_embedding(query_emb) + if self.similarity == "cosine": + self.normalize_embedding(query_emb) - res = self.pinecone_indexes[index].query( - query_emb.tolist(), - top_k=top_k, - include_values=True, - filter=filters - ) + res = self.pinecone_indexes[index].query(query_emb.tolist(), top_k=top_k, include_values=True, filter=filters) score_matrix = [] vector_id_matrix = [] - for match in res['results'][0]['matches']: - score_matrix.append(match['score']) - vector_id_matrix.append(match['id']) + for match in res["results"][0]["matches"]: + score_matrix.append(match["score"]) + vector_id_matrix.append(match["id"]) documents = self.get_documents_by_vector_ids(vector_id_matrix, index=index) - #assign query score to each document + # assign query score to each document scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)} for i, doc in enumerate(documents): raw_score = scores_for_vector_ids[doc.id] - doc.score = self.finalize_raw_score(raw_score,self.similarity) + doc.score = self.finalize_raw_score(raw_score, self.similarity) if return_embedding is True: # get embedding from Pinecone response @@ -632,9 +647,11 @@ def save(self): """ raise NotImplementedError("save method not implemented for PineconeDocumentStore") - def _load_init_params_from_config(self, index_path: Optional[Union[str, Path]] = None, config_path: Optional[Union[str, Path]] = None): + def _load_init_params_from_config( + self, index_path: Optional[Union[str, Path]] = None, config_path: Optional[Union[str, Path]] = None + ): raise NotImplementedError("Load init params from config not implemented for Pinecone") - + def _limit_check(self, top_k: str, include_values: Optional[bool] = None): """ Confirms the top_k value does not exceed Pinecone vector database limits. 
@@ -643,13 +660,13 @@ def _limit_check(self, top_k: str, include_values: Optional[bool] = None): if top_k > self.top_k_limit_vectors: raise Exception( f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records ", - f"when returning embedding values. This request is attempting to return {top_k} records." + f"when returning embedding values. This request is attempting to return {top_k} records.", ) else: if top_k > self.top_k_limit: raise Exception( f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. ", - f"This request is attempting to return {top_k} records." + f"This request is attempting to return {top_k} records.", ) @classmethod From 15debe772d6f85ccb9984d120b7105c035e91b98 Mon Sep 17 00:00:00 2001 From: James Briggs Date: Sat, 26 Feb 2022 18:51:45 +0700 Subject: [PATCH 04/58] updated filtering to use Haystack filtering and reduced default batch_size --- haystack/document_stores/pinecone.py | 76 +++++++++++++++------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 6d6c3e168e..3bf7a60fce 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -3,20 +3,18 @@ if TYPE_CHECKING: from haystack.nodes.retriever import BaseRetriever -import json import logging from pathlib import Path from typing import Union, List, Optional, Dict, Generator from tqdm.auto import tqdm import pinecone -import faiss import numpy as np from haystack.schema import Document from haystack.document_stores.sql import SQLDocumentStore from haystack.document_stores.base import get_batches_from_generator -from inspect import Signature, signature +from inspect import Signature logger = logging.getLogger(__name__) @@ -210,20 +208,9 @@ def _convert_pinecone_result_to_document( return document def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: dict): - # TODO probably not needed raise NotImplementedError("_validate_params_load_from_disk not implemented for PineconeDocumentStore") - allowed_params = ["faiss_index_path", "faiss_config_path", "self", "kwargs"] - invalid_param_set = False - for param in sig.parameters.values(): - if param.name not in allowed_params and param.default != locals[param.name]: - invalid_param_set = True - break - - if invalid_param_set or len(kwargs) > 0: - raise ValueError("if faiss_index_path is passed no other params besides faiss_config_path are allowed.") - - def _validate_index_sync(self): + def _validate_index_sync(self): # This check ensures the correct document database was loaded. # If it fails, make sure you provided the path to the database # used when creating the original Pinecone index @@ -234,7 +221,7 @@ def _validate_index_sync(self): "original index.") def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None, - batch_size: int = 10_000, duplicate_documents: Optional[str] = None, + batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None: """ Add new documents to the DocumentStore. 
@@ -278,35 +265,24 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O if len(document_objects) > 0: add_vectors = False if document_objects[0].embedding is None else True # I don't think below is required - """ - if self.duplicate_documents == "overwrite" and add_vectors: - logger.warning("You have to provide `duplicate_documents = 'overwrite'` arg and " - "`FAISSDocumentStore` does not support update in existing `faiss_index`.\n" - "Please call `update_embeddings` method to repopulate `faiss_index`") - """ - with tqdm(total = len(document_objects), disable =not self.progress_bar, position=0, desc="Writing Documents") as progress_bar: for i in range(0, len(document_objects), batch_size): ids = [doc.id for doc in document_objects[i: i + batch_size]] - # TODO find way to identify long metadata fields and split these to be stored in SQL + # metadata fields are stored in Pinecone metadata = [doc.meta for doc in document_objects[i: i + batch_size]] if add_vectors: embeddings = [doc.embedding for doc in document_objects[i: i + batch_size]] embeddings_to_index = np.array(embeddings, dtype="float32") if self.similarity=="cosine": self.normalize_embedding(embeddings_to_index) - # TODO not sure if required to convert to list objects (maybe already are) + # to convert to list objects embeddings = [embed.tolist() for embed in embeddings] vectors = zip(ids, embeddings, metadata) self.pinecone_indexes[index].upsert(vectors=vectors) docs_to_write_in_sql = [] for doc in document_objects[i: i + batch_size]: - # TODO I think this is not necessary as we have doc.id, before was required - # to map from doc.id to the integer 'vector_id' values used by faiss - but - # we do need to use vector_id as this is used by the sql doc store - #if add_vectors: doc.meta["vector_id"] = doc.id docs_to_write_in_sql.append(doc) super(PineconeDocumentStore, self).write_documents(docs_to_write_in_sql, index=index, @@ -319,13 +295,42 @@ def _create_document_field_map(self) -> Dict: self.index: self.embedding_field, } + def _build_filter_clause(self, filters: Dict[str, Union[str, int, float, bool, list]]) -> dict: + """ + Transform Haystack filter conditions to Pinecone metadata filter syntax. + Haystack syntax == {'item_id': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']} + Pinecone syntax == {'item_id': {'$in': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']}} + """ + pinecone_filter = {} + for key, value in filters.items(): + if key in ['$and', '$or'] and type(value) is dict: + sublist = [] + for sub_key, sub_value in value.items(): + sublist.append(self._build_filter_clause({sub_key: sub_value})) + pinecone_filter[key] = sublist + elif type(value) is list and key[0] != '$': + pinecone_filter[key] = {'$in': value} + elif type(value) is list and key in ['$and', '$or']: + # check if we have more operators in sublist + sublist = [] + for sub_item in value: + print(f"sub_item: {sub_item}") + if type(sub_item) is dict: + sublist.append(self._build_filter_clause(sub_item)) + else: + sublist.append(sub_item) + pinecone_filter[key] = sublist + else: + pinecone_filter[key] = value + return pinecone_filter + def update_embeddings( self, retriever: 'BaseRetriever', index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict] = None, - batch_size: int = 10_000 + batch_size: int = 32 ): """ Updates the embeddings in the the document store using the encoding model specified in the retriever. 
@@ -382,7 +387,6 @@ def update_embeddings( metadata = [] ids = [] for doc in document_batch: - # TODO if vector_id unecessary then rewrite below (maybe it is needed) metadata.append({key: value for key, value in doc.meta.items() if key != "vector_id"}) ids.append(doc.id) # update existing vectors in pinecone index @@ -396,7 +400,7 @@ def get_all_documents( index: Optional[str] = None, filters: Optional[Dict] = None, return_embedding: Optional[bool] = None, - batch_size: int = 10_000, + batch_size: int = 32, headers: Optional[Dict[str, str]] = None ) -> List[Document]: if headers: @@ -416,7 +420,7 @@ def get_all_documents_generator( index: Optional[str] = None, filters: Optional[Dict] = None, return_embedding: Optional[bool] = None, - batch_size: int = 10_000, + batch_size: int = 32, headers: Optional[Dict[str, str]] = None ) -> Generator[Document, None, None]: """ @@ -458,7 +462,7 @@ def get_all_documents_generator( yield doc def get_documents_by_id( - self, ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None + self, ids: List[str], index: Optional[str] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None ) -> List[Document]: if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -466,7 +470,7 @@ def get_documents_by_id( index = index or self.index index = self._sanitize_index_name(index) - # TODO could put this repetative chunk in a _method? + # get or create index if not self.pinecone_indexes.get(index): self.pinecone_indexes[index] = self._create_index_if_not_exist( vector_dim=self.vector_dim, @@ -585,6 +589,8 @@ def query_by_embedding( if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") self._limit_check(top_k, include_values=return_embedding) + if filters: + filters = self._build_filter_clause(filters) index = index or self.index index = self._sanitize_index_name(index) From 9657c48a9c9d1a925c1bf0f45a96994c530f77a8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sat, 26 Feb 2022 12:10:54 +0000 Subject: [PATCH 05/58] Update Documentation & Code Style --- haystack/document_stores/pinecone.py | 55 ++++++++++++++++------------ 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 90d1a1da11..8d595ddaa9 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -211,14 +211,21 @@ def _validate_index_sync(self): # If it fails, make sure you provided the path to the database # used when creating the original Pinecone index if not self.get_document_count() == self.get_embedding_count(): - raise ValueError("The number of documents present in the SQL database does not " - "match the number of embeddings in Pinecone. Make sure your Pinecone " - "index aligns to the same database that was used when creating the " - "original index.") - - def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None, - batch_size: int = 32, duplicate_documents: Optional[str] = None, - headers: Optional[Dict[str, str]] = None) -> None: + raise ValueError( + "The number of documents present in the SQL database does not " + "match the number of embeddings in Pinecone. Make sure your Pinecone " + "index aligns to the same database that was used when creating the " + "original index." 
+ ) + + def write_documents( + self, + documents: Union[List[dict], List[Document]], + index: Optional[str] = None, + batch_size: int = 32, + duplicate_documents: Optional[str] = None, + headers: Optional[Dict[str, str]] = None, + ) -> None: """ Add new documents to the DocumentStore. @@ -262,24 +269,26 @@ def write_documents(self, documents: Union[List[dict], List[Document]], index: O if len(document_objects) > 0: add_vectors = False if document_objects[0].embedding is None else True # I don't think below is required - with tqdm(total = len(document_objects), disable =not self.progress_bar, position=0, - desc="Writing Documents") as progress_bar: + with tqdm( + total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents" + ) as progress_bar: for i in range(0, len(document_objects), batch_size): - ids = [doc.id for doc in document_objects[i: i + batch_size]] + ids = [doc.id for doc in document_objects[i : i + batch_size]] # metadata fields are stored in Pinecone - metadata = [doc.meta for doc in document_objects[i: i + batch_size]] + metadata = [doc.meta for doc in document_objects[i : i + batch_size]] if add_vectors: embeddings = [doc.embedding for doc in document_objects[i : i + batch_size]] embeddings_to_index = np.array(embeddings, dtype="float32") - if self.similarity=="cosine": self.normalize_embedding(embeddings_to_index) + if self.similarity == "cosine": + self.normalize_embedding(embeddings_to_index) # to convert to list objects embeddings = [embed.tolist() for embed in embeddings] vectors = zip(ids, embeddings, metadata) self.pinecone_indexes[index].upsert(vectors=vectors) docs_to_write_in_sql = [] - for doc in document_objects[i: i + batch_size]: + for doc in document_objects[i : i + batch_size]: doc.meta["vector_id"] = doc.id docs_to_write_in_sql.append(doc) super(PineconeDocumentStore, self).write_documents( @@ -294,21 +303,21 @@ def _create_document_field_map(self) -> Dict: } def _build_filter_clause(self, filters: Dict[str, Union[str, int, float, bool, list]]) -> dict: - """ + """ Transform Haystack filter conditions to Pinecone metadata filter syntax. Haystack syntax == {'item_id': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']} Pinecone syntax == {'item_id': {'$in': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']}} """ pinecone_filter = {} for key, value in filters.items(): - if key in ['$and', '$or'] and type(value) is dict: + if key in ["$and", "$or"] and type(value) is dict: sublist = [] for sub_key, sub_value in value.items(): sublist.append(self._build_filter_clause({sub_key: sub_value})) pinecone_filter[key] = sublist - elif type(value) is list and key[0] != '$': - pinecone_filter[key] = {'$in': value} - elif type(value) is list and key in ['$and', '$or']: + elif type(value) is list and key[0] != "$": + pinecone_filter[key] = {"$in": value} + elif type(value) is list and key in ["$and", "$or"]: # check if we have more operators in sublist sublist = [] for sub_item in value: @@ -328,7 +337,7 @@ def update_embeddings( index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict] = None, - batch_size: int = 32 + batch_size: int = 32, ): """ Updates the embeddings in the the document store using the encoding model specified in the retriever. 
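For reference, the batch loop above normalises embeddings when cosine similarity is used and then hands Pinecone a list of `(id, vector, metadata)` tuples. A rough, self-contained sketch of those two steps, with placeholder values and an L2 normalisation standing in for `normalize_embedding()`:

```python
import numpy as np

# Placeholder batch: two documents with made-up ids, metadata and 768-dim vectors.
ids = ["doc-1", "doc-2"]
metadata = [{"genre": "economy"}, {"genre": "politics"}]
embeddings = np.random.rand(2, 768).astype("float32")

# Rough equivalent of the cosine branch above: L2-normalise each embedding row.
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

# Payload shape handed to Pinecone's upsert() in the batch loop above.
vectors = list(zip(ids, [emb.tolist() for emb in embeddings], metadata))
# self.pinecone_indexes[index].upsert(vectors=vectors)
```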
@@ -401,7 +410,7 @@ def get_all_documents( filters: Optional[Dict] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, - headers: Optional[Dict[str, str]] = None + headers: Optional[Dict[str, str]] = None, ) -> List[Document]: if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -421,7 +430,7 @@ def get_all_documents_generator( filters: Optional[Dict] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, - headers: Optional[Dict[str, str]] = None + headers: Optional[Dict[str, str]] = None, ) -> Generator[Document, None, None]: """ Get all documents from the document store. Under-the-hood, documents are fetched in batches from the @@ -465,7 +474,7 @@ def get_documents_by_id( ids: List[str], index: Optional[str] = None, batch_size: int = 32, - headers: Optional[Dict[str, str]] = None + headers: Optional[Dict[str, str]] = None, ) -> List[Document]: if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") From 5e06721cf0ee644ff001840639964607b5c1d7aa Mon Sep 17 00:00:00 2001 From: James Briggs Date: Sun, 27 Feb 2022 16:07:41 +0700 Subject: [PATCH 06/58] removed debugging code --- haystack/document_stores/pinecone.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 8d595ddaa9..f1dbc002e8 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -321,7 +321,6 @@ def _build_filter_clause(self, filters: Dict[str, Union[str, int, float, bool, l # check if we have more operators in sublist sublist = [] for sub_item in value: - print(f"sub_item: {sub_item}") if type(sub_item) is dict: sublist.append(self._build_filter_clause(sub_item)) else: From a7ed9ba9e145803345d84d2b4a705f5ff38f6d04 Mon Sep 17 00:00:00 2001 From: James Briggs Date: Wed, 2 Mar 2022 18:29:53 +0700 Subject: [PATCH 07/58] updated Pinecone filtering to use filter_utils --- haystack/document_stores/filter_utils.py | 70 ++++++++++++++++++++++++ haystack/document_stores/pinecone.py | 59 +++++--------------- 2 files changed, 83 insertions(+), 46 deletions(-) diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index d1d69196a7..d146e54bcb 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -145,6 +145,12 @@ def convert_to_weaviate(self): """ pass + def convert_to_pinecone(self): + """ + Converts the LogicalFilterClause instance to a Pinecone filter. + """ + pass + def _merge_es_range_queries(self, conditions: List[Dict]) -> List[Dict[str, Dict]]: """ Merges Elasticsearch range queries that perform on the same metadata field. @@ -237,6 +243,12 @@ def convert_to_weaviate(self): """ pass + def convert_to_pinecone(self): + """ + Converts the ComparisonOperation instance to a Pinecone comparison operator. 
+ """ + pass + @abstractmethod def invert(self) -> "ComparisonOperation": """ @@ -308,6 +320,14 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, int, float, bool, List[Dic return {"operator": "Or", "operands": conditions} else: return conditions[0] + + def convert_to_pinecone(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]: + conditions = [condition.invert().convert_to_pinecone() for condition in self.conditions] + if len(conditions) > 1: + # Conditions in self.conditions are by default combined with AND which becomes OR according to DeMorgan + return {"$or": conditions} + else: + return conditions[0] def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]: # This method is called when a "$not" operation is embedded in another "$not" operation. Therefore, we don't @@ -343,6 +363,10 @@ def convert_to_sql(self, meta_document_orm): def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: conditions = [condition.convert_to_weaviate() for condition in self.conditions] return {"operator": "And", "operands": conditions} + + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: + conditions = [condition.convert_to_pinecone() for condition in self.conditions] + return {"$and": conditions} def invert(self) -> "OrOperation": return OrOperation([condition.invert() for condition in self.conditions]) @@ -372,6 +396,10 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: conditions = [condition.convert_to_weaviate() for condition in self.conditions] return {"operator": "Or", "operands": conditions} + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: + conditions = [condition.convert_to_pinecone() for condition in self.conditions] + return {"$or": conditions} + def invert(self) -> AndOperation: return AndOperation([condition.invert() for condition in self.conditions]) @@ -398,6 +426,9 @@ def convert_to_sql(self, meta_document_orm): def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]: comp_value_type, comp_value = self._get_weaviate_datatype() return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value} + + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, int, float, bool]]: + return {self.field_name: {"$eq": self.comparison_value}} def invert(self) -> "NeOperation": return NeOperation(self.field_name, self.comparison_value) @@ -435,6 +466,16 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: return filter_dict + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: + filter_dict: Dict[str, Union[str, List[Dict]]] = {self.field_name: {"$in": []}} + assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list." 
+ for value in self.comparison_value: + assert isinstance(filter_dict[self.field_name]["$in"], list) # Necessary for mypy + filter_dict[self.field_name]["$in"].append( + value + ) + return filter_dict + def invert(self) -> "NinOperation": return NinOperation(self.field_name, self.comparison_value) @@ -461,6 +502,9 @@ def convert_to_sql(self, meta_document_orm): def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]: comp_value_type, comp_value = self._get_weaviate_datatype() return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value} + + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, int, float, bool]]: + return {self.field_name: {"$ne": self.comparison_value}} def invert(self) -> "EqOperation": return EqOperation(self.field_name, self.comparison_value) @@ -497,6 +541,16 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: ) return filter_dict + + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: + filter_dict: Dict[str, Union[str, List[Dict]]] = {self.field_name: {"$nin": []}} + assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list." + for value in self.comparison_value: + assert isinstance(filter_dict[self.field_name]["$nin"], list) # Necessary for mypy + filter_dict[self.field_name]["$nin"].append( + value + ) + return filter_dict def invert(self) -> "InOperation": return InOperation(self.field_name, self.comparison_value) @@ -525,6 +579,10 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: comp_value_type, comp_value = self._get_weaviate_datatype() assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list." return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value} + + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list." + return {self.field_name: {"$gt": self.comparison_value}} def invert(self) -> "LteOperation": return LteOperation(self.field_name, self.comparison_value) @@ -553,6 +611,10 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: comp_value_type, comp_value = self._get_weaviate_datatype() assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list." return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value} + + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list." + return {self.field_name: {"$gte": self.comparison_value}} def invert(self) -> "LtOperation": return LtOperation(self.field_name, self.comparison_value) @@ -582,6 +644,10 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list." return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value} + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list." 
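End to end, these conversions are meant to be driven through `LogicalFilterClause.parse()`, mirroring the call added to `query_by_embedding` later in this patch. A sketch with placeholder filter values; the exact nesting of the result may differ slightly:

```python
from haystack.document_stores.filter_utils import LogicalFilterClause

filters = {"genre": {"$in": ["economy", "politics"]}, "date": {"$gte": "2015-01-01"}}
pinecone_filter = LogicalFilterClause.parse(filters).convert_to_pinecone()
# Expected to resemble:
# {"$and": [{"genre": {"$in": ["economy", "politics"]}},
#           {"date": {"$gte": "2015-01-01"}}]}
print(pinecone_filter)
```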
+ return {self.field_name: {"$lt": self.comparison_value}} + def invert(self) -> "GteOperation": return GteOperation(self.field_name, self.comparison_value) @@ -609,6 +675,10 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: comp_value_type, comp_value = self._get_weaviate_datatype() assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list." return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value} + + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list." + return {self.field_name: {"$lte": self.comparison_value}} def invert(self) -> "GtOperation": return GtOperation(self.field_name, self.comparison_value) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index f1dbc002e8..96ff1ba58a 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -14,6 +14,7 @@ from haystack.schema import Document from haystack.document_stores.sql import SQLDocumentStore from haystack.document_stores.base import get_batches_from_generator +from haystack.document_stores.filter_utils import LogicalFilterClause from inspect import Signature logger = logging.getLogger(__name__) @@ -39,7 +40,7 @@ def __init__( environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional["pinecone.Index"] = None, - vector_dim: int = 768, + embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", @@ -57,7 +58,7 @@ def __init__( :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale deployment, Postgres is recommended. :param pinecone_index: pinecone-client Index object, an index will be initialized or loaded if not specified. - :param vector_dim: the embedding vector size. + :param embedding_dim: the embedding vector size. :param return_embedding: To return document embedding :param index: Name of index in document store to use. :param similarity: The similarity function used to compare document vectors. 
'dot_product' is the default since it is @@ -93,7 +94,7 @@ def __init__( ) self.index = index - self.vector_dim = vector_dim + self.embedding_dim = embedding_dim self.return_embedding = return_embedding self.embedding_field = embedding_field self.progress_bar = progress_bar @@ -110,7 +111,7 @@ def __init__( self.pinecone_indexes[clean_index] = pinecone_index else: self.pinecone_indexes[clean_index] = self._create_index_if_not_exist( - vector_dim=self.vector_dim, + embedding_dim=self.embedding_dim, index=clean_index, metric_type=self.metric_type, replicas=self.replicas, @@ -138,7 +139,7 @@ def _sanitize_index_name(self, index: Optional[str]) -> Optional[str]: def _create_index_if_not_exist( self, - vector_dim: int, + embedding_dim: int, index: Optional[str] = None, metric_type: Optional[str] = "cosine", replicas: Optional[int] = 1, @@ -158,7 +159,7 @@ def _create_index_if_not_exist( # search pinecone hosted indexes and create if it does not exist if index not in pinecone.list_indexes(): pinecone.create_index( - name=index, dimension=vector_dim, metric=metric_type, replicas=replicas, shards=shards + name=index, dimension=embedding_dim, metric=metric_type, replicas=replicas, shards=shards ) index_conn = pinecone.Index(index) @@ -172,9 +173,7 @@ def _create_index_if_not_exist( def _convert_pinecone_result_to_document(self, result: dict, return_embedding: bool) -> Document: """ - Convert Pinecone result dict into haystack document object. This is more involved because - weaviate search result dict varies between get and query interfaces. - Weaviate get methods return the data items in properties key, whereas the query doesn't. + Convert Pinecone result dict into haystack document object. """ score = None content = "" @@ -203,9 +202,6 @@ def _convert_pinecone_result_to_document(self, result: dict, return_embedding: b ) return document - def _validate_params_load_from_disk(self, sig: Signature, locals: dict, kwargs: dict): - raise NotImplementedError("_validate_params_load_from_disk not implemented for PineconeDocumentStore") - def _validate_index_sync(self): # This check ensures the correct document database was loaded. # If it fails, make sure you provided the path to the database @@ -254,7 +250,7 @@ def write_documents( if not self.pinecone_indexes.get(index): self.pinecone_indexes[index] = self._create_index_if_not_exist( - vector_dim=self.vector_dim, + embedding_dim=self.embedding_dim, index=index, metric_type=self.metric_type, replicas=self.replicas, @@ -268,7 +264,6 @@ def write_documents( ) if len(document_objects) > 0: add_vectors = False if document_objects[0].embedding is None else True - # I don't think below is required with tqdm( total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents" ) as progress_bar: @@ -302,34 +297,6 @@ def _create_document_field_map(self) -> Dict: self.index: self.embedding_field, } - def _build_filter_clause(self, filters: Dict[str, Union[str, int, float, bool, list]]) -> dict: - """ - Transform Haystack filter conditions to Pinecone metadata filter syntax. 
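For orientation, the renamed `embedding_dim` parameter ultimately feeds the `pinecone.create_index` call shown above. A minimal sketch of that flow using the same client calls as the diff; the API key and index name are placeholders:

```python
import pinecone

pinecone.init(api_key="<YOUR_API_KEY>", environment="us-west1-gcp")  # placeholder key
if "document" not in pinecone.list_indexes():
    pinecone.create_index(name="document", dimension=768, metric="cosine", replicas=1, shards=1)
index_connection = pinecone.Index("document")
print(index_connection.describe_index_stats())
```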
- Haystack syntax == {'item_id': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']} - Pinecone syntax == {'item_id': {'$in': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']}} - """ - pinecone_filter = {} - for key, value in filters.items(): - if key in ["$and", "$or"] and type(value) is dict: - sublist = [] - for sub_key, sub_value in value.items(): - sublist.append(self._build_filter_clause({sub_key: sub_value})) - pinecone_filter[key] = sublist - elif type(value) is list and key[0] != "$": - pinecone_filter[key] = {"$in": value} - elif type(value) is list and key in ["$and", "$or"]: - # check if we have more operators in sublist - sublist = [] - for sub_item in value: - if type(sub_item) is dict: - sublist.append(self._build_filter_clause(sub_item)) - else: - sublist.append(sub_item) - pinecone_filter[key] = sublist - else: - pinecone_filter[key] = value - return pinecone_filter - def update_embeddings( self, retriever: "BaseRetriever", @@ -484,7 +451,7 @@ def get_documents_by_id( # get or create index if not self.pinecone_indexes.get(index): self.pinecone_indexes[index] = self._create_index_if_not_exist( - vector_dim=self.vector_dim, + embedding_dim=self.embedding_dim, index=index, metric_type=self.metric_type, replicas=self.replicas, @@ -521,7 +488,7 @@ def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dic index = self._sanitize_index_name(index) if not self.pinecone_indexes.get(index): self.pinecone_indexes[index] = self._create_index_if_not_exist( - vector_dim=self.vector_dim, + embedding_dim=self.embedding_dim, index=self.index, metric_type=self.metric_type, replicas=self.replicas, @@ -571,7 +538,7 @@ def delete_documents( index = self._sanitize_index_name(index) if not self.pinecone_indexes.get(index): self.pinecone_indexes[index] = self._create_index_if_not_exist( - vector_dim=self.vector_dim, + embedding_dim=self.embedding_dim, index=self.index, metric_type=self.metric_type, replicas=self.replicas, @@ -606,7 +573,7 @@ def query_by_embedding( raise NotImplementedError("PineconeDocumentStore does not support headers.") self._limit_check(top_k, include_values=return_embedding) if filters: - filters = self._build_filter_clause(filters) + filters = LogicalFilterClause.parse(filters).convert_to_pinecone() index = index or self.index index = self._sanitize_index_name(index) From dc2b80f69833270c3ba0e9fede5a1882d3767c2c Mon Sep 17 00:00:00 2001 From: James Briggs Date: Wed, 2 Mar 2022 18:43:50 +0700 Subject: [PATCH 08/58] removed uneeded methods and minor tweaks to current methods --- haystack/document_stores/pinecone.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 96ff1ba58a..200febf8be 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -460,7 +460,7 @@ def get_documents_by_id( # check there are vectors count = self.get_embedding_count(index) if count == 0: - raise Exception("No documents exist, try creating documents with write_embeddings first.") + raise Exception("No documents exist, try creating documents with either write_embeddings or update_embeddings first.") res = self.pinecone_indexes[index].fetch(ids=ids) # convert Pinecone responses to documents documents = [] @@ -500,17 +500,6 @@ def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dic count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 return count - def train_index( - self, - 
documents: Optional[Union[List[dict], List[Document]]], - embeddings: Optional[np.ndarray] = None, - index: Optional[str] = None, - ): - """ - Not applicable to PineconeDocumentStore. - """ - raise NotImplementedError("PineconeDocumentStore does not require training") - def delete_documents( self, index: Optional[str] = None, @@ -613,17 +602,6 @@ def query_by_embedding( return documents - def save(self): - """ - Save index to the specified file, not implemented for PineconeDocumentStore. - """ - raise NotImplementedError("save method not implemented for PineconeDocumentStore") - - def _load_init_params_from_config( - self, index_path: Optional[Union[str, Path]] = None, config_path: Optional[Union[str, Path]] = None - ): - raise NotImplementedError("Load init params from config not implemented for Pinecone") - def _limit_check(self, top_k: str, include_values: Optional[bool] = None): """ Confirms the top_k value does not exceed Pinecone vector database limits. @@ -642,7 +620,7 @@ def _limit_check(self, top_k: str, include_values: Optional[bool] = None): ) @classmethod - def load(): + def load(cls): """ Default class method used for loading indexes. Not applicable to the PineconeDocumentStore. """ From 227da6d178b28d7a26c8c6ea28975565f33d6d96 Mon Sep 17 00:00:00 2001 From: James Briggs Date: Wed, 2 Mar 2022 19:25:42 +0700 Subject: [PATCH 09/58] fixed typing issues --- haystack/document_stores/pinecone.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 200febf8be..4a4a2ee8d1 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -129,10 +129,8 @@ def __init__( self._validate_index_sync() - def _sanitize_index_name(self, index: Optional[str]) -> Optional[str]: - if index is None: - return None - elif "_" in index: + def _sanitize_index_name(self, index: str) -> str: + if "_" in index: return index.replace("_", "-").lower() else: return index.lower() @@ -181,7 +179,7 @@ def _convert_pinecone_result_to_document(self, result: dict, return_embedding: b id = result.get("id") score = result.get("score") embedding = result.get("values") - meta = result.get("metadata") + meta = result.get("metadata") or {} content_type = None if meta.get("contenttype") is not None: @@ -602,7 +600,7 @@ def query_by_embedding( return documents - def _limit_check(self, top_k: str, include_values: Optional[bool] = None): + def _limit_check(self, top_k: int, include_values: Optional[bool] = None): """ Confirms the top_k value does not exceed Pinecone vector database limits. 
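The typing fix above also pins down `_sanitize_index_name`'s behaviour: it now always takes and returns a plain string. A quick check on hypothetical index names:

```python
def sanitize(index: str) -> str:
    # Mirrors the helper above: lower-case the name and turn "_" into "-".
    return index.replace("_", "-").lower() if "_" in index else index.lower()

assert sanitize("Document") == "document"
assert sanitize("My_Document_Store") == "my-document-store"
```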
""" From 384391bab3714cd000875bacc1f619ceac59d58d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:28:18 +0000 Subject: [PATCH 10/58] Update Documentation & Code Style --- haystack/document_stores/filter_utils.py | 26 ++++++++++-------------- haystack/document_stores/pinecone.py | 4 +++- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index d146e54bcb..8a0eec5b1f 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -320,7 +320,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, int, float, bool, List[Dic return {"operator": "Or", "operands": conditions} else: return conditions[0] - + def convert_to_pinecone(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]: conditions = [condition.invert().convert_to_pinecone() for condition in self.conditions] if len(conditions) > 1: @@ -363,7 +363,7 @@ def convert_to_sql(self, meta_document_orm): def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: conditions = [condition.convert_to_weaviate() for condition in self.conditions] return {"operator": "And", "operands": conditions} - + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: conditions = [condition.convert_to_pinecone() for condition in self.conditions] return {"$and": conditions} @@ -426,7 +426,7 @@ def convert_to_sql(self, meta_document_orm): def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]: comp_value_type, comp_value = self._get_weaviate_datatype() return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value} - + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, int, float, bool]]: return {self.field_name: {"$eq": self.comparison_value}} @@ -471,9 +471,7 @@ def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list." for value in self.comparison_value: assert isinstance(filter_dict[self.field_name]["$in"], list) # Necessary for mypy - filter_dict[self.field_name]["$in"].append( - value - ) + filter_dict[self.field_name]["$in"].append(value) return filter_dict def invert(self) -> "NinOperation": @@ -502,7 +500,7 @@ def convert_to_sql(self, meta_document_orm): def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, bool]]: comp_value_type, comp_value = self._get_weaviate_datatype() return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value} - + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, int, float, bool]]: return {self.field_name: {"$ne": self.comparison_value}} @@ -541,15 +539,13 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: ) return filter_dict - + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: filter_dict: Dict[str, Union[str, List[Dict]]] = {self.field_name: {"$nin": []}} assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list." 
for value in self.comparison_value: assert isinstance(filter_dict[self.field_name]["$nin"], list) # Necessary for mypy - filter_dict[self.field_name]["$nin"].append( - value - ) + filter_dict[self.field_name]["$nin"].append(value) return filter_dict def invert(self) -> "InOperation": @@ -579,7 +575,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: comp_value_type, comp_value = self._get_weaviate_datatype() assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list." return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value} - + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list." return {self.field_name: {"$gt": self.comparison_value}} @@ -611,7 +607,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: comp_value_type, comp_value = self._get_weaviate_datatype() assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list." return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value} - + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list." return {self.field_name: {"$gte": self.comparison_value}} @@ -647,7 +643,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list." return {self.field_name: {"$lt": self.comparison_value}} - + def invert(self) -> "GteOperation": return GteOperation(self.field_name, self.comparison_value) @@ -675,7 +671,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: comp_value_type, comp_value = self._get_weaviate_datatype() assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list." return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value} - + def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list." return {self.field_name: {"$lte": self.comparison_value}} diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 4a4a2ee8d1..8c1b84e722 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -458,7 +458,9 @@ def get_documents_by_id( # check there are vectors count = self.get_embedding_count(index) if count == 0: - raise Exception("No documents exist, try creating documents with either write_embeddings or update_embeddings first.") + raise Exception( + "No documents exist, try creating documents with either write_embeddings or update_embeddings first." 
+ ) res = self.pinecone_indexes[index].fetch(ids=ids) # convert Pinecone responses to documents documents = [] From bd9c35521ef33cafb0c73a70e90d2b69d52d024a Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Wed, 2 Mar 2022 16:57:18 +0100 Subject: [PATCH 11/58] Allow filters in al methods except get_embedding_count --- haystack/document_stores/filter_utils.py | 26 +- haystack/document_stores/pinecone.py | 485 ++++++++++++++--------- 2 files changed, 296 insertions(+), 215 deletions(-) diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index 8a0eec5b1f..608f69facd 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -466,13 +466,8 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: return filter_dict - def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: - filter_dict: Dict[str, Union[str, List[Dict]]] = {self.field_name: {"$in": []}} - assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list." - for value in self.comparison_value: - assert isinstance(filter_dict[self.field_name]["$in"], list) # Necessary for mypy - filter_dict[self.field_name]["$in"].append(value) - return filter_dict + def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: + return {self.field_name: {"$in": self.comparison_value}} def invert(self) -> "NinOperation": return NinOperation(self.field_name, self.comparison_value) @@ -540,13 +535,8 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: return filter_dict - def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: - filter_dict: Dict[str, Union[str, List[Dict]]] = {self.field_name: {"$nin": []}} - assert isinstance(self.comparison_value, list), "'$nin' operation requires comparison value to be a list." - for value in self.comparison_value: - assert isinstance(filter_dict[self.field_name]["$nin"], list) # Necessary for mypy - filter_dict[self.field_name]["$nin"].append(value) - return filter_dict + def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: + return {self.field_name: {"$nin": self.comparison_value}} def invert(self) -> "InOperation": return InOperation(self.field_name, self.comparison_value) @@ -576,7 +566,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(comp_value, list), "Comparison value for '$gt' operation must not be a list." return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value} - def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list." return {self.field_name: {"$gt": self.comparison_value}} @@ -608,7 +598,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(comp_value, list), "Comparison value for '$gte' operation must not be a list." return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value} - def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list." 
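After this simplification the `"$in"`/`"$nin"` conversions return the Pinecone dict directly. A small check of the intended behaviour; the `(field_name, comparison_value)` constructor signature is assumed from the surrounding class hierarchy:

```python
from haystack.document_stores.filter_utils import InOperation, NinOperation

assert InOperation("genre", ["economy", "politics"]).convert_to_pinecone() == {
    "genre": {"$in": ["economy", "politics"]}
}
assert NinOperation("publisher", ["nytimes"]).convert_to_pinecone() == {
    "publisher": {"$nin": ["nytimes"]}
}
```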
return {self.field_name: {"$gte": self.comparison_value}} @@ -640,7 +630,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(comp_value, list), "Comparison value for '$lt' operation must not be a list." return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value} - def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list." return {self.field_name: {"$lt": self.comparison_value}} @@ -672,7 +662,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: assert not isinstance(comp_value, list), "Comparison value for '$lte' operation must not be a list." return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value} - def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, float, int]]: + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list." return {self.field_name: {"$lte": self.comparison_value}} diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 8c1b84e722..4605254d9c 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -4,7 +4,6 @@ from haystack.nodes.retriever import BaseRetriever import logging -from pathlib import Path from typing import Union, List, Optional, Dict, Generator from tqdm.auto import tqdm @@ -15,7 +14,7 @@ from haystack.document_stores.sql import SQLDocumentStore from haystack.document_stores.base import get_batches_from_generator from haystack.document_stores.filter_utils import LogicalFilterClause -from inspect import Signature + logger = logging.getLogger(__name__) @@ -24,7 +23,7 @@ class PineconeDocumentStore(SQLDocumentStore): """ Document store for very large scale embedding based dense retrievers like the DPR. - It implements the Pinecone vector database (https://www.pinecone.io) + It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) to perform similarity search on vectors. The document text is stored using the SQLDocumentStore, while @@ -49,40 +48,46 @@ def __init__( embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", - **kwargs, ): """ - :param api_key: Pinecone vector database API key (https://app.pinecone.io) - :param environment: Pinecone cloud environment uses "us-west1-gcp" by default. Other GCP and AWS regions are supported, - contact Pinecone if required. + :param api_key: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). + :param environment: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are + supported, contact Pinecone if required. :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale - deployment, Postgres is recommended. + deployment, Postgres is recommended. :param pinecone_index: pinecone-client Index object, an index will be initialized or loaded if not specified. - :param embedding_dim: the embedding vector size. - :param return_embedding: To return document embedding + :param embedding_dim: The embedding vector size. 
+ :param return_embedding: Whether to return document embeddings. :param index: Name of index in document store to use. - :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is - more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence-Transformer model. - In both cases, the returned values in Document.score are normalized to be in range [0,1]: - For `dot_product`: expit(np.asarray(raw_score / 100)) - For `cosine`: (raw_score + 1) / 2 - :param replicas: The number of replicas. Replicas duplicate your index. They provide higher availability and - throughput. - :param shards: The number of shards to be used in the index. We recommend you use 1 shard per 1GB of data. + :param similarity: The similarity function used to compare document vectors. `"dot_product"` is the default + since it is more performant with DPR embeddings. `"cosine"` is recommended if you are using a + Sentence-Transformer model. + In both cases, the returned values in Document.score are normalized to be in range [0,1]: + - For `"dot_product"`: `expit(np.asarray(raw_score / 100))` + - For `"cosine"`: `(raw_score + 1) / 2` + :param replicas: The number of replicas. Replicas duplicate the index. They provide higher availability and + throughput. + :param shards: The number of shards to be used in the index. We recommend to use 1 shard per 1GB of data. :param embedding_field: Name of field containing an embedding vector. :param progress_bar: Whether to show a tqdm progress bar or not. - Can be helpful to disable in production deployments to keep the logs clean. - :param duplicate_documents: Handle duplicates document based on parameter options. - Parameter options : ( 'skip','overwrite','fail') - skip: Ignore the duplicates documents - overwrite: Update any existing documents with the same ID when adding documents. - fail: an error is raised if the document ID of the document being added already - exists. + Can be helpful to disable in production deployments to keep the logs clean. + :param duplicate_documents: Handle duplicates document based on parameter options.\ + + Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. 
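A minimal construction sketch matching the parameters documented above; the API key is a placeholder, the import path is the one this patch series wires up, and the SQL URL is just the default:

```python
from haystack.document_stores import PineconeDocumentStore

document_store = PineconeDocumentStore(
    api_key="<YOUR_PINECONE_API_KEY>",                 # placeholder
    environment="us-west1-gcp",
    sql_url="sqlite:///pinecone_document_store.db",    # default local SQLite DB
    embedding_dim=768,
    similarity="cosine",
    index="document",
    duplicate_documents="overwrite",
)
```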
""" + # Save init parameters to enable export of component config as YAML + self.set_config(api_key=api_key, environment=environment, sql_url=sql_url, embedding_dim=embedding_dim, + return_embedding=return_embedding, index=index, similarity=similarity, replicas=replicas, + shards=shards, embedding_field=embedding_field, progress_bar=progress_bar, + duplicate_documents=duplicate_documents) + # Connect to Pinecone server using python client binding pinecone.init(api_key=api_key, environment=environment) - # formal similarity string + # Formal similarity string if similarity in ("dot_product", "cosine"): self.metric_type = similarity elif similarity in ("l2", "euclidean"): @@ -104,7 +109,7 @@ def __init__( self.replicas = replicas self.shards = shards - # initialize dictionary of index connections + # Initialize dictionary of index connections self.pinecone_indexes: Dict[str, pinecone.Index] = {} clean_index = self._sanitize_index_name(index) if pinecone_index: @@ -144,46 +149,42 @@ def _create_index_if_not_exist( shards: Optional[int] = 1, ): """ - Create a new index for storing documents in case if an + Create a new index for storing documents in case an index with the name doesn't exist already. """ index = index or self.index index = self._sanitize_index_name(index) - # if index already loaded can skip + # Skip if already exists if index in self.pinecone_indexes.keys(): - index_conn = self.pinecone_indexes[index] + index_connection = self.pinecone_indexes[index] else: - # search pinecone hosted indexes and create if it does not exist + # Search pinecone hosted indexes and create an index if it does not exist if index not in pinecone.list_indexes(): pinecone.create_index( name=index, dimension=embedding_dim, metric=metric_type, replicas=replicas, shards=shards ) - index_conn = pinecone.Index(index) + index_connection = pinecone.Index(index) - # get index statistics - stats = index_conn.describe_index_stats() + # Get index statistics + stats = index_connection.describe_index_stats() dims = stats["dimension"] count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") # return index connection - return index_conn + return index_connection def _convert_pinecone_result_to_document(self, result: dict, return_embedding: bool) -> Document: """ Convert Pinecone result dict into haystack document object. """ - score = None content = "" id = result.get("id") - score = result.get("score") + score = result.get("score", None) embedding = result.get("values") - meta = result.get("metadata") or {} - - content_type = None - if meta.get("contenttype") is not None: - content_type = str(meta.pop("contenttype")) + meta = result.get("metadata") + content_type = meta.pop("content_type") if "content_type" in meta else None if return_embedding and embedding: embedding = np.asarray(embedding, dtype=np.float32) @@ -201,9 +202,10 @@ def _convert_pinecone_result_to_document(self, result: dict, return_embedding: b return document def _validate_index_sync(self): - # This check ensures the correct document database was loaded. - # If it fails, make sure you provided the path to the database - # used when creating the original Pinecone index + """ + This check ensures the correct document database was loaded. If it fails, make sure you provided the same path + to the SQL database as when you created the original Pinecone index. 
+ """ if not self.get_document_count() == self.get_embedding_count(): raise ValueError( "The number of documents present in the SQL database does not " @@ -219,22 +221,22 @@ def write_documents( batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, - ) -> None: + ): """ Add new documents to the DocumentStore. :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index - them right away in Pinecone. If not, you can later call update_embeddings() to create & index them. - :param index: (SQL) index name for storing the docs and metadata - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - :param duplicate_documents: Handle duplicates document based on parameter options. - Parameter options : ( 'skip','overwrite','fail') - skip: Ignore the duplicates documents - overwrite: Update any existing documents with the same ID when adding documents. - fail: an error is raised if the document ID of the document being added already - exists. - :raises DuplicateDocumentError: Exception trigger on duplicate document - :return: None + them right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. + :param index: (SQL) index name for storing the docs and metadata. + :param batch_size: Number of Documents to process at a time. When working with large number of documents, + batching can help reduce memory footprint. + :param duplicate_documents: Handle duplicates document based on parameter options.\ + + Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. + :raises DuplicateDocumentError: Exception trigger on duplicate document. 
""" if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -242,11 +244,10 @@ def write_documents( index = index or self.index index = self._sanitize_index_name(index) duplicate_documents = duplicate_documents or self.duplicate_documents - assert ( - duplicate_documents in self.duplicate_documents_options - ), f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" + assert duplicate_documents in self.duplicate_documents_options, \ + f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" - if not self.pinecone_indexes.get(index): + if index not in self.pinecone_indexes: self.pinecone_indexes[index] = self._create_index_if_not_exist( embedding_dim=self.embedding_dim, index=index, @@ -266,69 +267,87 @@ def write_documents( total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents" ) as progress_bar: for i in range(0, len(document_objects), batch_size): - ids = [doc.id for doc in document_objects[i : i + batch_size]] - # metadata fields are stored in Pinecone - metadata = [doc.meta for doc in document_objects[i : i + batch_size]] + ids = [doc.id for doc in document_objects[i: i + batch_size]] + metadata = [doc.meta for doc in document_objects[i: i + batch_size]] if add_vectors: embeddings = [doc.embedding for doc in document_objects[i : i + batch_size]] embeddings_to_index = np.array(embeddings, dtype="float32") if self.similarity == "cosine": self.normalize_embedding(embeddings_to_index) - # to convert to list objects + # Convert embeddings to list objects embeddings = [embed.tolist() for embed in embeddings] - vectors = zip(ids, embeddings, metadata) - self.pinecone_indexes[index].upsert(vectors=vectors) + data_to_write_to_pinecone = zip(ids, embeddings, metadata) + # Metadata fields and embeddings are stored in Pinecone + self.pinecone_indexes[index].upsert(vectors=data_to_write_to_pinecone) - docs_to_write_in_sql = [] + docs_to_write_to_sql = [] for doc in document_objects[i : i + batch_size]: doc.meta["vector_id"] = doc.id - docs_to_write_in_sql.append(doc) + docs_to_write_to_sql.append(doc) super(PineconeDocumentStore, self).write_documents( - docs_to_write_in_sql, index=index, duplicate_documents=duplicate_documents + docs_to_write_to_sql, index=index, duplicate_documents=duplicate_documents ) progress_bar.update(batch_size) progress_bar.close() def _create_document_field_map(self) -> Dict: - return { - self.index: self.embedding_field, - } + return {self.embedding_field: "embedding"} def update_embeddings( self, retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, - filters: Optional[Dict] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32, ): """ Updates the embeddings in the the document store using the encoding model specified in the retriever. - This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config). - - :param retriever: Retriever to use to get embeddings for text - :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used. - :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False, - only documents without embeddings are processed. This mode can be used for - incremental updating of embeddings, wherein, only newly indexed documents - get processed. 
+ This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the + retriever config). + + :param retriever: Retriever to use to get embeddings for text. + :param index: Index name for which embeddings are to be updated. If set to `None`, the default `self.index` is + used. + :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to `False`, + only documents without embeddings are processed. This mode can be used for incremental updating of + embeddings, wherein, only newly indexed documents get processed. :param filters: Optional filters to narrow down the documents for which embeddings are to be updated. - Example: {"genre": {"$in": ["documentary", "action"]}}, - more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - :return: None + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + :param batch_size: Number of documents to process at a time. When working with large number of documents, + batching can help reduce memory footprint. """ - if filters: - raise Exception("update_embeddings does not support filtering.") - index = index or self.index index = self._sanitize_index_name(index) - if not self.pinecone_indexes.get(index): - raise ValueError("Couldn't find a Pinecone index. Try to init the PineconeDocumentStore() again ...") + if index not in self.pinecone_indexes: + raise ValueError(f"Couldn't find a the index '{index}' in Pinecone. 
Try to init the " + f"PineconeDocumentStore() again ...") - document_count = self.get_document_count(index=index) + document_count = self.get_document_count(index=index, filters=filters) if document_count == 0: logger.warning("Calling DocumentStore.update_embeddings() on an empty index") return @@ -351,10 +370,8 @@ def update_embeddings( assert len(document_batch) == len(embeddings) embeddings_to_index = np.array(embeddings, dtype="float32") - if self.similarity == "cosine": self.normalize_embedding(embeddings_to_index) - embeddings = embeddings.tolist() metadata = [] @@ -371,27 +388,24 @@ def update_embeddings( def get_all_documents( self, index: Optional[str] = None, - filters: Optional[Dict] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, ) -> List[Document]: + if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") - if filters: - raise Exception("get_all_documents does not support filters.") - self._limit_check(batch_size) - result = self.get_all_documents_generator( - index=index, filters=filters, return_embedding=return_embedding, batch_size=batch_size - ) + result = self.get_all_documents_generator(index=index, filters=filters, return_embedding=return_embedding, + batch_size=batch_size) documents = list(result) return documents def get_all_documents_generator( self, index: Optional[str] = None, - filters: Optional[Dict] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, @@ -403,34 +417,47 @@ def get_all_documents_generator( :param index: Name of the index to get the documents from. If None, the DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"genre": {"$in": ["documentary", "action"]}}, - more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ + :param filters: Optional filters to narrow down the documents for which embeddings are to be updated. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` :param return_embedding: Whether to return the document embeddings. :param batch_size: When working with large number of documents, batching can help reduce memory footprint. 
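Putting the methods above together, a hedged end-to-end sketch: the retriever class and model name are illustrative choices, not mandated by this patch, and `document_store` is the instance from the earlier sketch:

```python
from haystack.nodes import EmbeddingRetriever

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",  # assumed model name
)
document_store.update_embeddings(retriever, batch_size=32)

economy_docs = document_store.get_all_documents(
    filters={"genre": {"$in": ["economy", "politics"]}},
    return_embedding=False,
)
```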
""" if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") - if filters: - raise Exception("get_all_documents_generator does not support filters.") - self._limit_check(batch_size) + if return_embedding is None: + return_embedding = self.return_embedding index = index or self.index index = self._sanitize_index_name(index) documents = super(PineconeDocumentStore, self).get_all_documents_generator( index=index, filters=filters, batch_size=batch_size, return_embedding=False ) - if return_embedding is None: - return_embedding = self.return_embedding for doc in documents: if return_embedding: - if doc.meta and doc.meta.get("vector_id") is not None: - res = self.pinecone_indexes[index].fetch(ids=[doc.id]) - if res["vectors"].get(doc.id): - doc.embedding = self._convert_pinecone_result_to_document( - result=res["vectors"][doc.id], return_embedding=return_embedding - ).embedding + self._attach_embedding_to_document(document=doc, index=index) yield doc def get_documents_by_id( @@ -440,107 +467,104 @@ def get_documents_by_id( batch_size: int = 32, headers: Optional[Dict[str, str]] = None, ) -> List[Document]: + if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") - self._limit_check(batch_size) index = index or self.index index = self._sanitize_index_name(index) - # get or create index - if not self.pinecone_indexes.get(index): - self.pinecone_indexes[index] = self._create_index_if_not_exist( - embedding_dim=self.embedding_dim, - index=index, - metric_type=self.metric_type, - replicas=self.replicas, - shards=self.shards, - ) - # check there are vectors - count = self.get_embedding_count(index) - if count == 0: - raise Exception( - "No documents exist, try creating documents with either write_embeddings or update_embeddings first." - ) - res = self.pinecone_indexes[index].fetch(ids=ids) - # convert Pinecone responses to documents - documents = [] - for id_val in ids: - # check exists - if res["vectors"].get(id_val): - documents.append( - self._convert_pinecone_result_to_document( - result=res["vectors"][id_val], return_embedding=self.return_embedding - ) - ) - # get content from SQL - content = super().get_documents_by_id([doc.id for doc in documents]) - for i, doc in enumerate(documents): - doc.content = content[i].content + + documents = super().get_documents_by_id(ids=ids, index=index, batch_size=batch_size) + if self.return_embedding: + for doc in documents: + self._attach_embedding_to_document(document=doc, index=index) + return documents - def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dict] = None) -> int: + def get_embedding_count( + self, + index: Optional[str] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None + ) -> int: """ Return the count of embeddings in the document store. 
""" if filters: - raise Exception("Filters are not supported for get_embedding_count in PineconeDocumentStore") + raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") + index = index or self.index index = self._sanitize_index_name(index) - if not self.pinecone_indexes.get(index): - self.pinecone_indexes[index] = self._create_index_if_not_exist( - embedding_dim=self.embedding_dim, - index=self.index, - metric_type=self.metric_type, - replicas=self.replicas, - shards=self.shards, - ) + if not self.pinecone_indexes.get(index, False): + raise ValueError(f"No index named {index} found in Pinecone.") stats = self.pinecone_indexes[index].describe_index_stats() # if no namespace return zero - count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 + count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 return count def delete_documents( self, index: Optional[str] = None, ids: Optional[List[str]] = None, - filters: Optional[Dict] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, ): """ Delete documents from the document store. - :param index: Index name to delete the documents from. If None, the - DocumentStore's default index (self.index) will be used. + :param index: Index name to delete the documents from. If `None`, the DocumentStore's default index + (`self.index`) will be used. :param ids: Optional list of IDs to narrow down the documents to be deleted. - :param filters: Optional filters to narrow down the documents to be deleted (not supported by PineconeDocumentStore). - Example: {"genre": {"$in": ["documentary", "action"]}}, - more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ - :return: None + :param filters: Optional filters to narrow down the documents for which embeddings are to be updated. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. 
+ __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` """ if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") - if filters: - raise NotImplementedError("PineconeDocumentStore does not support filtering during document deletion.") index = index or self.index index = self._sanitize_index_name(index) - if not self.pinecone_indexes.get(index): - self.pinecone_indexes[index] = self._create_index_if_not_exist( - embedding_dim=self.embedding_dim, - index=self.index, - metric_type=self.metric_type, - replicas=self.replicas, - shards=self.shards, - ) - _ = self.pinecone_indexes[index].delete(ids=ids) - # delete from SQL + if index not in self.pinecone_indexes: + raise ValueError(f"No index named {index} found in Pinecone.") + + if ids is None and filters is None: + self.pinecone_indexes[index].delete(delete_all=True) + else: + affected_docs = self.get_all_documents(filters=filters, return_embedding=False) + if ids: + affected_docs = [doc for doc in affected_docs if doc.id in ids] + + doc_ids = [doc.meta.get("vector_id") for doc in affected_docs + if doc.meta and doc.meta.get("vector_id") is not None] + self.pinecone_indexes[index].delete(ids=doc_ids) + super().delete_documents(index=index, ids=ids, filters=filters) def query_by_embedding( self, query_emb: np.ndarray, - filters: Optional[Dict] = None, + filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, @@ -549,34 +573,91 @@ def query_by_embedding( """ Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. - :param query_emb: Embedding of the query (e.g. gathered from DPR) - :param filters: Optional filters to narrow down the search space. - Example: {"genre": {"$in": ["documentary", "action"]}}, - more info on filtering syntax here https://www.pinecone.io/docs/metadata-filtering/ - :param top_k: How many documents to return - :param index: Index name to query the document from. - :param return_embedding: To return document embedding - :return: + :param query_emb: Embedding of the query (e.g. gathered from DPR). + :param filters: Optional filters to narrow down the search space to documents whose metadata fulfill certain + conditions. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. 
+ __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` + :param top_k: How many documents to return. + :param index: The name of the index from which to retrieve documents. + :param return_embedding: Whether to return document embedding. """ if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") + + if return_embedding is None: + return_embedding = self.return_embedding self._limit_check(top_k, include_values=return_embedding) - if filters: + + if filters is not None: filters = LogicalFilterClause.parse(filters).convert_to_pinecone() index = index or self.index index = self._sanitize_index_name(index) - if not self.pinecone_indexes.get(index): + if index not in self.pinecone_indexes.get: raise Exception( - f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running 'update_embeddings()' to create and populate an index." + f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running " + f"'update_embeddings()' to create and populate an index." ) - if return_embedding is None: - return_embedding = self.return_embedding - query_emb = query_emb.reshape(1, -1).astype(np.float32) - if self.similarity == "cosine": self.normalize_embedding(query_emb) @@ -587,7 +668,6 @@ def query_by_embedding( for match in res["results"][0]["matches"]: score_matrix.append(match["score"]) vector_id_matrix.append(match["id"]) - documents = self.get_documents_by_vector_ids(vector_id_matrix, index=index) # assign query score to each document @@ -596,12 +676,23 @@ def query_by_embedding( raw_score = scores_for_vector_ids[doc.id] doc.score = self.finalize_raw_score(raw_score, self.similarity) - if return_embedding is True: - # get embedding from Pinecone response - doc.embedding = self.pinecone_indexes[index].reconstruct(int(doc.id)) + if return_embedding: + # Get embedding from Pinecone response + self._attach_embedding_to_document(document=doc, index=index) return documents + def _attach_embedding_to_document(self, document: Document, index: str): + """ + Fetches the Document's embedding from the specified Pinecone index and attaches it to the Document's + embedding field. + """ + if document.meta and document.meta.get("vector_id", None) is not None: + result = self.pinecone_indexes[index].fetch(ids=[document.id]) + if result["vectors"].get(document.id, False): + embedding = result["vectors"][document.id].get("values", None) + document.embedding = np.asarray(embedding, dtype=np.float32) + def _limit_check(self, top_k: int, include_values: Optional[bool] = None): """ Confirms the top_k value does not exceed Pinecone vector database limits. 
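The filter forms documented for `query_by_embedding` translate to a call like the sketch below. This is illustrative only: the random vector stands in for a real query embedding produced by a retriever, and `document_store` is assumed to be an initialized store with a populated 768-dimensional index:

```python
import numpy as np

query_emb = np.random.rand(768).astype(np.float32)  # stand-in for a real query embedding
docs = document_store.query_by_embedding(
    query_emb=query_emb,
    filters={"type": "article", "rating": {"$gte": 3}},  # default operators resolve to $eq / $in
    top_k=10,
    return_embedding=False,
)
```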
From c9823f216bcd47939329e9d3babf958ce6517379 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Wed, 2 Mar 2022 18:24:41 +0100 Subject: [PATCH 12/58] Fix skipping document store tests --- conftest.py | 3 ++- test/conftest.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index 9b7dbc9b4f..4e36ee9222 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,6 @@ def pytest_addoption(parser): - parser.addoption("--document_store_type", action="store", default="elasticsearch, faiss, memory, milvus, weaviate") + parser.addoption("--document_store_type", action="store", + default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate") def pytest_generate_tests(metafunc): diff --git a/test/conftest.py b/test/conftest.py index cab604b0a5..b0fadbdcfc 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -116,7 +116,7 @@ def pytest_collection_modifyitems(config, items): # if the cli argument "--document_store_type" is used, we want to skip all tests that have markers of other docstores # Example: pytest -v test_document_store.py --document_store_type="memory" => skip all tests marked with "elasticsearch" document_store_types_to_run = config.getoption("--document_store_type") - document_store_types_to_run = document_store_types_to_run.split(",") + document_store_types_to_run = document_store_types_to_run.split(", ") keywords = [] if "milvus1" in document_store_types_to_run and not os.getenv("MILVUS1_ENABLED"): From b1249254ec8b754584fb379fb49cdca6c971244a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 2 Mar 2022 17:29:06 +0000 Subject: [PATCH 13/58] Update Documentation & Code Style --- conftest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index 4e36ee9222..8d673d46d5 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,7 @@ def pytest_addoption(parser): - parser.addoption("--document_store_type", action="store", - default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate") + parser.addoption( + "--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate" + ) def pytest_generate_tests(metafunc): From 8526342520dd55d99c0d08ed0cfbd7a03554ad4b Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Wed, 2 Mar 2022 18:54:09 +0100 Subject: [PATCH 14/58] Fix handling of Milvus1 and Milvus2 in tests --- test/conftest.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index b0fadbdcfc..475a0e0029 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -119,14 +119,6 @@ def pytest_collection_modifyitems(config, items): document_store_types_to_run = document_store_types_to_run.split(", ") keywords = [] - if "milvus1" in document_store_types_to_run and not os.getenv("MILVUS1_ENABLED"): - document_store_types_to_run.remove("milvus1") - document_store_types_to_run.append("milvus") - if not milvus1: - raise Exception( - "Milvus1 is enabled, but your pymilvus version only supports Milvus 2. Please select the correct pymilvus version." - ) - for i in item.keywords: if "-" in i: keywords.extend(i.split("-")) @@ -138,7 +130,16 @@ def pytest_collection_modifyitems(config, items): reason=f'{cur_doc_store} is disabled. 
Enable via pytest --document_store_type="{cur_doc_store}"' ) item.add_marker(skip_docstore) - + elif cur_doc_store == "milvus1" and not milvus1: + skip_milvus1 = pytest.mark.skip( + reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed." + ) + item.add_marker(skip_milvus1) + elif cur_doc_store == "milvus" and milvus1: + skip_milvus = pytest.mark.skip( + reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed." + ) + item.add_marker(skip_milvus) @pytest.fixture(scope="function", autouse=True) def gc_cleanup(request): From ebf53440d7c1f7eff2a5865564826e59e6ac7645 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 2 Mar 2022 17:56:31 +0000 Subject: [PATCH 15/58] Update Documentation & Code Style --- test/conftest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 475a0e0029..cc10eb2f8a 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -136,11 +136,10 @@ def pytest_collection_modifyitems(config, items): ) item.add_marker(skip_milvus1) elif cur_doc_store == "milvus" and milvus1: - skip_milvus = pytest.mark.skip( - reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed." - ) + skip_milvus = pytest.mark.skip(reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed.") item.add_marker(skip_milvus) + @pytest.fixture(scope="function", autouse=True) def gc_cleanup(request): """ From 422f82e5589f32ef07e2ce9ffe5bdd16921cfa09 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Wed, 2 Mar 2022 19:10:51 +0100 Subject: [PATCH 16/58] Fix handling of Milvus1 and Milvus2 in tests --- test/conftest.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 475a0e0029..2d015e9938 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -130,16 +130,17 @@ def pytest_collection_modifyitems(config, items): reason=f'{cur_doc_store} is disabled. Enable via pytest --document_store_type="{cur_doc_store}"' ) item.add_marker(skip_docstore) - elif cur_doc_store == "milvus1" and not milvus1: - skip_milvus1 = pytest.mark.skip( - reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed." - ) - item.add_marker(skip_milvus1) - elif cur_doc_store == "milvus" and milvus1: - skip_milvus = pytest.mark.skip( - reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed." - ) - item.add_marker(skip_milvus) + + if "milvus1" in keywords and not milvus1: + skip_milvus1 = pytest.mark.skip( + reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed." + ) + item.add_marker(skip_milvus1) + elif "milvus" in keywords and milvus1: + skip_milvus = pytest.mark.skip( + reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed." 
+ ) + item.add_marker(skip_milvus) @pytest.fixture(scope="function", autouse=True) def gc_cleanup(request): From 66f7dd03338fc9ed735609e77e6ae9f361f338dd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:13:32 +0000 Subject: [PATCH 17/58] Update Documentation & Code Style --- test/conftest.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 2d015e9938..5c16573597 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -132,16 +132,13 @@ def pytest_collection_modifyitems(config, items): item.add_marker(skip_docstore) if "milvus1" in keywords and not milvus1: - skip_milvus1 = pytest.mark.skip( - reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed." - ) + skip_milvus1 = pytest.mark.skip(reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed.") item.add_marker(skip_milvus1) elif "milvus" in keywords and milvus1: - skip_milvus = pytest.mark.skip( - reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed." - ) + skip_milvus = pytest.mark.skip(reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed.") item.add_marker(skip_milvus) + @pytest.fixture(scope="function", autouse=True) def gc_cleanup(request): """ From 1bcf7597bf1174ac04411629f8b90615ab9815d3 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 3 Mar 2022 10:19:04 +0100 Subject: [PATCH 18/58] Remove SQL from tests requiring embeddings --- test/test_document_store.py | 3 +++ test/test_retriever.py | 2 ++ test/test_standard_pipelines.py | 1 + 3 files changed, 6 insertions(+) diff --git a/test/test_document_store.py b/test/test_document_store.py index fe6892353c..ec27ad04ba 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -450,6 +450,7 @@ def test_write_document_index(document_store): assert len(document_store.get_all_documents()) == 0 +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) def test_document_with_embeddings(document_store): documents = [ {"content": "text1", "id": "1", "embedding": np.random.rand(768).astype(np.float32)}, @@ -471,6 +472,7 @@ def test_document_with_embeddings(document_store): assert isinstance(documents_with_embedding[0].embedding, (list, np.ndarray)) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) def test_update_embeddings(document_store, retriever): documents = [] @@ -582,6 +584,7 @@ def test_update_embeddings(document_store, retriever): assert document_store.get_embedding_count(index="haystack_test_one") == 14 +@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) @pytest.mark.parametrize("retriever", ["table_text_retriever"], indirect=True) @pytest.mark.embedding_dim(512) def test_update_embeddings_table_text_retriever(document_store, retriever): diff --git a/test/test_retriever.py b/test/test_retriever.py index ae44820198..67065a7466 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -150,6 +150,7 @@ def test_elasticsearch_custom_query(): @pytest.mark.slow +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) @pytest.mark.parametrize("retriever", ["dpr"], indirect=True) def test_dpr_embedding(document_store, retriever, docs): 
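Once `pinecone` is part of these parametrize lists, an embedding-dependent test can opt into the Pinecone-backed fixture in the same way. A hypothetical sketch following the suite's conventions (the test itself is illustrative and not part of this patch):

```python
import numpy as np
import pytest

# Hypothetical test; the `document_store` fixture and indirect parametrization mirror the
# existing tests in this suite.
@pytest.mark.parametrize("document_store", ["pinecone"], indirect=True)
def test_write_and_count(document_store):
    document_store.write_documents(
        [{"content": "text1", "id": "1", "embedding": np.random.rand(768).astype(np.float32)}]
    )
    assert document_store.get_document_count() == 1
```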
@@ -178,6 +179,7 @@ def test_dpr_embedding(document_store, retriever, docs): @pytest.mark.slow +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) @pytest.mark.parametrize("retriever", ["retribert"], indirect=True) @pytest.mark.embedding_dim(128) def test_retribert_embedding(document_store, retriever, docs): diff --git a/test/test_standard_pipelines.py b/test/test_standard_pipelines.py index f0fd5e493c..e032e1ac7a 100644 --- a/test/test_standard_pipelines.py +++ b/test/test_standard_pipelines.py @@ -77,6 +77,7 @@ def test_faq_pipeline(retriever, document_store): @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) def test_document_search_pipeline(retriever, document_store): documents = [ {"content": "Sample text for document-1", "meta": {"source": "wiki1"}}, From 34c29ff8d7f1ed141535733ffda2305129b89858 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 3 Mar 2022 09:21:27 +0000 Subject: [PATCH 19/58] Update Documentation & Code Style --- test/test_document_store.py | 8 ++++++-- test/test_retriever.py | 8 ++++++-- test/test_standard_pipelines.py | 4 +++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/test/test_document_store.py b/test/test_document_store.py index ec27ad04ba..b4bfe03ee7 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -450,7 +450,9 @@ def test_write_document_index(document_store): assert len(document_store.get_all_documents()) == 0 -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) +@pytest.mark.parametrize( + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True +) def test_document_with_embeddings(document_store): documents = [ {"content": "text1", "id": "1", "embedding": np.random.rand(768).astype(np.float32)}, @@ -472,7 +474,9 @@ def test_document_with_embeddings(document_store): assert isinstance(documents_with_embedding[0].embedding, (list, np.ndarray)) -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) +@pytest.mark.parametrize( + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True +) @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) def test_update_embeddings(document_store, retriever): documents = [] diff --git a/test/test_retriever.py b/test/test_retriever.py index 67065a7466..a2fb39095f 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -150,7 +150,9 @@ def test_elasticsearch_custom_query(): @pytest.mark.slow -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) +@pytest.mark.parametrize( + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True +) @pytest.mark.parametrize("retriever", ["dpr"], indirect=True) def test_dpr_embedding(document_store, retriever, docs): @@ -179,7 +181,9 @@ def test_dpr_embedding(document_store, retriever, docs): @pytest.mark.slow -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) +@pytest.mark.parametrize( + 
"document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True +) @pytest.mark.parametrize("retriever", ["retribert"], indirect=True) @pytest.mark.embedding_dim(128) def test_retribert_embedding(document_store, retriever, docs): diff --git a/test/test_standard_pipelines.py b/test/test_standard_pipelines.py index e032e1ac7a..299af880d6 100644 --- a/test/test_standard_pipelines.py +++ b/test/test_standard_pipelines.py @@ -77,7 +77,9 @@ def test_faq_pipeline(retriever, document_store): @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True) +@pytest.mark.parametrize( + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True +) def test_document_search_pipeline(retriever, document_store): documents = [ {"content": "Sample text for document-1", "meta": {"source": "wiki1"}}, From 4cc08c4736f4087de846ec1a474f527fb6d5abed Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 3 Mar 2022 11:10:25 +0100 Subject: [PATCH 20/58] Fix get_embedding_count of Milvus2 --- haystack/document_stores/milvus2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/document_stores/milvus2.py b/haystack/document_stores/milvus2.py index c655e503ff..353595d600 100644 --- a/haystack/document_stores/milvus2.py +++ b/haystack/document_stores/milvus2.py @@ -661,4 +661,4 @@ def get_embedding_count(self, index: Optional[str] = None, filters: Optional[Dic """ if filters: raise Exception("filters are not supported for get_embedding_count in MilvusDocumentStore.") - return len(self.get_all_documents()) + return len(self.get_all_documents(index=index)) From 2b91993867f7776dc74957d444d20c4d8a2a4fd9 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 3 Mar 2022 11:45:33 +0100 Subject: [PATCH 21/58] Make sure to start Milvus2 tests with a new collection --- test/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/conftest.py b/test/conftest.py index 5c16573597..805cad89fc 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -562,6 +562,10 @@ def document_store(request, tmp_path): yield document_store document_store.delete_documents() + # Make sure to drop Milvus2 collection, required for tests using different embedding dimensions + if isinstance(document_store, MilvusDocumentStore) and not milvus1: + document_store.collection.drop() + @pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch"]) def document_store_dot_product(request, tmp_path): From e8aa4140fcd00145229897885c77f382711b027c Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Fri, 4 Mar 2022 18:31:38 +0100 Subject: [PATCH 22/58] Add pinecone to test suite --- conftest.py | 3 +- haystack/document_stores/pinecone.py | 68 +++++++++++++++------------- test/conftest.py | 36 ++++++++++++--- test/test_document_store.py | 12 ++--- test/test_retriever.py | 4 +- test/test_standard_pipelines.py | 2 +- 6 files changed, 77 insertions(+), 48 deletions(-) diff --git a/conftest.py b/conftest.py index 8d673d46d5..f9e20a5aed 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,7 @@ def pytest_addoption(parser): parser.addoption( - "--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate" + "--document_store_type", action="store", + default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone" ) diff --git 
a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 4605254d9c..41779b5278 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -129,7 +129,7 @@ def __init__( self.progress_bar = progress_bar super().__init__( - url=sql_url, index=index, duplicate_documents=duplicate_documents # no sanitation for SQL index name + url=sql_url, index=clean_index, duplicate_documents=duplicate_documents ) self._validate_index_sync() @@ -281,10 +281,7 @@ def write_documents( # Metadata fields and embeddings are stored in Pinecone self.pinecone_indexes[index].upsert(vectors=data_to_write_to_pinecone) - docs_to_write_to_sql = [] - for doc in document_objects[i : i + batch_size]: - doc.meta["vector_id"] = doc.id - docs_to_write_to_sql.append(doc) + docs_to_write_to_sql = document_objects[i : i + batch_size] super(PineconeDocumentStore, self).write_documents( docs_to_write_to_sql, index=index, duplicate_documents=duplicate_documents ) @@ -372,12 +369,12 @@ def update_embeddings( embeddings_to_index = np.array(embeddings, dtype="float32") if self.similarity == "cosine": self.normalize_embedding(embeddings_to_index) - embeddings = embeddings.tolist() + embeddings = embeddings_to_index.tolist() metadata = [] ids = [] for doc in document_batch: - metadata.append({key: value for key, value in doc.meta.items() if key != "vector_id"}) + metadata.append(doc.meta) ids.append(doc.id) # update existing vectors in pinecone index self.pinecone_indexes[index].upsert(vectors=zip(ids, embeddings, metadata)) @@ -464,6 +461,7 @@ def get_documents_by_id( self, ids: List[str], index: Optional[str] = None, + return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, ) -> List[Document]: @@ -471,11 +469,14 @@ def get_documents_by_id( if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") + if return_embedding is None: + return_embedding = self.return_embedding + index = index or self.index index = self._sanitize_index_name(index) documents = super().get_documents_by_id(ids=ids, index=index, batch_size=batch_size) - if self.return_embedding: + if return_embedding: for doc in documents: self._attach_embedding_to_document(document=doc, index=index) @@ -502,6 +503,19 @@ def get_embedding_count( count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 return count + def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None): + """ + Update the metadata dictionary of a document by specifying its string id + """ + index = index or self.index + index = self._sanitize_index_name(index) + if index in self.pinecone_indexes: + doc = self.get_documents_by_id(ids=[id], index=index, return_embedding=True)[0] + if doc.embedding is not None: + self.pinecone_indexes[index].upsert(vectors=([id], [doc.embedding.tolist()], [meta])) + + super().update_document_meta(id=id, meta=meta, index=index) + def delete_documents( self, index: Optional[str] = None, @@ -545,19 +559,16 @@ def delete_documents( index = index or self.index index = self._sanitize_index_name(index) - if index not in self.pinecone_indexes: - raise ValueError(f"No index named {index} found in Pinecone.") - - if ids is None and filters is None: - self.pinecone_indexes[index].delete(delete_all=True) - else: - affected_docs = self.get_all_documents(filters=filters, return_embedding=False) - if ids: - affected_docs = [doc for doc in affected_docs if doc.id in ids] + if index in 
self.pinecone_indexes: + if ids is None and filters is None: + self.pinecone_indexes[index].delete(delete_all=True) + else: + affected_docs = self.get_all_documents(filters=filters, return_embedding=False) + if ids: + affected_docs = [doc for doc in affected_docs if doc.id in ids] - doc_ids = [doc.meta.get("vector_id") for doc in affected_docs - if doc.meta and doc.meta.get("vector_id") is not None] - self.pinecone_indexes[index].delete(ids=doc_ids) + doc_ids = [doc.id for doc in affected_docs] + self.pinecone_indexes[index].delete(ids=doc_ids) super().delete_documents(index=index, ids=ids, filters=filters) @@ -651,7 +662,7 @@ def query_by_embedding( index = index or self.index index = self._sanitize_index_name(index) - if index not in self.pinecone_indexes.get: + if index not in self.pinecone_indexes: raise Exception( f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running " f"'update_embeddings()' to create and populate an index." @@ -668,7 +679,7 @@ def query_by_embedding( for match in res["results"][0]["matches"]: score_matrix.append(match["score"]) vector_id_matrix.append(match["id"]) - documents = self.get_documents_by_vector_ids(vector_id_matrix, index=index) + documents = self.get_documents_by_id(vector_id_matrix, index=index, return_embedding=return_embedding) # assign query score to each document scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)} @@ -676,10 +687,6 @@ def query_by_embedding( raw_score = scores_for_vector_ids[doc.id] doc.score = self.finalize_raw_score(raw_score, self.similarity) - if return_embedding: - # Get embedding from Pinecone response - self._attach_embedding_to_document(document=doc, index=index) - return documents def _attach_embedding_to_document(self, document: Document, index: str): @@ -687,11 +694,10 @@ def _attach_embedding_to_document(self, document: Document, index: str): Fetches the Document's embedding from the specified Pinecone index and attaches it to the Document's embedding field. 
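The reworked deletion logic can be summarized with a short sketch, assuming `document_store` is an initialized `PineconeDocumentStore`:

```python
# Documents matching the filter are removed from the Pinecone index and from the SQL store.
document_store.delete_documents(filters={"type": {"$eq": "article"}})

# Without ids or filters, the whole index is cleared via delete_all=True.
document_store.delete_documents()
```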
""" - if document.meta and document.meta.get("vector_id", None) is not None: - result = self.pinecone_indexes[index].fetch(ids=[document.id]) - if result["vectors"].get(document.id, False): - embedding = result["vectors"][document.id].get("values", None) - document.embedding = np.asarray(embedding, dtype=np.float32) + result = self.pinecone_indexes[index].fetch(ids=[document.id]) + if result["vectors"].get(document.id, False): + embedding = result["vectors"][document.id].get("values", None) + document.embedding = np.asarray(embedding, dtype=np.float32) def _limit_check(self, top_k: int, include_values: Optional[bool] = None): """ diff --git a/test/conftest.py b/test/conftest.py index 805cad89fc..77f1bcb460 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -7,6 +7,8 @@ import uuid import logging from pathlib import Path + +import pinecone import responses from sqlalchemy import create_engine, text @@ -29,7 +31,7 @@ import weaviate from haystack.document_stores.weaviate import WeaviateDocumentStore - from haystack.document_stores import MilvusDocumentStore + from haystack.document_stores import MilvusDocumentStore, PineconeDocumentStore from haystack.document_stores.graphdb import GraphDBKnowledgeGraph from haystack.document_stores.faiss import FAISSDocumentStore from haystack.document_stores.sql import SQLDocumentStore @@ -124,7 +126,7 @@ def pytest_collection_modifyitems(config, items): keywords.extend(i.split("-")) else: keywords.append(i) - for cur_doc_store in ["elasticsearch", "faiss", "sql", "memory", "milvus1", "milvus", "weaviate"]: + for cur_doc_store in ["elasticsearch", "faiss", "sql", "memory", "milvus1", "milvus", "weaviate", "pinecone"]: if cur_doc_store in keywords and cur_doc_store not in document_store_types_to_run: skip_docstore = pytest.mark.skip( reason=f'{cur_doc_store} is disabled. 
Enable via pytest --document_store_type="{cur_doc_store}"' @@ -138,6 +140,11 @@ def pytest_collection_modifyitems(config, items): skip_milvus = pytest.mark.skip(reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed.") item.add_marker(skip_milvus) + # Skip PineconeDocumentStore if PINECONE_API_KEY not in environment variables + if "PINECONE_API_KEY" not in os.environ and "pinecone" in keywords: + skip_pinecone = pytest.mark.skip(reason="PINECONE_API_KEY not in environment variables.") + item.add_marker(skip_pinecone) + @pytest.fixture(scope="function", autouse=True) def gc_cleanup(request): @@ -542,7 +549,7 @@ def ensure_ids_are_correct_uuids(docs: list, document_store: object) -> None: d["id"] = str(uuid.uuid4()) -@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"]) +@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"]) def document_store_with_docs(request, test_docs_xs, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768)) document_store = get_document_store( @@ -566,8 +573,13 @@ def document_store(request, tmp_path): if isinstance(document_store, MilvusDocumentStore) and not milvus1: document_store.collection.drop() + # Make sure to delete Pinecone indexes, required for tests using different embedding dimensions + if isinstance(document_store, PineconeDocumentStore): + for index in document_store.pinecone_indexes: + pinecone.delete_index(index) + -@pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch"]) +@pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) def document_store_dot_product(request, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768)) document_store = get_document_store( @@ -580,7 +592,7 @@ def document_store_dot_product(request, tmp_path): document_store.delete_documents() -@pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch"]) +@pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) def document_store_dot_product_with_docs(request, test_docs_xs, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768)) document_store = get_document_store( @@ -594,7 +606,7 @@ def document_store_dot_product_with_docs(request, test_docs_xs, tmp_path): document_store.delete_documents() -@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1"]) +@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "pinecone"]) def document_store_dot_product_small(request, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(3)) document_store = get_document_store( @@ -607,7 +619,7 @@ def document_store_dot_product_small(request, tmp_path): document_store.delete_documents() -@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"]) +@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"]) def document_store_small(request, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(3)) document_store = get_document_store( @@ -741,6 +753,16 @@ def get_document_store( ) document_store.weaviate_client.schema.delete_all() document_store._create_schema_and_index_if_not_exist() + + elif document_store_type == "pinecone": + 
document_store = PineconeDocumentStore( + api_key=os.environ["PINECONE_API_KEY"], + embedding_dim=embedding_dim, + embedding_field=embedding_field, + index=index, + similarity=similarity, + ) + else: raise Exception(f"No document store fixture for '{document_store_type}'") diff --git a/test/test_document_store.py b/test/test_document_store.py index b4bfe03ee7..7a2d005e45 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -125,7 +125,7 @@ def test_write_with_duplicate_doc_ids(document_store): document_store.write_documents(duplicate_documents, duplicate_documents="fail") -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "weaviate"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "weaviate", "pinecone"], indirect=True) def test_write_with_duplicate_doc_ids_custom_index(document_store): duplicate_documents = [ Document(content="Doc1", id_hash_keys=["content"]), @@ -218,7 +218,7 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs) assert len(documents) == 0 -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "sql", "weaviate", "memory"], indirect=True) +@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "sql", "weaviate", "memory", "pinecone"], indirect=True) def test_extended_filter(document_store_with_docs): # Test comparison operators individually documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}}) @@ -733,7 +733,7 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs): # exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) def test_labels(document_store): label = Label( query="question1", @@ -821,7 +821,7 @@ def test_labels(document_store): # exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) def test_multilabel(document_store): labels = [ Label( @@ -937,7 +937,7 @@ def test_multilabel(document_store): # exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) def test_multilabel_no_answer(document_store): labels = [ Label( @@ -1192,7 +1192,7 @@ def test_multilabel_meta_aggregations(document_store): assert multi_label.filters == l.filters -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "milvus1", "weaviate"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "milvus1", "weaviate", "pinecone"], indirect=True) # Currently update_document_meta() is not implemented for Memory doc store def test_update_meta(document_store): documents = [ diff --git a/test/test_retriever.py b/test/test_retriever.py index a2fb39095f..945aca4939 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -151,7 +151,7 @@ def test_elasticsearch_custom_query(): @pytest.mark.slow 
@pytest.mark.parametrize( - "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True ) @pytest.mark.parametrize("retriever", ["dpr"], indirect=True) def test_dpr_embedding(document_store, retriever, docs): @@ -182,7 +182,7 @@ def test_dpr_embedding(document_store, retriever, docs): @pytest.mark.slow @pytest.mark.parametrize( - "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True ) @pytest.mark.parametrize("retriever", ["retribert"], indirect=True) @pytest.mark.embedding_dim(128) diff --git a/test/test_standard_pipelines.py b/test/test_standard_pipelines.py index 299af880d6..4b2ae7418b 100644 --- a/test/test_standard_pipelines.py +++ b/test/test_standard_pipelines.py @@ -78,7 +78,7 @@ def test_faq_pipeline(retriever, document_store): @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) @pytest.mark.parametrize( - "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True ) def test_document_search_pipeline(retriever, document_store): documents = [ From 4f28bb3f8392596f580e579c72c3090094389464 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 6 Mar 2022 11:50:14 +0000 Subject: [PATCH 23/58] Update Documentation & Code Style --- conftest.py | 5 +-- docs/_src/tutorials/tutorials/5.md | 2 +- haystack/document_stores/pinecone.py | 46 +++++++++++++++++----------- test/test_document_store.py | 8 +++-- 4 files changed, 38 insertions(+), 23 deletions(-) diff --git a/conftest.py b/conftest.py index f9e20a5aed..a381d802f8 100644 --- a/conftest.py +++ b/conftest.py @@ -1,7 +1,8 @@ def pytest_addoption(parser): parser.addoption( - "--document_store_type", action="store", - default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone" + "--document_store_type", + action="store", + default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone", ) diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md index e69fb84911..0d1579fb1a 100644 --- a/docs/_src/tutorials/tutorials/5.md +++ b/docs/_src/tutorials/tutorials/5.md @@ -143,7 +143,7 @@ retriever = ElasticsearchRetriever(document_store=document_store) # from haystack.retriever import EmbeddingRetriever, DensePassageRetriever # retriever = EmbeddingRetriever(document_store=document_store, model_format="sentence_transformers", -# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1") +# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1") # retriever = DensePassageRetriever(document_store=document_store, # query_embedding_model="facebook/dpr-question_encoder-single-nq-base", # passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 41779b5278..05371da220 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -79,10 +79,20 @@ def __init__( - `"fail"`: An error is raised if the document ID of the document being added already exists. 
""" # Save init parameters to enable export of component config as YAML - self.set_config(api_key=api_key, environment=environment, sql_url=sql_url, embedding_dim=embedding_dim, - return_embedding=return_embedding, index=index, similarity=similarity, replicas=replicas, - shards=shards, embedding_field=embedding_field, progress_bar=progress_bar, - duplicate_documents=duplicate_documents) + self.set_config( + api_key=api_key, + environment=environment, + sql_url=sql_url, + embedding_dim=embedding_dim, + return_embedding=return_embedding, + index=index, + similarity=similarity, + replicas=replicas, + shards=shards, + embedding_field=embedding_field, + progress_bar=progress_bar, + duplicate_documents=duplicate_documents, + ) # Connect to Pinecone server using python client binding pinecone.init(api_key=api_key, environment=environment) @@ -128,9 +138,7 @@ def __init__( self.progress_bar = progress_bar - super().__init__( - url=sql_url, index=clean_index, duplicate_documents=duplicate_documents - ) + super().__init__(url=sql_url, index=clean_index, duplicate_documents=duplicate_documents) self._validate_index_sync() @@ -244,8 +252,9 @@ def write_documents( index = index or self.index index = self._sanitize_index_name(index) duplicate_documents = duplicate_documents or self.duplicate_documents - assert duplicate_documents in self.duplicate_documents_options, \ - f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" + assert ( + duplicate_documents in self.duplicate_documents_options + ), f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" if index not in self.pinecone_indexes: self.pinecone_indexes[index] = self._create_index_if_not_exist( @@ -267,8 +276,8 @@ def write_documents( total=len(document_objects), disable=not self.progress_bar, position=0, desc="Writing Documents" ) as progress_bar: for i in range(0, len(document_objects), batch_size): - ids = [doc.id for doc in document_objects[i: i + batch_size]] - metadata = [doc.meta for doc in document_objects[i: i + batch_size]] + ids = [doc.id for doc in document_objects[i : i + batch_size]] + metadata = [doc.meta for doc in document_objects[i : i + batch_size]] if add_vectors: embeddings = [doc.embedding for doc in document_objects[i : i + batch_size]] embeddings_to_index = np.array(embeddings, dtype="float32") @@ -341,8 +350,10 @@ def update_embeddings( index = self._sanitize_index_name(index) if index not in self.pinecone_indexes: - raise ValueError(f"Couldn't find a the index '{index}' in Pinecone. Try to init the " - f"PineconeDocumentStore() again ...") + raise ValueError( + f"Couldn't find a the index '{index}' in Pinecone. Try to init the " + f"PineconeDocumentStore() again ..." 
+ ) document_count = self.get_document_count(index=index, filters=filters) if document_count == 0: @@ -394,8 +405,9 @@ def get_all_documents( if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") - result = self.get_all_documents_generator(index=index, filters=filters, return_embedding=return_embedding, - batch_size=batch_size) + result = self.get_all_documents_generator( + index=index, filters=filters, return_embedding=return_embedding, batch_size=batch_size + ) documents = list(result) return documents @@ -483,9 +495,7 @@ def get_documents_by_id( return documents def get_embedding_count( - self, - index: Optional[str] = None, - filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None + self, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None ) -> int: """ Return the count of embeddings in the document store. diff --git a/test/test_document_store.py b/test/test_document_store.py index 7a2d005e45..838a647d70 100644 --- a/test/test_document_store.py +++ b/test/test_document_store.py @@ -125,7 +125,9 @@ def test_write_with_duplicate_doc_ids(document_store): document_store.write_documents(duplicate_documents, duplicate_documents="fail") -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "weaviate", "pinecone"], indirect=True) +@pytest.mark.parametrize( + "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "weaviate", "pinecone"], indirect=True +) def test_write_with_duplicate_doc_ids_custom_index(document_store): duplicate_documents = [ Document(content="Doc1", id_hash_keys=["content"]), @@ -218,7 +220,9 @@ def test_get_all_documents_with_incorrect_filter_value(document_store_with_docs) assert len(documents) == 0 -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch", "sql", "weaviate", "memory", "pinecone"], indirect=True) +@pytest.mark.parametrize( + "document_store_with_docs", ["elasticsearch", "sql", "weaviate", "memory", "pinecone"], indirect=True +) def test_extended_filter(document_store_with_docs): # Test comparison operators individually documents = document_store_with_docs.get_all_documents(filters={"meta_field": {"$eq": "test1"}}) From b44dc33257ea48698032cc2c14f6892ae43bebdf Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Sun, 6 Mar 2022 22:16:21 +0100 Subject: [PATCH 24/58] Fix typing --- haystack/document_stores/filter_utils.py | 14 ++++++++------ haystack/document_stores/pinecone.py | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index 608f69facd..106556dd84 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -427,7 +427,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, boo comp_value_type, comp_value = self._get_weaviate_datatype() return {"path": [self.field_name], "operator": "Equal", comp_value_type: comp_value} - def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, int, float, bool]]: + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[List[str], str, int, float, bool]]]: return {self.field_name: {"$eq": self.comparison_value}} def invert(self) -> "NeOperation": @@ -467,6 +467,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: return filter_dict def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: + assert isinstance(self.comparison_value, list), "'$in' 
operation requires comparison value to be a list." return {self.field_name: {"$in": self.comparison_value}} def invert(self) -> "NinOperation": @@ -496,7 +497,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, int, float, boo comp_value_type, comp_value = self._get_weaviate_datatype() return {"path": [self.field_name], "operator": "NotEqual", comp_value_type: comp_value} - def convert_to_pinecone(self) -> Dict[str, Union[List[str], str, int, float, bool]]: + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[List[str], str, int, float, bool]]]: return {self.field_name: {"$ne": self.comparison_value}} def invert(self) -> "EqOperation": @@ -536,6 +537,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[str, List[Dict]]]: return filter_dict def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: + assert isinstance(self.comparison_value, list), "'$in' operation requires comparison value to be a list." return {self.field_name: {"$nin": self.comparison_value}} def invert(self) -> "InOperation": @@ -567,7 +569,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, list), "Comparison value for '$gt' operation must not be a list." + assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$gt' operation must be a float or int." return {self.field_name: {"$gt": self.comparison_value}} def invert(self) -> "LteOperation": @@ -599,7 +601,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, list), "Comparison value for '$gte' operation must not be a list." + assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$gte' operation must be a float or int." return {self.field_name: {"$gte": self.comparison_value}} def invert(self) -> "LtOperation": @@ -631,7 +633,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, list), "Comparison value for '$lt' operation must not be a list." + assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$lt' operation must be a float or int." return {self.field_name: {"$lt": self.comparison_value}} def invert(self) -> "GteOperation": @@ -663,7 +665,7 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, list), "Comparison value for '$lte' operation must not be a list." + assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$lte' operation must be a float or int." 
return {self.field_name: {"$lte": self.comparison_value}} def invert(self) -> "GtOperation": diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 41779b5278..87a50c8211 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -184,7 +184,7 @@ def _convert_pinecone_result_to_document(self, result: dict, return_embedding: b score = result.get("score", None) embedding = result.get("values") meta = result.get("metadata") - content_type = meta.pop("content_type") if "content_type" in meta else None + content_type = meta.pop("content_type") if isinstance(meta, dict) and "content_type" in meta else None if return_embedding and embedding: embedding = np.asarray(embedding, dtype=np.float32) @@ -461,9 +461,9 @@ def get_documents_by_id( self, ids: List[str], index: Optional[str] = None, - return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None, + return_embedding: Optional[bool] = None, ) -> List[Document]: if headers: From c80246d9f076772fe1c1a2e5242ef4eb5af3bd19 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 6 Mar 2022 21:18:27 +0000 Subject: [PATCH 25/58] Update Documentation & Code Style --- haystack/document_stores/filter_utils.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/haystack/document_stores/filter_utils.py b/haystack/document_stores/filter_utils.py index 106556dd84..9b7bc8c5f7 100644 --- a/haystack/document_stores/filter_utils.py +++ b/haystack/document_stores/filter_utils.py @@ -569,7 +569,9 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "GreaterThan", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$gt' operation must be a float or int." + assert not isinstance( + self.comparison_value, (list, str) + ), "Comparison value for '$gt' operation must be a float or int." return {self.field_name: {"$gt": self.comparison_value}} def invert(self) -> "LteOperation": @@ -601,7 +603,9 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "GreaterThanEqual", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$gte' operation must be a float or int." + assert not isinstance( + self.comparison_value, (list, str) + ), "Comparison value for '$gte' operation must be a float or int." return {self.field_name: {"$gte": self.comparison_value}} def invert(self) -> "LtOperation": @@ -633,7 +637,9 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "LessThan", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$lt' operation must be a float or int." + assert not isinstance( + self.comparison_value, (list, str) + ), "Comparison value for '$lt' operation must be a float or int." 
return {self.field_name: {"$lt": self.comparison_value}} def invert(self) -> "GteOperation": @@ -665,7 +671,9 @@ def convert_to_weaviate(self) -> Dict[str, Union[List[str], str, float, int]]: return {"path": [self.field_name], "operator": "LessThanEqual", comp_value_type: comp_value} def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - assert not isinstance(self.comparison_value, (list, str)), "Comparison value for '$lte' operation must be a float or int." + assert not isinstance( + self.comparison_value, (list, str) + ), "Comparison value for '$lte' operation must be a float or int." return {self.field_name: {"$lte": self.comparison_value}} def invert(self) -> "GtOperation": From e0995ad17ce4ba8505d8d7d74214f24e88202825 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Sun, 6 Mar 2022 22:27:57 +0100 Subject: [PATCH 26/58] Add pinecone to docstores dependendcy --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 220c5af5f4..27b6197b3a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -131,9 +131,9 @@ pinecone = graphdb = SPARQLWrapper docstores = - farm-haystack[faiss,milvus,weaviate,graphdb] + farm-haystack[faiss,milvus,weaviate,graphdb,pinecone] docstores-gpu = - farm-haystack[faiss-gpu,milvus,weaviate,graphdb] + farm-haystack[faiss-gpu,milvus,weaviate,graphdb,pinecone] crawler = selenium webdriver-manager From 71ba09851c822bfa88174a51651ebe657ba42d79 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 10 Mar 2022 17:09:42 +0100 Subject: [PATCH 27/58] Add PineconeDocStore to API Documentation --- docs/_src/api/pydoc/document-store.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_src/api/pydoc/document-store.yml b/docs/_src/api/pydoc/document-store.yml index e70bb65457..47c227848c 100644 --- a/docs/_src/api/pydoc/document-store.yml +++ b/docs/_src/api/pydoc/document-store.yml @@ -1,7 +1,7 @@ loaders: - type: python search_path: [../../../../haystack/document_stores] - modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'utils'] + modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'pinecone' 'utils'] ignore_when_discovered: ['__init__'] processors: - type: filter From 2cc268dbd438a25a28f8b83e9066337316844591 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 10 Mar 2022 17:12:47 +0100 Subject: [PATCH 28/58] Add missing comma --- docs/_src/api/pydoc/document-store.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_src/api/pydoc/document-store.yml b/docs/_src/api/pydoc/document-store.yml index 47c227848c..ae233e1567 100644 --- a/docs/_src/api/pydoc/document-store.yml +++ b/docs/_src/api/pydoc/document-store.yml @@ -1,7 +1,7 @@ loaders: - type: python search_path: [../../../../haystack/document_stores] - modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'pinecone' 'utils'] + modules: ['base', 'elasticsearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'pinecone', 'utils'] ignore_when_discovered: ['__init__'] processors: - type: filter From d635baa0739d0302e7d10f1eb78799ed78bbcfb7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 16:15:10 +0000 Subject: [PATCH 29/58] Update Documentation & Code Style --- 
docs/_src/api/api/document_store.md | 322 ++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 02469ee1a7..be232ac179 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -4072,6 +4072,328 @@ exists. None + + +# Module pinecone + + + +## PineconeDocumentStore + +```python +class PineconeDocumentStore(SQLDocumentStore) +``` + +Document store for very large scale embedding based dense retrievers like the DPR. + +It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) +to perform similarity search on vectors. + +The document text is stored using the SQLDocumentStore, while +the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. + + + +#### \_\_init\_\_ + +```python +def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional["pinecone.Index"] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite") +``` + +**Arguments**: + +- `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). +- `environment`: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are +supported, contact Pinecone if required. +- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale +deployment, Postgres is recommended. +- `pinecone_index`: pinecone-client Index object, an index will be initialized or loaded if not specified. +- `embedding_dim`: The embedding vector size. +- `return_embedding`: Whether to return document embeddings. +- `index`: Name of index in document store to use. +- `similarity`: The similarity function used to compare document vectors. `"dot_product"` is the default +since it is more performant with DPR embeddings. `"cosine"` is recommended if you are using a +Sentence-Transformer model. +In both cases, the returned values in Document.score are normalized to be in range [0,1]: + - For `"dot_product"`: `expit(np.asarray(raw_score / 100))` + - For `"cosine"`: `(raw_score + 1) / 2` +- `replicas`: The number of replicas. Replicas duplicate the index. They provide higher availability and +throughput. +- `shards`: The number of shards to be used in the index. We recommend to use 1 shard per 1GB of data. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicates document based on parameter options.\ +Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. + + + +#### write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Add new documents to the DocumentStore. + +:param documents: List of `Dicts` or List of `Documents`. 
If they already contain the embeddings, we'll index + them right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. + :param index: (SQL) index name for storing the docs and metadata. + :param batch_size: Number of Documents to process at a time. When working with large number of documents, + batching can help reduce memory footprint. + +**Arguments**: + +- `duplicate_documents`: Handle duplicates document based on parameter options.\ + Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. +:raises DuplicateDocumentError: Exception trigger on duplicate document. + + + +#### update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32) +``` + +Updates the embeddings in the the document store using the encoding model specified in the retriever. + +This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the +retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text. +- `index`: Index name for which embeddings are to be updated. If set to `None`, the default `self.index` is +used. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to `False`, +only documents without embeddings are processed. This mode can be used for incremental updating of +embeddings, wherein, only newly indexed documents get processed. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: Number of documents to process at a time. When working with large number of documents, +batching can help reduce memory footprint. + + + +#### get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. 
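As an aside on the streaming behaviour just described, a minimal editorial sketch of consuming the generator (assuming `document_store` is an already initialized `PineconeDocumentStore`; the print call stands in for real per-document processing):

```python
# Stream documents in batches of 100 instead of loading the whole index into memory.
for doc in document_store.get_all_documents_generator(batch_size=100, return_embedding=False):
    print(doc.id, doc.meta)  # placeholder for actual per-document processing
```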
This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +#### update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id + + + +#### delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents from the document store. + +**Arguments**: + +- `index`: Index name to delete the documents from. If `None`, the DocumentStore's default index +(`self.index`) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + + + +#### query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR). +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return. +- `index`: The name of the index from which to retrieve documents. +- `return_embedding`: Whether to return document embedding. + + + +#### load + +```python +@classmethod +def load(cls) +``` + +Default class method used for loading indexes. Not applicable to the PineconeDocumentStore. + # Module utils From 5cc6df600ce4536788711da8a4f03b0d6de08106 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 10 Mar 2022 18:09:11 +0100 Subject: [PATCH 30/58] Adapt format of doc strings --- haystack/document_stores/pinecone.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 27b67ed2a5..74ab4f7fbb 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -71,7 +71,7 @@ def __init__( :param embedding_field: Name of field containing an embedding vector. :param progress_bar: Whether to show a tqdm progress bar or not. 
Can be helpful to disable in production deployments to keep the logs clean. - :param duplicate_documents: Handle duplicates document based on parameter options.\ + :param duplicate_documents: Handle duplicate documents based on parameter options.\ Parameter options: - `"skip"`: Ignore the duplicate documents. @@ -233,17 +233,18 @@ def write_documents( """ Add new documents to the DocumentStore. - :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index - them right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. - :param index: (SQL) index name for storing the docs and metadata. - :param batch_size: Number of Documents to process at a time. When working with large number of documents, - batching can help reduce memory footprint. - :param duplicate_documents: Handle duplicates document based on parameter options.\ + :param documents: List of `Dicts` or list of `Documents`. If they already contain embeddings, we'll index them + right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. + :param index: Index name for storing the docs and metadata. + :param batch_size: Number of documents to process at a time. When working with large number of documents, + batching can help to reduce the memory footprint. + :param duplicate_documents: handle duplicate documents based on parameter options. Parameter options: - `"skip"`: Ignore the duplicate documents. - `"overwrite"`: Update any existing documents with the same ID when adding documents. - `"fail"`: An error is raised if the document ID of the document being added already exists. + :raises DuplicateDocumentError: Exception trigger on duplicate document. """ if headers: From 7bb62b9c4b3936a39946a3912cd03e3aeec6a406 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 10 Mar 2022 17:11:43 +0000 Subject: [PATCH 31/58] Update Documentation & Code Style --- docs/_src/api/api/document_store.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index be232ac179..f1c6faa6bc 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -4123,7 +4123,7 @@ throughput. - `embedding_field`: Name of field containing an embedding vector. - `progress_bar`: Whether to show a tqdm progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. -- `duplicate_documents`: Handle duplicates document based on parameter options.\ +- `duplicate_documents`: Handle duplicate documents based on parameter options.\ Parameter options: - `"skip"`: Ignore the duplicate documents. - `"overwrite"`: Update any existing documents with the same ID when adding documents. @@ -4139,20 +4139,22 @@ def write_documents(documents: Union[List[dict], List[Document]], index: Optiona Add new documents to the DocumentStore. -:param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index - them right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. - :param index: (SQL) index name for storing the docs and metadata. - :param batch_size: Number of Documents to process at a time. When working with large number of documents, - batching can help reduce memory footprint. 
- **Arguments**: -- `duplicate_documents`: Handle duplicates document based on parameter options.\ - Parameter options: - - `"skip"`: Ignore the duplicate documents. - - `"overwrite"`: Update any existing documents with the same ID when adding documents. - - `"fail"`: An error is raised if the document ID of the document being added already exists. -:raises DuplicateDocumentError: Exception trigger on duplicate document. +- `documents`: List of `Dicts` or list of `Documents`. If they already contain embeddings, we'll index them +right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. +- `index`: Index name for storing the docs and metadata. +- `batch_size`: Number of documents to process at a time. When working with large number of documents, +batching can help to reduce the memory footprint. +- `duplicate_documents`: handle duplicate documents based on parameter options. +Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document. From 9df14367e9879bee32393a6363418c2804eb6a48 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Thu, 10 Mar 2022 18:39:56 +0100 Subject: [PATCH 32/58] Set API key as environment variable --- .github/workflows/linux_ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 7a38d4c0d6..8ce54b4c74 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -301,6 +301,8 @@ jobs: pip install ui/ - name: Run tests + env: + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} run: pytest -s ${{ matrix.test-path }} From e2e5da409424fc5bbcb75fdd79511d0342d93427 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 11:08:16 +0100 Subject: [PATCH 33/58] Skip Pinecone tests in forks --- test/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 924054bcec..1b5522594e 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -141,7 +141,7 @@ def pytest_collection_modifyitems(config, items): item.add_marker(skip_milvus) # Skip PineconeDocumentStore if PINECONE_API_KEY not in environment variables - if "PINECONE_API_KEY" not in os.environ and "pinecone" in keywords: + if not os.environ.get("PINECONE_API_KEY", False) and "pinecone" in keywords: skip_pinecone = pytest.mark.skip(reason="PINECONE_API_KEY not in environment variables.") item.add_marker(skip_pinecone) From 97ce7e66d5fa0ac2051ae8022663f77ee21021e6 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 14:34:54 +0100 Subject: [PATCH 34/58] Add sleep after deleting index --- test/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/conftest.py b/test/conftest.py index 1b5522594e..482dc41bd8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -577,6 +577,7 @@ def document_store(request, tmp_path): if isinstance(document_store, PineconeDocumentStore): for index in document_store.pinecone_indexes: pinecone.delete_index(index) + time.sleep(30) @pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) From 9c967f69c44054f43ead2503025b4a19dbb5c00b Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 15:20:32 +0100 Subject: [PATCH 35/58] Add sleep after deleting index 
--- test/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 482dc41bd8..fa6cacdce6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -577,7 +577,7 @@ def document_store(request, tmp_path): if isinstance(document_store, PineconeDocumentStore): for index in document_store.pinecone_indexes: pinecone.delete_index(index) - time.sleep(30) + time.sleep(60) @pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) From 3bb77511332ca05265b2056bee678089753c1b87 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 15:54:58 +0100 Subject: [PATCH 36/58] Add sleep after creating index --- haystack/document_stores/pinecone.py | 3 +++ test/conftest.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 74ab4f7fbb..1a94cf8d9e 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -6,6 +6,7 @@ import logging from typing import Union, List, Optional, Dict, Generator from tqdm.auto import tqdm +import time import pinecone import numpy as np @@ -173,6 +174,8 @@ def _create_index_if_not_exist( name=index, dimension=embedding_dim, metric=metric_type, replicas=replicas, shards=shards ) index_connection = pinecone.Index(index) + # Wait until index has been created + time.sleep(5) # Get index statistics stats = index_connection.describe_index_stats() diff --git a/test/conftest.py b/test/conftest.py index fa6cacdce6..482dc41bd8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -577,7 +577,7 @@ def document_store(request, tmp_path): if isinstance(document_store, PineconeDocumentStore): for index in document_store.pinecone_indexes: pinecone.delete_index(index) - time.sleep(60) + time.sleep(30) @pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) From 58550e98496a746266a4855bf04b34f0cbe251a8 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 16:42:50 +0100 Subject: [PATCH 37/58] Add check if index ready --- haystack/document_stores/pinecone.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 1a94cf8d9e..1a341d3386 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -174,8 +174,10 @@ def _create_index_if_not_exist( name=index, dimension=embedding_dim, metric=metric_type, replicas=replicas, shards=shards ) index_connection = pinecone.Index(index) - # Wait until index has been created - time.sleep(5) + # Wait until index has been created and is ready + index_description = pinecone.describe_index(index) + while not index_description.status["ready"]: + index_description = pinecone.describe_index(index) # Get index statistics stats = index_connection.describe_index_stats() From b0ae486428c66193bc92e3eadee838a94f1c4ae6 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 17:12:42 +0100 Subject: [PATCH 38/58] Remove printing of index stats --- haystack/document_stores/pinecone.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 1a341d3386..33940c078d 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -174,17 +174,7 @@ def _create_index_if_not_exist( name=index, dimension=embedding_dim, 
metric=metric_type, replicas=replicas, shards=shards ) index_connection = pinecone.Index(index) - # Wait until index has been created and is ready - index_description = pinecone.describe_index(index) - while not index_description.status["ready"]: - index_description = pinecone.describe_index(index) - - # Get index statistics - stats = index_connection.describe_index_stats() - dims = stats["dimension"] - count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 - logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") - # return index connection + return index_connection def _convert_pinecone_result_to_document(self, result: dict, return_embedding: bool) -> Document: From 4ade61537ca86516921b261e944f2372b64dd73e Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 17:45:30 +0100 Subject: [PATCH 39/58] Create new index for each pinecone test --- test/conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 482dc41bd8..c35c476fa7 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -563,8 +563,10 @@ def document_store_with_docs(request, test_docs_xs, tmp_path): @pytest.fixture def document_store(request, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768)) + i = 0 + index = f"haystack_test_{i}" if request.param == "pinecone_indexes" else "haystack_test" document_store = get_document_store( - document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path + document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path, index=index ) yield document_store document_store.delete_documents() @@ -575,6 +577,7 @@ def document_store(request, tmp_path): # Make sure to delete Pinecone indexes, required for tests using different embedding dimensions if isinstance(document_store, PineconeDocumentStore): + i += 1 for index in document_store.pinecone_indexes: pinecone.delete_index(index) time.sleep(30) From 971abc57440281dc5f00c11664f813c7085fdccf Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 18:42:42 +0100 Subject: [PATCH 40/58] Use RestAPI instead of Python API for describe_index_stats --- haystack/document_stores/pinecone.py | 16 ++++++++++++++-- test/conftest.py | 5 +---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 33940c078d..dbf4bf4ae9 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -6,7 +6,8 @@ import logging from typing import Union, List, Optional, Dict, Generator from tqdm.auto import tqdm -import time +import requests +import json import pinecone import numpy as np @@ -97,6 +98,7 @@ def __init__( # Connect to Pinecone server using python client binding pinecone.init(api_key=api_key, environment=environment) + self._api_key = api_key # Formal similarity string if similarity in ("dot_product", "cosine"): @@ -175,6 +177,14 @@ def _create_index_if_not_exist( ) index_connection = pinecone.Index(index) + # Get index statistics + stats_endpoint = self.pinecone_indexes[index].configuration.host + "/describe_index_stats" + stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) + stats = json.loads(stats_request.content) + dims = stats["dimension"] + count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 + 
logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") + # return index connection return index_connection def _convert_pinecone_result_to_document(self, result: dict, return_embedding: bool) -> Document: @@ -504,7 +514,9 @@ def get_embedding_count( if not self.pinecone_indexes.get(index, False): raise ValueError(f"No index named {index} found in Pinecone.") - stats = self.pinecone_indexes[index].describe_index_stats() + stats_endpoint = self.pinecone_indexes[index].configuration.host + "/describe_index_stats" + stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) + stats = json.loads(stats_request.content) # if no namespace return zero count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 return count diff --git a/test/conftest.py b/test/conftest.py index c35c476fa7..482dc41bd8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -563,10 +563,8 @@ def document_store_with_docs(request, test_docs_xs, tmp_path): @pytest.fixture def document_store(request, tmp_path): embedding_dim = request.node.get_closest_marker("embedding_dim", pytest.mark.embedding_dim(768)) - i = 0 - index = f"haystack_test_{i}" if request.param == "pinecone_indexes" else "haystack_test" document_store = get_document_store( - document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path, index=index + document_store_type=request.param, embedding_dim=embedding_dim.args[0], tmp_path=tmp_path ) yield document_store document_store.delete_documents() @@ -577,7 +575,6 @@ def document_store(request, tmp_path): # Make sure to delete Pinecone indexes, required for tests using different embedding dimensions if isinstance(document_store, PineconeDocumentStore): - i += 1 for index in document_store.pinecone_indexes: pinecone.delete_index(index) time.sleep(30) From 05e8cfd20fb23af6f651fef2f0db9eb49294144b Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 18:58:10 +0100 Subject: [PATCH 41/58] Fix accessing describe_index_stats --- haystack/document_stores/pinecone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index dbf4bf4ae9..42700cb840 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -178,7 +178,7 @@ def _create_index_if_not_exist( index_connection = pinecone.Index(index) # Get index statistics - stats_endpoint = self.pinecone_indexes[index].configuration.host + "/describe_index_stats" + stats_endpoint = index_connection.configuration.host + "/describe_index_stats" stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) stats = json.loads(stats_request.content) dims = stats["dimension"] From c4b956e2ce220bb908bfd71f5343cb1bb76640d5 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 19:28:53 +0100 Subject: [PATCH 42/58] Remove usages of describe_index_stats --- haystack/document_stores/pinecone.py | 43 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 42700cb840..827043f6f0 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -143,7 +143,7 @@ def __init__( super().__init__(url=sql_url, index=clean_index, duplicate_documents=duplicate_documents) - self._validate_index_sync() + # self._validate_index_sync() def _sanitize_index_name(self, 
index: str) -> str: if "_" in index: @@ -178,12 +178,12 @@ def _create_index_if_not_exist( index_connection = pinecone.Index(index) # Get index statistics - stats_endpoint = index_connection.configuration.host + "/describe_index_stats" - stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) - stats = json.loads(stats_request.content) - dims = stats["dimension"] - count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 - logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") + # stats_endpoint = index_connection.configuration.host + "/describe_index_stats" + # stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) + # stats = json.loads(stats_request.content) + # dims = stats["dimension"] + # count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 + # logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") # return index connection return index_connection @@ -506,20 +506,21 @@ def get_embedding_count( """ Return the count of embeddings in the document store. """ - if filters: - raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") - - index = index or self.index - index = self._sanitize_index_name(index) - if not self.pinecone_indexes.get(index, False): - raise ValueError(f"No index named {index} found in Pinecone.") - - stats_endpoint = self.pinecone_indexes[index].configuration.host + "/describe_index_stats" - stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) - stats = json.loads(stats_request.content) - # if no namespace return zero - count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 - return count + raise NotImplementedError() + # if filters: + # raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") + # + # index = index or self.index + # index = self._sanitize_index_name(index) + # if not self.pinecone_indexes.get(index, False): + # raise ValueError(f"No index named {index} found in Pinecone.") + # + # stats_endpoint = self.pinecone_indexes[index].configuration.host + "/describe_index_stats" + # stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) + # stats = json.loads(stats_request.content) + # # if no namespace return zero + # count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 + # return count def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None): """ From 179547144e8523de2bec96e67392750c914e011d Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 20:00:32 +0100 Subject: [PATCH 43/58] Run pinecone tests separately --- .github/workflows/linux_ci.yml | 30 ++++++++++++++++++++++++++++++ conftest.py | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 8ce54b4c74..3beab04e5e 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -351,3 +351,33 @@ jobs: run: | export MILVUS1_ENABLED=1 pytest -s test/test_document_store.py test/test_eval.py test/test_faiss_and_milvus.py test/test_pipeline.py test/test_retriever.py test/test_standard_pipelines.py --document_store_type="milvus1" + + test-pinecone: + needs: build-cache + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v2 + - run: echo "date=$(date 
+'%Y-%m-%d')" >> $GITHUB_ENV + + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Cache Python + uses: actions/cache@v2 + with: + path: ${{ env.pythonLocation }} + key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }} + + # Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested. + # The cache can last way longer than a specific action's run, so older Haystack version could be carried over. + - name: Reinstall Haystack + run: | + pip install .[test] + + - name: Run tests + env: + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} + run: pytest -s test/test_document_store.py test/test_pipeline.py test/test_retriever.py test/test_standard_pipelines.py test/test_pipeline_extractive_qa.py --document_store_type="pinecone" diff --git a/conftest.py b/conftest.py index a381d802f8..f932de9dd4 100644 --- a/conftest.py +++ b/conftest.py @@ -2,7 +2,7 @@ def pytest_addoption(parser): parser.addoption( "--document_store_type", action="store", - default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone", + default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate", ) From 2c7c3c795e07b36892eee870b4d4efd557191796 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 14 Mar 2022 19:03:03 +0000 Subject: [PATCH 44/58] Update Documentation & Code Style --- conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index f932de9dd4..8d673d46d5 100644 --- a/conftest.py +++ b/conftest.py @@ -1,8 +1,6 @@ def pytest_addoption(parser): parser.addoption( - "--document_store_type", - action="store", - default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate", + "--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate" ) From 50773e481945ff451eba0690ba03b0830a1ac7c9 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 21:56:13 +0100 Subject: [PATCH 45/58] Add pdftotext to pinecone tests --- .github/workflows/linux_ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 3beab04e5e..30008a249e 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -371,6 +371,9 @@ jobs: path: ${{ env.pythonLocation }} key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }} + - name: Install pdftotext + run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin + # Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested. # The cache can last way longer than a specific action's run, so older Haystack version could be carried over. 
- name: Reinstall Haystack From 220f913365afea95db48ffdcab36bac579b732ba Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 22:44:29 +0100 Subject: [PATCH 46/58] Remove sleep from doc store fixture --- .github/workflows/linux_ci.yml | 2 +- test/conftest.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 30008a249e..981747693b 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -383,4 +383,4 @@ jobs: - name: Run tests env: PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} - run: pytest -s test/test_document_store.py test/test_pipeline.py test/test_retriever.py test/test_standard_pipelines.py test/test_pipeline_extractive_qa.py --document_store_type="pinecone" + run: pytest -s test/test_document_store.py test/test_pipeline.py test/test_standard_pipelines.py test/test_pipeline_extractive_qa.py --document_store_type="pinecone" diff --git a/test/conftest.py b/test/conftest.py index 482dc41bd8..1b5522594e 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -577,7 +577,6 @@ def document_store(request, tmp_path): if isinstance(document_store, PineconeDocumentStore): for index in document_store.pinecone_indexes: pinecone.delete_index(index) - time.sleep(30) @pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) From 54f672f441f5697c58387e65bdc073577d86de12 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 14 Mar 2022 22:50:04 +0100 Subject: [PATCH 47/58] Add describe_index_stats --- haystack/document_stores/pinecone.py | 37 ++++++++++++---------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 827043f6f0..372ef7d7a8 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -178,12 +178,10 @@ def _create_index_if_not_exist( index_connection = pinecone.Index(index) # Get index statistics - # stats_endpoint = index_connection.configuration.host + "/describe_index_stats" - # stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) - # stats = json.loads(stats_request.content) - # dims = stats["dimension"] - # count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 - # logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") + stats = index_connection.describe_index_stats() + dims = stats["dimension"] + count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 + logger.info(f"Index statistics: name: {index}, embedding dimensions: {dims}, record count: {count}") # return index connection return index_connection @@ -506,21 +504,18 @@ def get_embedding_count( """ Return the count of embeddings in the document store. 
""" - raise NotImplementedError() - # if filters: - # raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") - # - # index = index or self.index - # index = self._sanitize_index_name(index) - # if not self.pinecone_indexes.get(index, False): - # raise ValueError(f"No index named {index} found in Pinecone.") - # - # stats_endpoint = self.pinecone_indexes[index].configuration.host + "/describe_index_stats" - # stats_request = requests.get(stats_endpoint, headers={"Api-Key": self._api_key}) - # stats = json.loads(stats_request.content) - # # if no namespace return zero - # count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 - # return count + if filters: + raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") + + index = index or self.index + index = self._sanitize_index_name(index) + if not self.pinecone_indexes.get(index, False): + raise ValueError(f"No index named {index} found in Pinecone.") + + stats = self.pinecone_indexes[index].describe_index_stats() + # if no namespace return zero + count = stats["namespaces"][""]["vector_count"] if "" in stats["namespaces"] else 0 + return count def update_document_meta(self, id: str, meta: Dict[str, str], index: str = None): """ From fe5035a1375f14eec8ef6aae5d4c168482f7895a Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Tue, 15 Mar 2022 09:04:00 +0100 Subject: [PATCH 48/58] Remove unused imports --- haystack/document_stores/pinecone.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 372ef7d7a8..db3328ac78 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -6,8 +6,6 @@ import logging from typing import Union, List, Optional, Dict, Generator from tqdm.auto import tqdm -import requests -import json import pinecone import numpy as np From 2f0b383a28cb536a65a8e5d168323da08f86d4d6 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Tue, 15 Mar 2022 21:47:28 +0100 Subject: [PATCH 49/58] Use pull_request_target trigger --- .github/workflows/linux_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index 981747693b..be70ed1169 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -6,7 +6,7 @@ on: # Activate this workflow when the PR is opened and code is added to it # Note: using pull_request instead of push to keep the CI workflows # running on our repo, not the contributor's. See autoformat.yml - pull_request: + pull_request_target: types: - opened - synchronize From 452cc165fcf80b3fb67d781a4ec0c04f5b6c5a1d Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Tue, 15 Mar 2022 21:50:54 +0100 Subject: [PATCH 50/58] Revert use pull_request_target trigger --- .github/workflows/linux_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux_ci.yml b/.github/workflows/linux_ci.yml index be70ed1169..981747693b 100644 --- a/.github/workflows/linux_ci.yml +++ b/.github/workflows/linux_ci.yml @@ -6,7 +6,7 @@ on: # Activate this workflow when the PR is opened and code is added to it # Note: using pull_request instead of push to keep the CI workflows # running on our repo, not the contributor's. 
See autoformat.yml - pull_request_target: + pull_request: types: - opened - synchronize From 8822e9eaa79d2110efb9e3047cfd79257060c0ff Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 21 Mar 2022 10:00:27 +0100 Subject: [PATCH 51/58] Remove set_config --- haystack/document_stores/pinecone.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index db3328ac78..4708ad2482 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -78,21 +78,6 @@ def __init__( - `"overwrite"`: Update any existing documents with the same ID when adding documents. - `"fail"`: An error is raised if the document ID of the document being added already exists. """ - # Save init parameters to enable export of component config as YAML - self.set_config( - api_key=api_key, - environment=environment, - sql_url=sql_url, - embedding_dim=embedding_dim, - return_embedding=return_embedding, - index=index, - similarity=similarity, - replicas=replicas, - shards=shards, - embedding_field=embedding_field, - progress_bar=progress_bar, - duplicate_documents=duplicate_documents, - ) # Connect to Pinecone server using python client binding pinecone.init(api_key=api_key, environment=environment) From 7e9af75e1665681db48ec36b46f9e851a31e89bf Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 21 Mar 2022 11:07:47 +0100 Subject: [PATCH 52/58] Add os to conftest --- test/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/conftest.py b/test/conftest.py index eabbbc8c36..a823bfc275 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -8,6 +8,7 @@ import uuid import logging from pathlib import Path +import os import pinecone import responses From c88a305524951991a7b2b77905d4882254fa32ff Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 21 Mar 2022 11:40:59 +0100 Subject: [PATCH 53/58] Integrate review comments --- haystack/document_stores/pinecone.py | 57 +++++++++------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 4708ad2482..1e3b70b16d 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -14,6 +14,7 @@ from haystack.document_stores.sql import SQLDocumentStore from haystack.document_stores.base import get_batches_from_generator from haystack.document_stores.filter_utils import LogicalFilterClause +from haystack.errors import DocumentStoreError logger = logging.getLogger(__name__) @@ -21,10 +22,13 @@ class PineconeDocumentStore(SQLDocumentStore): """ - Document store for very large scale embedding based dense retrievers like the DPR. + Document store for very large scale embedding based dense retrievers like the DPR. This is a hosted document store, + this means that your vectors will not be stored locally but in the cloud. This means that the similarity + search will be run on the cloud as well. It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) - to perform similarity search on vectors. + to perform similarity search on vectors. In order to use this document store, you need an API key that you can + obtain by creating an account on the [Pinecone website](https://www.pinecone.io). The document text is stored using the SQLDocumentStore, while the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. 
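The hunk above adds the `DocumentStoreError` import from `haystack.errors`; the hunks that follow swap the previous bare `Exception`/`ValueError` raises for it. A hedged sketch of what the typed error enables on the caller side (API key read from the environment, a zero vector used purely as a placeholder query):

```python
import os
import numpy as np
from haystack.document_stores import PineconeDocumentStore
from haystack.errors import DocumentStoreError

document_store = PineconeDocumentStore(api_key=os.environ["PINECONE_API_KEY"])

try:
    # Asking for far more records than the store's documented request limit
    # should now surface as a DocumentStoreError rather than a generic Exception.
    document_store.query_by_embedding(np.zeros(768, dtype=np.float32), top_k=50_000)
except DocumentStoreError as err:
    print(f"Request rejected by PineconeDocumentStore: {err}")
```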
@@ -52,7 +56,7 @@ def __init__( """ :param api_key: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). :param environment: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are - supported, contact Pinecone if required. + supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required. :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale deployment, Postgres is recommended. :param pinecone_index: pinecone-client Index object, an index will be initialized or loaded if not specified. @@ -129,10 +133,7 @@ def __init__( # self._validate_index_sync() def _sanitize_index_name(self, index: str) -> str: - if "_" in index: - return index.replace("_", "-").lower() - else: - return index.lower() + return index.replace("_", "-").lower() def _create_index_if_not_exist( self, @@ -168,40 +169,13 @@ def _create_index_if_not_exist( # return index connection return index_connection - def _convert_pinecone_result_to_document(self, result: dict, return_embedding: bool) -> Document: - """ - Convert Pinecone result dict into haystack document object. - """ - content = "" - - id = result.get("id") - score = result.get("score", None) - embedding = result.get("values") - meta = result.get("metadata") - content_type = meta.pop("content_type") if isinstance(meta, dict) and "content_type" in meta else None - - if return_embedding and embedding: - embedding = np.asarray(embedding, dtype=np.float32) - - document = Document.from_dict( - { - "id": id, - "content": content, - "content_type": content_type, - "meta": meta, - "score": score, - "embedding": embedding, - } - ) - return document - def _validate_index_sync(self): """ This check ensures the correct document database was loaded. If it fails, make sure you provided the same path to the SQL database as when you created the original Pinecone index. """ if not self.get_document_count() == self.get_embedding_count(): - raise ValueError( + raise DocumentStoreError( "The number of documents present in the SQL database does not " "match the number of embeddings in Pinecone. Make sure your Pinecone " "index aligns to the same database that was used when creating the " @@ -230,7 +204,7 @@ def write_documents( - `"skip"`: Ignore the duplicate documents. - `"overwrite"`: Update any existing documents with the same ID when adding documents. - `"fail"`: An error is raised if the document ID of the document being added already exists. - + :param headers: PineconeDocumentStore does not support headers. :raises DuplicateDocumentError: Exception trigger on duplicate document. """ if headers: @@ -296,7 +270,7 @@ def update_embeddings( batch_size: int = 32, ): """ - Updates the embeddings in the the document store using the encoding model specified in the retriever. + Updates the embeddings in the document store using the encoding model specified in the retriever. This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config). @@ -439,6 +413,7 @@ def get_all_documents_generator( ``` :param return_embedding: Whether to return the document embeddings. :param batch_size: When working with large number of documents, batching can help reduce memory footprint. + :param headers: PineconeDocumentStore does not support headers. 
""" if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -550,6 +525,7 @@ def delete_documents( } } ``` + :param headers: PineconeDocumentStore does not support headers. """ if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -645,6 +621,7 @@ def query_by_embedding( :param top_k: How many documents to return. :param index: The name of the index from which to retrieve documents. :param return_embedding: Whether to return document embedding. + :param headers: PineconeDocumentStore does not support headers. """ if headers: raise NotImplementedError("PineconeDocumentStore does not support headers.") @@ -660,7 +637,7 @@ def query_by_embedding( index = self._sanitize_index_name(index) if index not in self.pinecone_indexes: - raise Exception( + raise DocumentStoreError( f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running " f"'update_embeddings()' to create and populate an index." ) @@ -702,13 +679,13 @@ def _limit_check(self, top_k: int, include_values: Optional[bool] = None): """ if include_values: if top_k > self.top_k_limit_vectors: - raise Exception( + raise DocumentStoreError( f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records ", f"when returning embedding values. This request is attempting to return {top_k} records.", ) else: if top_k > self.top_k_limit: - raise Exception( + raise DocumentStoreError( f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. ", f"This request is attempting to return {top_k} records.", ) From 77fb60a84208a60c9df99c7c5b57092f83b605d8 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 21 Mar 2022 11:47:05 +0100 Subject: [PATCH 54/58] Set include_values to False --- haystack/document_stores/pinecone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 1e3b70b16d..dadfd85dfa 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -646,7 +646,7 @@ def query_by_embedding( if self.similarity == "cosine": self.normalize_embedding(query_emb) - res = self.pinecone_indexes[index].query(query_emb.tolist(), top_k=top_k, include_values=True, filter=filters) + res = self.pinecone_indexes[index].query(query_emb.tolist(), top_k=top_k, include_values=False, filter=filters) score_matrix = [] vector_id_matrix = [] From a2eb6cd66eb2b0e8147ee821f4a5f18018c339eb Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 21 Mar 2022 12:18:38 +0100 Subject: [PATCH 55/58] Remove quotation marks from pinecone.Index type --- haystack/document_stores/pinecone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index dadfd85dfa..6545e082b8 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -42,7 +42,7 @@ def __init__( api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", - pinecone_index: Optional["pinecone.Index"] = None, + pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", From 756f2987d5ace94d7163f96ec8f24a0793a083e7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Mar 2022 11:23:50 +0000 Subject: [PATCH 56/58] Update 
Documentation & Code Style --- docs/_src/api/api/document_store.md | 17 ++++++++++++----- docs/_src/api/api/evaluation.md | 5 ++++- docs/_src/api/api/pipelines.md | 5 ++++- haystack/nodes/evaluator/evaluator.py | 6 +++--- haystack/pipelines/base.py | 7 +++++-- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index aa2f411154..c5422c956c 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -4132,10 +4132,13 @@ None class PineconeDocumentStore(SQLDocumentStore) ``` -Document store for very large scale embedding based dense retrievers like the DPR. +Document store for very large scale embedding based dense retrievers like the DPR. This is a hosted document store, +this means that your vectors will not be stored locally but in the cloud. This means that the similarity +search will be run on the cloud as well. It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) -to perform similarity search on vectors. +to perform similarity search on vectors. In order to use this document store, you need an API key that you can +obtain by creating an account on the [Pinecone website](https://www.pinecone.io). The document text is stored using the SQLDocumentStore, while the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. @@ -4145,14 +4148,14 @@ the vector embeddings and metadata (for filtering) are indexed in a Pinecone Ind #### \_\_init\_\_ ```python -def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional["pinecone.Index"] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite") +def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite") ``` **Arguments**: - `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). - `environment`: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are -supported, contact Pinecone if required. +supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required. - `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale deployment, Postgres is recommended. - `pinecone_index`: pinecone-client Index object, an index will be initialized or loaded if not specified. @@ -4199,6 +4202,7 @@ Parameter options: - `"skip"`: Ignore the duplicate documents. - `"overwrite"`: Update any existing documents with the same ID when adding documents. - `"fail"`: An error is raised if the document ID of the document being added already exists. +- `headers`: PineconeDocumentStore does not support headers. 
**Raises**: @@ -4212,7 +4216,7 @@ Parameter options: def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32) ``` -Updates the embeddings in the the document store using the encoding model specified in the retriever. +Updates the embeddings in the document store using the encoding model specified in the retriever. This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config). @@ -4295,6 +4299,7 @@ operation. ``` - `return_embedding`: Whether to return the document embeddings. - `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: PineconeDocumentStore does not support headers. @@ -4355,6 +4360,7 @@ operation. } } ``` +- `headers`: PineconeDocumentStore does not support headers. @@ -4432,6 +4438,7 @@ operation. - `top_k`: How many documents to return. - `index`: The name of the index from which to retrieve documents. - `return_embedding`: Whether to return document embedding. +- `headers`: PineconeDocumentStore does not support headers. diff --git a/docs/_src/api/api/evaluation.md b/docs/_src/api/api/evaluation.md index 2cbf0b4dd9..fed855e80b 100644 --- a/docs/_src/api/api/evaluation.md +++ b/docs/_src/api/api/evaluation.md @@ -123,7 +123,7 @@ Print the evaluation results #### semantic\_answer\_similarity ```python -def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2") -> Tuple[List[float], List[float]] +def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True) -> Tuple[List[float], List[float]] ``` Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1. @@ -137,6 +137,9 @@ Returns per QA pair a) the similarity of the most likely prediction (top 1) to a - `gold_labels`: Labels as list of multiple possible answers per question - `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string pointing to downloadable models. +- `batch_size`: Number of prediction label pairs to encode at once. +- `use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. 
**Returns**: diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 88888ec3f9..914b324c63 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -439,7 +439,7 @@ then be found in the dict returned by this method under the key "_debug" #### eval ```python -def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, add_isolated_node_eval: bool = False) -> EvaluationResult +def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False) -> EvaluationResult ``` Evaluates the pipeline by running the pipeline once per query in debug mode @@ -465,6 +465,9 @@ If you use custom cross encoders please make sure they work with sentence_transf - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" - Large model for German only: "deepset/gbert-large-sts" +- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. +- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. - `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. diff --git a/haystack/nodes/evaluator/evaluator.py b/haystack/nodes/evaluator/evaluator.py index 134c855ed1..6e4ac3184b 100644 --- a/haystack/nodes/evaluator/evaluator.py +++ b/haystack/nodes/evaluator/evaluator.py @@ -394,7 +394,7 @@ def semantic_answer_similarity( gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, - use_gpu: bool = True + use_gpu: bool = True, ) -> Tuple[List[float], List[float]]: """ Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1. 
@@ -416,8 +416,8 @@ def semantic_answer_similarity( cross_encoder_used = False if config.architectures is not None: cross_encoder_used = any(arch.endswith("ForSequenceClassification") for arch in config.architectures) - - device = None if use_gpu else 'cpu' + + device = None if use_gpu else "cpu" # Compute similarities top_1_sas = [] diff --git a/haystack/pipelines/base.py b/haystack/pipelines/base.py index c4251b1657..c01ed27e55 100644 --- a/haystack/pipelines/base.py +++ b/haystack/pipelines/base.py @@ -766,8 +766,11 @@ def eval( gold_labels = df["gold_answers"].values predictions = [[a] for a in df["answer"].values] sas, _ = semantic_answer_similarity( - predictions=predictions, gold_labels=gold_labels, sas_model_name_or_path=sas_model_name_or_path, - batch_size=sas_batch_size, use_gpu=sas_use_gpu + predictions=predictions, + gold_labels=gold_labels, + sas_model_name_or_path=sas_model_name_or_path, + batch_size=sas_batch_size, + use_gpu=sas_use_gpu, ) df["sas"] = sas From d9ef4047400b912ea751701e4ef819cb61019cc2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Mar 2022 11:37:47 +0000 Subject: [PATCH 57/58] Update Documentation & Code Style --- docs/_src/api/api/pipelines.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index 91f401853e..700eca06fd 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -440,7 +440,7 @@ then be found in the dict returned by this method under the key "_debug" ```python @send_event -def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, add_isolated_node_eval: bool = False) -> EvaluationResult +def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False) -> EvaluationResult ``` Evaluates the pipeline by running the pipeline once per query in debug mode @@ -466,6 +466,9 @@ If you use custom cross encoders please make sure they work with sentence_transf - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" - Large model for German only: "deepset/gbert-large-sts" +- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. +- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. - `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. 
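A note on the SAS changes threaded through patches 56 and 57: the new `batch_size` and `use_gpu` arguments of `semantic_answer_similarity()` (exposed on `Pipeline.eval()` as `sas_batch_size` and `sas_use_gpu`) can also be exercised directly. The sketch below is a minimal, self-contained illustration of the signature documented above; the prediction and gold-label strings are toy values chosen for this example, not data from the repository.

```python
# Minimal sketch of semantic_answer_similarity() with the newly documented
# batch_size / use_gpu arguments. Inputs are toy values; the model name is
# the default from the docstring above.
from haystack.nodes.evaluator.evaluator import semantic_answer_similarity

predictions = [["Berlin is the capital of Germany."], ["about 83 million people"]]
gold_labels = [["Berlin"], ["83 million", "roughly 83 million people"]]

top_1_sas, top_k_sas = semantic_answer_similarity(
    predictions=predictions,
    gold_labels=gold_labels,
    sas_model_name_or_path="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    batch_size=32,   # number of prediction/label pairs encoded per forward pass
    use_gpu=False,   # force CPU, e.g. on machines without CUDA
)
print(top_1_sas, top_k_sas)  # two lists of per-question SAS scores
```

When the same computation runs inside `Pipeline.eval()`, these two knobs are passed as `sas_batch_size` and `sas_use_gpu`, as shown in the pipelines.md hunks above.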
From cbd8c16879a8e9d418a87799f35a3ac7c00ffcf0 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Mon, 21 Mar 2022 14:02:07 +0100 Subject: [PATCH 58/58] Fix number of args in error messages --- haystack/document_stores/pinecone.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py index 6545e082b8..a964d57fb8 100644 --- a/haystack/document_stores/pinecone.py +++ b/haystack/document_stores/pinecone.py @@ -680,14 +680,14 @@ def _limit_check(self, top_k: int, include_values: Optional[bool] = None): if include_values: if top_k > self.top_k_limit_vectors: raise DocumentStoreError( - f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records ", - f"when returning embedding values. This request is attempting to return {top_k} records.", + f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records " + f"when returning embedding values. This request is attempting to return {top_k} records." ) else: if top_k > self.top_k_limit: raise DocumentStoreError( - f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. ", - f"This request is attempting to return {top_k} records.", + f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. " + f"This request is attempting to return {top_k} records." ) @classmethod
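To close out the series, here is a hedged, end-to-end usage sketch of the `PineconeDocumentStore` as it stands after patch 58. Only the class, method, and parameter names come from the patches themselves; the API key, example documents, and embedding model are placeholders chosen for illustration, and installation assumes the `pinecone` extra added in this PR.

```python
# Assumed usage sketch, not part of the patch series: index a few documents in
# Pinecone and retrieve them. Requires the `pinecone` extra from setup.cfg and
# a valid Pinecone API key.
from haystack.document_stores import PineconeDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.schema import Document

document_store = PineconeDocumentStore(
    api_key="YOUR_PINECONE_API_KEY",  # placeholder, see https://app.pinecone.io
    environment="us-west1-gcp",
    embedding_dim=768,
    similarity="cosine",
    index="document",
)

document_store.write_documents(
    [
        Document(content="Pinecone is a managed vector database."),
        Document(content="Haystack document stores wrap different vector backends."),
    ]
)

# Example 768-dim sentence-transformers model; any compatible model works.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-mpnet-base-v2",
    model_format="sentence_transformers",
)
document_store.update_embeddings(retriever)

# retrieve() calls query_by_embedding() under the hood, which now raises a
# DocumentStoreError if top_k exceeds the limits enforced by _limit_check()
# (10_000 records, or 1_000 when embedding values are requested).
results = retriever.retrieve(query="What is Pinecone?", top_k=5)
print([doc.content for doc in results])
```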