
Add PineconeDocumentStore #2254

Merged · Mar 21, 2022 · 80 commits
827d6f8
added core install and functionality of pinecone doc store (init, ups…
jamescalam Feb 5, 2022
d9702f8
Merge branch 'master' of github.com:deepset-ai/haystack into pinecone…
jamescalam Feb 13, 2022
c4d21a9
implemented core functionality of Pinecone doc store
jamescalam Feb 13, 2022
2699808
Merge branch 'deepset-ai:master' into pinecone-doc-store
jamescalam Feb 14, 2022
925630f
merge with upstream master
jamescalam Feb 24, 2022
4eff62b
Update Documentation & Code Style
github-actions[bot] Feb 24, 2022
15debe7
updated filtering to use Haystack filtering and reduced default batch…
jamescalam Feb 26, 2022
42c9510
added haystack filter conversion and reduced default batch_size
jamescalam Feb 26, 2022
9657c48
Update Documentation & Code Style
github-actions[bot] Feb 26, 2022
50202bb
merge pinecone-doc-store with upstream changes
jamescalam Feb 27, 2022
92e4a0c
Merge branch 'pinecone-doc-store' of github.com:pinecone-io/haystack …
jamescalam Feb 27, 2022
5e06721
removed debugging code
jamescalam Feb 27, 2022
a7ed9ba
updated Pinecone filtering to use filter_utils
jamescalam Mar 2, 2022
dc2b80f
removed uneeded methods and minor tweaks to current methods
jamescalam Mar 2, 2022
227da6d
fixed typing issues
jamescalam Mar 2, 2022
384391b
Update Documentation & Code Style
github-actions[bot] Mar 2, 2022
bd9c355
Allow filters in al methods except get_embedding_count
bogdankostic Mar 2, 2022
c9823f2
Fix skipping document store tests
bogdankostic Mar 2, 2022
b124925
Update Documentation & Code Style
github-actions[bot] Mar 2, 2022
8526342
Fix handling of Milvus1 and Milvus2 in tests
bogdankostic Mar 2, 2022
632832f
Merge remote-tracking branch 'origin/fix_docstore_tests' into fix_doc…
bogdankostic Mar 2, 2022
ebf5344
Update Documentation & Code Style
github-actions[bot] Mar 2, 2022
422f82e
Fix handling of Milvus1 and Milvus2 in tests
bogdankostic Mar 2, 2022
b63f1b1
Merge remote-tracking branch 'origin/fix_docstore_tests' into fix_doc…
bogdankostic Mar 2, 2022
66f7dd0
Update Documentation & Code Style
github-actions[bot] Mar 2, 2022
1bcf759
Remove SQL from tests requiring embeddings
bogdankostic Mar 3, 2022
2da29ae
Merge remote-tracking branch 'origin/fix_docstore_tests' into fix_doc…
bogdankostic Mar 3, 2022
34c29ff
Update Documentation & Code Style
github-actions[bot] Mar 3, 2022
4cc08c4
Fix get_embedding_count of Milvus2
bogdankostic Mar 3, 2022
2b91993
Make sure to start Milvus2 tests with a new collection
bogdankostic Mar 3, 2022
e8eafa0
Merge branch 'fix_docstore_tests' into pr/2254
bogdankostic Mar 3, 2022
e8aa414
Add pinecone to test suite
bogdankostic Mar 4, 2022
581f298
Merge remote-tracking branch 'origin/master' into pr/2254
bogdankostic Mar 4, 2022
4f28bb3
Update Documentation & Code Style
github-actions[bot] Mar 6, 2022
b44dc33
Fix typing
bogdankostic Mar 6, 2022
9948b30
Merge remote-tracking branch 'github-desktop-pinecone-io/pinecone-doc…
bogdankostic Mar 6, 2022
c80246d
Update Documentation & Code Style
github-actions[bot] Mar 6, 2022
e0995ad
Add pinecone to docstores dependendcy
bogdankostic Mar 6, 2022
8966922
Merge remote-tracking branch 'origin/master' into pr/2254
bogdankostic Mar 7, 2022
8efa2b4
Merge remote-tracking branch 'origin/master' into pr/2254
bogdankostic Mar 10, 2022
71ba098
Add PineconeDocStore to API Documentation
bogdankostic Mar 10, 2022
2cc268d
Add missing comma
bogdankostic Mar 10, 2022
d635baa
Update Documentation & Code Style
github-actions[bot] Mar 10, 2022
5cc6df6
Adapt format of doc strings
bogdankostic Mar 10, 2022
476dad2
Merge remote-tracking branch 'github-desktop-pinecone-io/pinecone-doc…
bogdankostic Mar 10, 2022
7bb62b9
Update Documentation & Code Style
github-actions[bot] Mar 10, 2022
9df1436
Set API key as environment variable
bogdankostic Mar 10, 2022
e0ec8b6
Merge remote-tracking branch 'github-desktop-pinecone-io/pinecone-doc…
bogdankostic Mar 10, 2022
e2e5da4
Skip Pinecone tests in forks
bogdankostic Mar 14, 2022
97ce7e6
Add sleep after deleting index
bogdankostic Mar 14, 2022
9c967f6
Add sleep after deleting index
bogdankostic Mar 14, 2022
3bb7751
Add sleep after creating index
bogdankostic Mar 14, 2022
58550e9
Add check if index ready
bogdankostic Mar 14, 2022
b0ae486
Remove printing of index stats
bogdankostic Mar 14, 2022
4ade615
Create new index for each pinecone test
bogdankostic Mar 14, 2022
971abc5
Use RestAPI instead of Python API for describe_index_stats
bogdankostic Mar 14, 2022
05e8cfd
Fix accessing describe_index_stats
bogdankostic Mar 14, 2022
c4b956e
Remove usages of describe_index_stats
bogdankostic Mar 14, 2022
1795471
Run pinecone tests separately
bogdankostic Mar 14, 2022
2c7c3c7
Update Documentation & Code Style
github-actions[bot] Mar 14, 2022
50773e4
Add pdftotext to pinecone tests
bogdankostic Mar 14, 2022
fce96e2
Merge remote-tracking branch 'origin/pinecone_test' into pinecone_test
bogdankostic Mar 14, 2022
220f913
Remove sleep from doc store fixture
bogdankostic Mar 14, 2022
54f672f
Add describe_index_stats
bogdankostic Mar 14, 2022
fe5035a
Remove unused imports
bogdankostic Mar 15, 2022
2f0b383
Use pull_request_target trigger
bogdankostic Mar 15, 2022
452cc16
Revert use pull_request_target trigger
bogdankostic Mar 15, 2022
d6afb43
Merge remote-tracking branch 'origin/master' into pr/2254
bogdankostic Mar 15, 2022
a904ff2
Merge remote-tracking branch 'origin/master' into pr/2254
bogdankostic Mar 21, 2022
8822e9e
Remove set_config
bogdankostic Mar 21, 2022
7e9af75
Add os to conftest
bogdankostic Mar 21, 2022
c88a305
Integrate review comments
bogdankostic Mar 21, 2022
77fb60a
Set include_values to False
bogdankostic Mar 21, 2022
a2eb6cd
Remove quotation marks from pinecone.Index type
bogdankostic Mar 21, 2022
756f298
Update Documentation & Code Style
github-actions[bot] Mar 21, 2022
2c6ff82
Merge branch 'master' into pinecone-doc-store
bogdankostic Mar 21, 2022
e95c556
Merge remote-tracking branch 'origin/master' into pr/2254
bogdankostic Mar 21, 2022
8bdb974
Merge remote-tracking branch 'github-desktop-pinecone-io/pinecone-doc…
bogdankostic Mar 21, 2022
d9ef404
Update Documentation & Code Style
github-actions[bot] Mar 21, 2022
cbd8c16
Fix number of args in error messages
bogdankostic Mar 21, 2022
35 changes: 35 additions & 0 deletions .github/workflows/linux_ci.yml
@@ -298,6 +298,8 @@ jobs:
pip install ui/

- name: Run tests
env:
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
run: pytest -s ${{ matrix.test-path }}


@@ -346,3 +348,36 @@ jobs:
run: |
export MILVUS1_ENABLED=1
pytest -s test/test_document_store.py test/test_eval.py test/test_faiss_and_milvus.py test/test_pipeline.py test/test_retriever.py test/test_standard_pipelines.py --document_store_type="milvus1"

test-pinecone:
needs: build-cache
runs-on: ubuntu-20.04

steps:
- uses: actions/checkout@v2
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV

- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7

- name: Cache Python
uses: actions/cache@v2
with:
path: ${{ env.pythonLocation }}
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}

- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz && tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin

# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
# The cache can last way longer than a specific action's run, so older Haystack version could be carried over.
- name: Reinstall Haystack
run: |
pip install .[test]

- name: Run tests
env:
PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
run: pytest -s test/test_document_store.py test/test_pipeline.py test/test_standard_pipelines.py test/test_pipeline_extractive_qa.py --document_store_type="pinecone"
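The job injects the API key into the test run via the `PINECONE_API_KEY` environment variable. Inside the test suite it would be read along these lines (an illustrative sketch only — the helper name is hypothetical, and the real lookup lives in `conftest.py`):

```python
import os


def get_pinecone_api_key(environ=os.environ) -> str:
    """Read the key the CI job injects; fail loudly if it is missing."""
    key = environ.get("PINECONE_API_KEY")
    if key is None:
        raise RuntimeError("Set PINECONE_API_KEY to run the Pinecone tests")
    return key


print(get_pinecone_api_key({"PINECONE_API_KEY": "dummy-key"}))  # dummy-key
```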
331 changes: 331 additions & 0 deletions docs/_src/api/api/document_store.md
@@ -4120,6 +4120,337 @@ exists.

None

<a id="pinecone"></a>

# Module pinecone

<a id="pinecone.PineconeDocumentStore"></a>

## PineconeDocumentStore

```python
class PineconeDocumentStore(SQLDocumentStore)
```

Document store for very large-scale, embedding-based dense retrievers such as DPR. This is a hosted document
store: your vectors are not stored locally but in the cloud, and similarity search runs in the cloud as well.

It uses the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io))
to perform similarity search on vectors. To use this document store, you need an API key, which you can
obtain by creating an account on the [Pinecone website](https://www.pinecone.io).

The document text is stored using the SQLDocumentStore, while
the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index.

<a id="pinecone.PineconeDocumentStore.__init__"></a>

#### \_\_init\_\_

```python
def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite")
```

**Arguments**:

- `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)).
- `environment`: Pinecone cloud environment; defaults to `"us-west1-gcp"`. Other GCP and AWS regions are
supported; contact Pinecone [here](https://www.pinecone.io/contact/) if required.
- `sql_url`: SQL connection URL for the database. Defaults to a local file-based SQLite database. For
large-scale deployments, Postgres is recommended.
- `pinecone_index`: pinecone-client `Index` object; if not specified, an index is initialized or loaded.
- `embedding_dim`: The embedding vector size.
- `return_embedding`: Whether to return document embeddings.
- `index`: Name of index in document store to use.
- `similarity`: The similarity function used to compare document vectors. `"cosine"` is the default and
is recommended if you are using a Sentence-Transformer model. `"dot_product"` is more performant
with DPR embeddings.
In both cases, the returned values in Document.score are normalized to be in range [0,1]:
- For `"dot_product"`: `expit(np.asarray(raw_score / 100))`
- For `"cosine"`: `(raw_score + 1) / 2`
- `replicas`: The number of replicas. Replicas duplicate the index. They provide higher availability and
throughput.
- `shards`: The number of shards to be used in the index. We recommend using one shard per 1 GB of data.
- `embedding_field`: Name of field containing an embedding vector.
- `progress_bar`: Whether to show a tqdm progress bar or not.
Can be helpful to disable in production deployments to keep the logs clean.
- `duplicate_documents`: How to handle duplicate documents.
Parameter options:
- `"skip"`: Ignore the duplicate documents.
- `"overwrite"`: Update any existing documents with the same ID when adding documents.
- `"fail"`: An error is raised if the document ID of the document being added already exists.
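The two normalization formulas for `Document.score` quoted above can be checked in isolation (a standalone sketch; `expit` here reimplements the logistic sigmoid from `scipy.special`):

```python
import numpy as np


def expit(x):
    """Logistic sigmoid, equivalent to scipy.special.expit."""
    return 1.0 / (1.0 + np.exp(-np.asarray(x, dtype=float)))


def normalize_score(raw_score: float, similarity: str) -> float:
    """Map a raw Pinecone score into [0, 1] using the formulas above."""
    if similarity == "dot_product":
        return float(expit(np.asarray(raw_score / 100)))
    if similarity == "cosine":
        return (raw_score + 1) / 2
    raise ValueError(f"Unsupported similarity: {similarity}")


print(normalize_score(1.0, "cosine"))       # 1.0
print(normalize_score(-1.0, "cosine"))      # 0.0
print(normalize_score(0.0, "dot_product"))  # 0.5
```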

<a id="pinecone.PineconeDocumentStore.write_documents"></a>

#### write\_documents

```python
def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
```

Add new documents to the DocumentStore.

**Arguments**:

- `documents`: List of `Dicts` or list of `Documents`. If they already contain embeddings, we'll index them
right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them.
- `index`: Index name for storing the docs and metadata.
- `batch_size`: Number of documents to process at a time. When working with a large number of documents,
batching can help reduce the memory footprint.
- `duplicate_documents`: How to handle duplicate documents.
Parameter options:
- `"skip"`: Ignore the duplicate documents.
- `"overwrite"`: Update any existing documents with the same ID when adding documents.
- `"fail"`: An error is raised if the document ID of the document being added already exists.
- `headers`: PineconeDocumentStore does not support headers.

**Raises**:

- `DuplicateDocumentError`: Exception raised when a duplicate document is added.
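The three `duplicate_documents` modes can be illustrated with a plain-Python sketch of the semantics (an illustration only, not the actual implementation):

```python
def apply_duplicate_policy(store: dict, documents: list, policy: str) -> dict:
    """Apply 'skip' / 'overwrite' / 'fail' semantics to docs keyed by 'id'."""
    for doc in documents:
        doc_id = doc["id"]
        if doc_id in store:
            if policy == "skip":
                continue  # ignore the duplicate
            if policy == "fail":
                raise ValueError(f"Duplicate document id: {doc_id}")
            # 'overwrite': fall through and replace the stored document
        store[doc_id] = doc
    return store


store = {"a": {"id": "a", "content": "old"}}
apply_duplicate_policy(store, [{"id": "a", "content": "new"}], "skip")
print(store["a"]["content"])  # old
apply_duplicate_policy(store, [{"id": "a", "content": "new"}], "overwrite")
print(store["a"]["content"])  # new
```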

<a id="pinecone.PineconeDocumentStore.update_embeddings"></a>

#### update\_embeddings

```python
def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32)
```

Updates the embeddings in the document store using the encoding model specified in the retriever.

This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the
retriever config).

**Arguments**:

- `retriever`: Retriever to use to get embeddings for text.
- `index`: Index name for which embeddings are to be updated. If set to `None`, the default `self.index` is
used.
- `update_existing_embeddings`: Whether to update the existing embeddings of the documents. If set to `False`,
only documents without embeddings are processed. This mode can be used for incremental updating of
embeddings, wherein only newly indexed documents are processed.
- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
- `batch_size`: Number of documents to process at a time. When working with a large number of documents,
batching can help reduce the memory footprint.
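The comparison-operator defaulting described above (`"$eq"` for single values, `"$in"` for lists) can be sketched as a small normalizer. This illustrates the semantics only; it is not Haystack's actual `filter_utils` code:

```python
LOGICAL_OPS = {"$and", "$or", "$not"}


def normalize_filters(filters: dict) -> dict:
    """Make the implicit default comparison operators explicit."""
    normalized = {}
    for key, value in filters.items():
        if key in LOGICAL_OPS:
            # Logical operators take a dict (or a list of dicts) of clauses
            if isinstance(value, list):
                normalized[key] = [normalize_filters(v) for v in value]
            else:
                normalized[key] = normalize_filters(value)
        elif isinstance(value, dict):
            # Field already carries explicit operators, e.g. {"$gte": 3}
            normalized[key] = value
        elif isinstance(value, list):
            normalized[key] = {"$in": value}  # default for list values
        else:
            normalized[key] = {"$eq": value}  # default for single values
    return normalized


print(normalize_filters({"type": "article", "genre": ["economy", "politics"]}))
```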

<a id="pinecone.PineconeDocumentStore.get_all_documents_generator"></a>

#### get\_all\_documents\_generator

```python
def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
```

Get all documents from the document store. Under the hood, documents are fetched in batches from the
document store and yielded as individual documents. This method can be used to iteratively process
a large number of documents without having to load all documents in memory.

**Arguments**:

- `index`: Name of the index to get the documents from. If None, the
DocumentStore's default index (self.index) will be used.
- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
- `return_embedding`: Whether to return the document embeddings.
- `batch_size`: When working with a large number of documents, batching can help reduce the memory footprint.
- `headers`: PineconeDocumentStore does not support headers.
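The fetch-in-batches, yield-one-at-a-time pattern this method uses can be sketched generically (`fetch_batch` is a simplified stand-in for the real document store call):

```python
from typing import Generator, List


def fetch_batch(all_docs: List[dict], offset: int, batch_size: int) -> List[dict]:
    """Stand-in for one batched fetch from the document store."""
    return all_docs[offset : offset + batch_size]


def get_all_documents_generator(all_docs, batch_size: int = 32) -> Generator[dict, None, None]:
    offset = 0
    while True:
        batch = fetch_batch(all_docs, offset, batch_size)
        if not batch:
            return
        for doc in batch:  # yield individually: only one batch held in memory
            yield doc
        offset += batch_size


docs = [{"id": i} for i in range(100)]
print(sum(1 for _ in get_all_documents_generator(docs, batch_size=32)))  # 100
```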

<a id="pinecone.PineconeDocumentStore.get_embedding_count"></a>

#### get\_embedding\_count

```python
def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None) -> int
```

Return the count of embeddings in the document store.

<a id="pinecone.PineconeDocumentStore.update_document_meta"></a>

#### update\_document\_meta

```python
def update_document_meta(id: str, meta: Dict[str, str], index: str = None)
```

Update the metadata dictionary of a document by specifying its string ID.

<a id="pinecone.PineconeDocumentStore.delete_documents"></a>

#### delete\_documents

```python
def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None)
```

Delete documents from the document store.

**Arguments**:

- `index`: Index name to delete the documents from. If `None`, the DocumentStore's default index
(`self.index`) will be used.
- `ids`: Optional list of IDs to narrow down the documents to be deleted.
- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
```
- `headers`: PineconeDocumentStore does not support headers.
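A sketch of how `ids` and `filters` narrow the deletion set, assuming they combine as an intersection (the filter check below is a simplified equality-only version for illustration):

```python
def select_for_deletion(docs, ids=None, filters=None):
    """Return the ids of documents to delete; ids and filters narrow jointly."""
    selected = []
    for doc in docs:
        if ids is not None and doc["id"] not in ids:
            continue
        if filters is not None:
            meta = doc.get("meta", {})
            # Simplified: plain equality on each metadata field
            if not all(meta.get(k) == v for k, v in filters.items()):
                continue
        selected.append(doc["id"])
    return selected


docs = [
    {"id": "1", "meta": {"type": "article"}},
    {"id": "2", "meta": {"type": "blog"}},
    {"id": "3", "meta": {"type": "article"}},
]
print(select_for_deletion(docs, ids=["1", "2"], filters={"type": "article"}))  # ['1']
```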

<a id="pinecone.PineconeDocumentStore.query_by_embedding"></a>

#### query\_by\_embedding

```python
def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None) -> List[Document]
```

Find the documents that are most similar to the provided `query_emb` by using a vector similarity metric.

**Arguments**:

- `query_emb`: Embedding of the query (e.g. gathered from DPR).
- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain
conditions.
Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
Logical operator keys take a dictionary of metadata field names and/or logical operators as
value. Metadata field names take a dictionary of comparison operators as value. Comparison
operator keys take a single value or (in case of `"$in"`) a list of values as value.
If no logical operator is provided, `"$and"` is used as default operation. If no comparison
operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
operation.
__Example__:
```python
filters = {
"$and": {
"type": {"$eq": "article"},
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": {"$in": ["economy", "politics"]},
"publisher": {"$eq": "nytimes"}
}
}
}
# or simpler using default operators
filters = {
"type": "article",
"date": {"$gte": "2015-01-01", "$lt": "2021-01-01"},
"rating": {"$gte": 3},
"$or": {
"genre": ["economy", "politics"],
"publisher": "nytimes"
}
}
```
To use the same logical operator multiple times on the same level, logical operators take
optionally a list of dictionaries as value.
__Example__:
```python
filters = {
"$or": [
{
"$and": {
"Type": "News Paper",
"Date": {
"$lt": "2019-01-01"
}
}
},
{
"$and": {
"Type": "Blog Post",
"Date": {
"$gte": "2019-01-01"
}
}
}
]
}
```
- `top_k`: How many documents to return.
- `index`: The name of the index from which to retrieve documents.
- `return_embedding`: Whether to return the document embeddings.
- `headers`: PineconeDocumentStore does not support headers.
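Conceptually, the vector search behind `query_by_embedding` scores every stored vector against the query and keeps the top `top_k`. A numpy sketch of that idea (Pinecone does this server-side with an approximate index, not a brute-force scan):

```python
import numpy as np


def cosine_top_k(query_emb: np.ndarray, doc_embs: np.ndarray, top_k: int = 10):
    """Return (indices, normalized scores) of the top_k most similar docs."""
    q = query_emb / np.linalg.norm(query_emb)
    d = doc_embs / np.linalg.norm(doc_embs, axis=1, keepdims=True)
    raw = d @ q                       # cosine similarity in [-1, 1]
    idx = np.argsort(-raw)[:top_k]
    return idx, (raw[idx] + 1) / 2    # normalized to [0, 1] as described above


rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 8))
query = docs[42] + 0.01 * rng.normal(size=8)  # a query very close to doc 42
idx, scores = cosine_top_k(query, docs, top_k=3)
print(int(idx[0]))  # 42
```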

<a id="pinecone.PineconeDocumentStore.load"></a>

#### load

```python
@classmethod
def load(cls)
```

Default class method used for loading indexes. Not applicable to the PineconeDocumentStore.

<a id="utils"></a>

# Module utils