Skip to content

Commit

Permalink
Make check of document & embedding count optional in FAISS and Pinecone (deepset-ai#2677)
Browse files Browse the repository at this point in the history

* make validation optional & add method call in pinecone init

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and andrch-FS committed Jul 26, 2022
1 parent ea38942 commit 6f9f255
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 3 deletions.
6 changes: 4 additions & 2 deletions docs/_src/api/api/document_store.md
Original file line number Diff line number Diff line change
Expand Up @@ -2473,7 +2473,7 @@ the vector embeddings are indexed in a FAISS Index.
#### FAISSDocumentStore.\_\_init\_\_

```python
def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80)
def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80, validate_index_sync: bool = True)
```

**Arguments**:
Expand Down Expand Up @@ -2523,6 +2523,7 @@ Can be created via calling `save()`
- `n_links`: used only if `faiss_index_factory_str == "HNSW"`
- `ef_search`: used only if `faiss_index_factory_str == "HNSW"`
- `ef_construction`: used only if `faiss_index_factory_str == "HNSW"`
- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time

<a id="faiss.FAISSDocumentStore.write_documents"></a>

Expand Down Expand Up @@ -4672,7 +4673,7 @@ the vector embeddings and metadata (for filtering) are indexed in a Pinecone Ind
#### PineconeDocumentStore.\_\_init\_\_

```python
def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []})
def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, validate_index_sync: bool = True)
```

**Arguments**:
Expand Down Expand Up @@ -4709,6 +4710,7 @@ lost if you choose to recreate the index. Be aware that both the document_index
be recreated.
- `metadata_config`: Which metadata fields should be indexed. Should be in the format
`{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`.
- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time

<a id="pinecone.PineconeDocumentStore.write_documents"></a>

Expand Down
5 changes: 4 additions & 1 deletion haystack/document_stores/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(
n_links: int = 64,
ef_search: int = 20,
ef_construction: int = 80,
validate_index_sync: bool = True,
):
"""
:param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
Expand Down Expand Up @@ -107,6 +108,7 @@ def __init__(
:param n_links: used only if index_factory == "HNSW"
:param ef_search: used only if index_factory == "HNSW"
:param ef_construction: used only if index_factory == "HNSW"
:param validate_index_sync: Whether to check that the document count equals the embedding count at initialization time
"""
# special case if we want to load an existing index from disk
# load init params from disk and run init again
Expand Down Expand Up @@ -162,7 +164,8 @@ def __init__(
url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level
)

self._validate_index_sync()
if validate_index_sync:
self._validate_index_sync()

def _validate_params_load_from_disk(self, sig: Signature, locals: dict):
allowed_params = ["faiss_index_path", "faiss_config_path", "self"]
Expand Down
5 changes: 5 additions & 0 deletions haystack/document_stores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
duplicate_documents: str = "overwrite",
recreate_index: bool = False,
metadata_config: dict = {"indexed": []},
validate_index_sync: bool = True,
):
"""
:param api_key: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)).
Expand Down Expand Up @@ -88,6 +89,7 @@ def __init__(
be recreated.
:param metadata_config: Which metadata fields should be indexed. Should be in the format
`{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`.
:param validate_index_sync: Whether to check that the document count equals the embedding count at initialization time
"""
# Connect to Pinecone server using python client binding
pinecone.init(api_key=api_key, environment=environment)
Expand Down Expand Up @@ -141,6 +143,9 @@ def __init__(
metadata_config=self.metadata_config,
)

if validate_index_sync:
self._validate_index_sync()

def _sanitize_index_name(self, index: str) -> str:
return index.replace("_", "-").lower()

Expand Down
10 changes: 10 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,11 @@
"title": "Ef Construction",
"default": 80,
"type": "integer"
},
"validate_index_sync": {
"title": "Validate Index Sync",
"default": true,
"type": "boolean"
}
},
"additionalProperties": false,
Expand Down Expand Up @@ -1537,6 +1542,11 @@
"indexed": []
},
"type": "object"
},
"validate_index_sync": {
"title": "Validate Index Sync",
"default": true,
"type": "boolean"
}
},
"required": [
Expand Down

0 comments on commit 6f9f255

Please sign in to comment.