Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Milvus integration #771

Merged
merged 16 commits into from
Jan 29, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ jobs:
- name: Run Elasticsearch
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx128m" elasticsearch:7.9.2

- name: Run Milvus
run: docker run -d -p 19530:19530 -p 19121:19121 milvusdb/milvus:0.10.5-cpu-d010621-4eda95

- name: Run Apache Tika
run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1

Expand Down
7 changes: 4 additions & 3 deletions docs/_src/api/api/document_store.md
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ None
#### delete\_all\_documents

```python
| delete_all_documents(index: str, filters: Optional[Dict[str, List[str]]] = None)
| delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete documents in an index. All documents are deleted if no filters are passed.
Expand Down Expand Up @@ -763,7 +763,7 @@ the vector embeddings are indexed in a FAISS Index.
#### \_\_init\_\_

```python
| __init__(sql_url: str = "sqlite:///", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", similarity: str = "dot_product", **kwargs, ,)
| __init__(sql_url: str = "sqlite:///", vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", **kwargs, ,)
```

**Arguments**:
Expand Down Expand Up @@ -796,6 +796,7 @@ added already exists.
- `index`: Name of index in document store to use.
- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
- `embedding_field`: Name of field containing an embedding vector.

<a name="faiss.FAISSDocumentStore.write_documents"></a>
#### write\_documents
Expand Down Expand Up @@ -881,7 +882,7 @@ None
#### delete\_all\_documents

```python
| delete_all_documents(index=None)
| delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
```

Delete all documents from the document store.
Expand Down
2 changes: 1 addition & 1 deletion haystack/document_store/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,6 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
logger.error("File needs to be in json or jsonl format.")

@abstractmethod
def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
pass

3 changes: 2 additions & 1 deletion haystack/document_store/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -757,14 +757,15 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non

bulk(self.client, doc_updates, request_timeout=300, refresh=self.refresh_type)

def delete_all_documents(self, index: str, filters: Optional[Dict[str, List[str]]] = None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete documents in an index. All documents are deleted if no filters are passed.

:param index: Index name to delete the document from.
:param filters: Optional filters to narrow down the documents to be deleted.
:return: None
"""
index = index or self.index
query: Dict[str, Any] = {"query": {}}
if filters:
filter_clause = []
Expand Down
9 changes: 6 additions & 3 deletions haystack/document_store/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
update_existing_documents: bool = False,
index: str = "document",
similarity: str = "dot_product",
embedding_field: str = "embedding",
**kwargs,
):
"""
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(
:param index: Name of index in document store to use.
:param similarity: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
:param embedding_field: Name of field containing an embedding vector.
"""
self.vector_dim = vector_dim

Expand All @@ -83,6 +85,7 @@ def __init__(
self.faiss_index.set_direct_map_type(faiss.DirectMap.Hashtable)

self.return_embedding = return_embedding
self.embedding_field = embedding_field
if similarity == "dot_product":
self.similarity = similarity
else:
Expand Down Expand Up @@ -154,7 +157,7 @@ def write_documents(

def _create_document_field_map(self) -> Dict:
return {
self.index: "embedding",
self.index: self.embedding_field,
}

def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = None, batch_size: int = 10_000):
Expand Down Expand Up @@ -275,13 +278,13 @@ def train_index(self, documents: Optional[Union[List[dict], List[Document]]], em
embeddings = np.array(embeddings, dtype="float32")
self.faiss_index.train(embeddings)

def delete_all_documents(self, index=None):
def delete_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None):
"""
Delete all documents from the document store.
"""
index = index or self.index
self.faiss_index.reset()
super().delete_all_documents(index=index)
super().delete_all_documents(index=index, filters=filters)

def query_by_embedding(self,
query_emb: np.array,
Expand Down
Loading