Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add versioning docs #495

Merged
merged 60 commits into from
Oct 19, 2020
Merged
Changes from 1 commit
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
a516733
add time and perf benchmark for es
brandenchan Aug 27, 2020
36fed3b
Add retriever benchmarking
Aug 31, 2020
a7cab15
Add Reader benchmarking
Aug 31, 2020
62a571f
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Sep 4, 2020
7b6e237
add nq to squad conversion
brandenchan Sep 9, 2020
12f10e3
add conversion stats
brandenchan Sep 15, 2020
11a013b
clean benchmarks
brandenchan Sep 16, 2020
b224812
Add link to dataset
brandenchan Sep 16, 2020
aefe431
Merge branch 'master' into benchmarks
brandenchan Sep 18, 2020
a4f7ca2
Update imports
brandenchan Sep 18, 2020
baeae8c
Merge branch 'master' into benchmarks
brandenchan Sep 21, 2020
f7ca141
add first support for neg psgs
brandenchan Sep 21, 2020
3663871
Refactor test
brandenchan Sep 23, 2020
e48b314
set max_seq_len
brandenchan Sep 24, 2020
0ca92e5
cleanup benchmark
brandenchan Sep 24, 2020
abdbed1
Merge branch 'master' into benchmarks
brandenchan Sep 24, 2020
7e31588
begin retriever speed benchmarking
brandenchan Sep 24, 2020
768ad2b
Merge branch 'master' into benchmarks
brandenchan Sep 28, 2020
7611a31
Add support for retriever query index benchmarking
brandenchan Sep 28, 2020
22543be
improve reader eval, retriever speed benchmarking
brandenchan Sep 29, 2020
8f2c3a6
improve retriever speed benchmarking
brandenchan Oct 1, 2020
8b45c3e
Add retriever accuracy benchmark
brandenchan Oct 1, 2020
c238b4f
merge latest master
tholor Oct 2, 2020
03b80f6
Add neg doc shuffling
brandenchan Oct 2, 2020
acaab3c
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Oct 2, 2020
8b0dc46
Add top_n
brandenchan Oct 5, 2020
5b5f44a
3x speedup of SQL. add postgres docker run. make shuffle neg a param.…
tholor Oct 5, 2020
b90b3c6
Add models to sweep
brandenchan Oct 6, 2020
5eb7aa5
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Oct 6, 2020
7f70cdb
merge latest master
tholor Oct 6, 2020
331ddf8
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 6, 2020
cc44d0d
add option for faiss index type
tholor Oct 6, 2020
009eeb9
remove unneeded line
brandenchan Oct 6, 2020
c2862a9
change faiss to faiss_flat
brandenchan Oct 6, 2020
93090ee
begin automatic benchmark script
brandenchan Oct 6, 2020
7088b57
remove existing postgres docker for benchmarking
tholor Oct 6, 2020
ce055bd
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 6, 2020
44376cb
Add data processing scripts
brandenchan Oct 7, 2020
f20e226
Remove shuffle in script bc data already shuffled
brandenchan Oct 7, 2020
dcc79e3
switch hnsw setup from 256 to 128
tholor Oct 7, 2020
0f4a11d
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 7, 2020
cabf0e1
change es similarity to dot product by default
brandenchan Oct 7, 2020
af127db
Error includes stack trace
brandenchan Oct 7, 2020
ce2da2a
Change ES default timeout
brandenchan Oct 7, 2020
4ca0cf3
remove delete_docs() from timing for indexing
tholor Oct 8, 2020
3f78eba
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 8, 2020
37f3c92
Add support for website export
brandenchan Oct 8, 2020
51c2df7
update website on push to benchmarks
PiffPaffM Oct 8, 2020
8b57f0e
add complete benchmarks results
brandenchan Oct 8, 2020
1e3ee45
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Oct 8, 2020
b9dc09f
new json format
PiffPaffM Oct 8, 2020
4f272c5
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
PiffPaffM Oct 8, 2020
716cf7b
removed NaN as is not a valid json token
PiffPaffM Oct 8, 2020
a06f032
versioning for docs
PiffPaffM Oct 8, 2020
ddea3d3
merged master
PiffPaffM Oct 13, 2020
080739e
unsaved changes
PiffPaffM Oct 13, 2020
26b5fc6
cleaning
PiffPaffM Oct 13, 2020
80b61cb
cleaning
PiffPaffM Oct 13, 2020
2cb0295
Edit format of benchmarks data
brandenchan Oct 16, 2020
a6f65b0
update also jsons in v0.4.0
PiffPaffM Oct 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add retriever benchmarking
deepset authored and deepset committed Aug 31, 2020

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 36fed3b3b74db32254650d708c9e2ef81cd089b8
26 changes: 16 additions & 10 deletions haystack/database/faiss.py
Original file line number Diff line number Diff line change
@@ -109,11 +109,17 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non
:param index: Index name to update
:return: None
"""
# Some FAISS indexes(like the default HNSWx) do not support removing vectors, so a new index is created.
faiss_index = self._create_new_index(vector_size=self.vector_size)
index = index or self.index

index = index or self.index
documents = self.get_all_documents(index=index)

if len(documents) == 0:
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
self.faiss_index = None
return

# Some FAISS indexes(like the default HNSWx) do not support removing vectors, so a new index is created.
faiss_index = self._create_new_index(vector_size=self.vector_size)
logger.info(f"Updating embeddings for {len(documents)} docs ...")
embeddings = retriever.embed_passages(documents) # type: ignore
assert len(documents) == len(embeddings)
@@ -127,14 +133,14 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non
hnsw_vectors = self._get_hnsw_vectors(embeddings=embeddings, phi=phi)
faiss_index.add(hnsw_vectors)

doc_meta_to_update = []
for vector_id, doc in enumerate(documents[i : i + self.index_buffer_size]):
meta = doc.meta or {}
meta["vector_id"] = vector_id
doc_meta_to_update.append((doc.id, meta))
doc_meta_to_update = []
for vector_id, doc in enumerate(documents[i : i + self.index_buffer_size]):
meta = doc.meta or {}
meta["vector_id"] = vector_id
doc_meta_to_update.append((doc.id, meta))

for doc_id, meta in doc_meta_to_update:
super(FAISSDocumentStore, self).update_document_meta(id=doc_id, meta=meta)
for doc_id, meta in doc_meta_to_update:
super(FAISSDocumentStore, self).update_document_meta(id=doc_id, meta=meta)

self.faiss_index = faiss_index

22 changes: 19 additions & 3 deletions haystack/retriever/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from abc import ABC, abstractmethod
from typing import List
import logging
from time import perf_counter
from functools import wraps

from haystack.database.base import Document
from haystack.database.base import BaseDocumentStore
@@ -15,6 +17,18 @@ class BaseRetriever(ABC):
def retrieve(self, query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]:
pass

def timing(self, fn):
    """Return *fn* wrapped so that its cumulative wall-clock run time
    accrues on ``self.retrieve_time``.

    The accumulator is created lazily on first call, so the wrapper can be
    applied to any instance without prior setup.
    """
    @wraps(fn)
    def timed(*args, **kwargs):
        # Initialise the counter in the instance dict only (mirrors the
        # original `"retrieve_time" not in self.__dict__` check).
        self.__dict__.setdefault("retrieve_time", 0)
        start = perf_counter()
        result = fn(*args, **kwargs)
        self.retrieve_time += perf_counter() - start
        return result
    return timed

def eval(
self,
label_index: str = "label",
@@ -45,6 +59,8 @@ def eval(
# Extract all questions for evaluation
filters = {"origin": [label_origin]}

timed_retrieve = self.timing(self.retrieve)

labels = self.document_store.get_all_labels_aggregated(index=label_index, filters=filters)

correct_retrievals = 0
@@ -62,7 +78,7 @@ def eval(
# Option 1: Open-domain evaluation by checking if the answer string is in the retrieved docs
if open_domain:
for question, gold_answers in question_label_dict.items():
retrieved_docs = self.retrieve(question, top_k=top_k, index=doc_index)
retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index)
# check if correct doc in retrieved docs
for doc_idx, doc in enumerate(retrieved_docs):
for gold_answer in gold_answers:
@@ -73,7 +89,7 @@ def eval(
# Option 2: Strict evaluation by document ids that are listed in the labels
else:
for question, gold_ids in question_label_dict.items():
retrieved_docs = self.retrieve(question, top_k=top_k, index=doc_index)
retrieved_docs = timed_retrieve(question, top_k=top_k, index=doc_index)
# check if correct doc in retrieved docs
for doc_idx, doc in enumerate(retrieved_docs):
for gold_id in gold_ids:
@@ -89,4 +105,4 @@ def eval(
logger.info((f"For {correct_retrievals} out of {number_of_questions} questions ({recall:.2%}), the answer was in"
f" the top-{top_k} candidate passages selected by the retriever."))

return {"recall": recall, "map": mean_avg_precision}
return {"recall": recall, "map": mean_avg_precision, "retrieve_time": self.retrieve_time}
88 changes: 88 additions & 0 deletions test/benchmarks/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from haystack.indexing.utils import fetch_archive_from_http
import os
from haystack.database.sql import SQLDocumentStore
from haystack.database.memory import InMemoryDocumentStore
from haystack.database.elasticsearch import Elasticsearch, ElasticsearchDocumentStore
from haystack.database.faiss import FAISSDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever, TfidfRetriever
from haystack.retriever.dense import DensePassageRetriever
from time import perf_counter

from pathlib import Path


# (retriever, document store) pairs benchmarked together; the store type
# must be compatible with the retriever (sparse -> elasticsearch, dense -> faiss).
retriever_doc_stores = [("elastic", "elasticsearch"),
                        ("dpr", "faiss")]
reader_models = [""]  # TODO(review): reader model names not yet filled in
reader_type = ["farm", "transformers"]  # reader frameworks to benchmark

# Natural Questions dev subset used as the evaluation dataset.
data_dir = Path("../../data/nq")
filename = "nq_dev_subset_v3.json"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v3.json.zip"
doc_index = "eval_document"  # index name for the evaluation documents
label_index = "label"        # index name for the gold labels

def prepare_data(data_dir, url=None):
    """Download and unpack the evaluation dataset archive into *data_dir*.

    :param data_dir: target directory for the extracted files
    :param url: archive URL; defaults to the module-level ``s3_url``
        (kept as ``None`` here so the default stays in one place)
    :return: None
    """
    fetch_archive_from_http(url=url or s3_url, output_dir=data_dir)

def get_document_store(document_store_type):
    """Instantiate a fresh document store of the requested type.

    Leftover state from a previous run (SQLite files, Elasticsearch test
    indices) is removed first so every benchmark starts from a clean slate.

    TODO: this is taken from test/conftest.py and maybe should be within
    Haystack itself, e.g. a DocumentStore factory classmethod.

    :param document_store_type: one of "sql", "memory", "elasticsearch", "faiss"
    :raises ValueError: if the type is not recognised (ValueError subclasses
        Exception, so callers catching the old generic Exception still work)
    :return: the freshly created document store instance
    """
    if document_store_type == "sql":
        # Start from an empty SQLite database.
        if os.path.exists("haystack_test.db"):
            os.remove("haystack_test.db")
        document_store = SQLDocumentStore(url="sqlite:///haystack_test.db")
    elif document_store_type == "memory":
        document_store = InMemoryDocumentStore()
    elif document_store_type == "elasticsearch":
        # make sure we start from a fresh index
        client = Elasticsearch()
        client.indices.delete(index='haystack_test*', ignore=[404])
        document_store = ElasticsearchDocumentStore(index="haystack_test")
    elif document_store_type == "faiss":
        if os.path.exists("haystack_test_faiss.db"):
            os.remove("haystack_test_faiss.db")
        document_store = FAISSDocumentStore(sql_url="sqlite:///haystack_test_faiss.db")
    else:
        raise ValueError(f"No document store fixture for '{document_store_type}'")
    return document_store

def get_retriever(retriever_name, doc_store):
    """Instantiate the requested retriever on top of *doc_store*.

    :param retriever_name: one of "elastic", "tfidf", "dpr"
    :param doc_store: document store the retriever will query
    :raises ValueError: for an unknown retriever name (previously the
        function fell through and silently returned None, which only
        surfaced later as a confusing AttributeError in the caller)
    :return: the retriever instance
    """
    if retriever_name == "elastic":
        return ElasticsearchRetriever(doc_store)
    if retriever_name == "tfidf":
        return TfidfRetriever(doc_store)
    if retriever_name == "dpr":
        return DensePassageRetriever(document_store=doc_store,
                                     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                     use_gpu=True)
    raise ValueError(f"Unknown retriever '{retriever_name}'")


def benchmark_indexing(doc_store, data_dir, filename, retriever,
                       doc_index="eval_document", label_index="label"):
    """Wipe the store, index the evaluation dataset and time how long it takes.

    :param doc_store: document store to index into (its doc/label indices are
        deleted first so timings are comparable between runs)
    :param data_dir: directory containing the dataset file
    :param filename: dataset file name within *data_dir*
    :param retriever: used to compute embeddings when the store supports them
    :param doc_index: name of the document index (defaults to the module constant)
    :param label_index: name of the label index (defaults to the module constant)
    :return: (doc_store, elapsed_seconds)
    """
    doc_store.delete_all_documents(index=doc_index)
    doc_store.delete_all_documents(index=label_index)
    tic = perf_counter()
    doc_store.add_eval_data(data_dir / filename)
    # Dense stores need their embeddings computed; sparse stores don't
    # implement update_embeddings. Check the capability explicitly instead
    # of swallowing AttributeError, which previously also hid genuine bugs
    # raised *inside* update_embeddings.
    if callable(getattr(doc_store, "update_embeddings", None)):
        doc_store.update_embeddings(retriever, index=doc_index)
    toc = perf_counter()
    elapsed = toc - tic  # renamed from `time` to avoid shadowing the stdlib module name
    return doc_store, elapsed


def main():
    """Run the full benchmark sweep: for every configured (retriever,
    document store) pair, build the store, time the indexing and report the
    retriever's evaluation metrics (recall, mAP, retrieve time) on stdout.
    """
    # prepare_data(data_dir)
    for retriever_type, store_type in retriever_doc_stores:
        store = get_document_store(store_type)
        retriever = get_retriever(retriever_type, store)
        store, elapsed = benchmark_indexing(store, data_dir, filename, retriever)
        metrics = retriever.eval()
        metrics["indexing_time"] = elapsed
        print(metrics)

if __name__ == "__main__":
    main()