Add versioning docs #495

Merged · 60 commits · Oct 19, 2020

Changes from 1 commit

Commits (60)
a516733
add time and perf benchmark for es
brandenchan Aug 27, 2020
36fed3b
Add retriever benchmarking
Aug 31, 2020
a7cab15
Add Reader benchmarking
Aug 31, 2020
62a571f
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Sep 4, 2020
7b6e237
add nq to squad conversion
brandenchan Sep 9, 2020
12f10e3
add conversion stats
brandenchan Sep 15, 2020
11a013b
clean benchmarks
brandenchan Sep 16, 2020
b224812
Add link to dataset
brandenchan Sep 16, 2020
aefe431
Merge branch 'master' into benchmarks
brandenchan Sep 18, 2020
a4f7ca2
Update imports
brandenchan Sep 18, 2020
baeae8c
Merge branch 'master' into benchmarks
brandenchan Sep 21, 2020
f7ca141
add first support for neg psgs
brandenchan Sep 21, 2020
3663871
Refactor test
brandenchan Sep 23, 2020
e48b314
set max_seq_len
brandenchan Sep 24, 2020
0ca92e5
cleanup benchmark
brandenchan Sep 24, 2020
abdbed1
Merge branch 'master' into benchmarks
brandenchan Sep 24, 2020
7e31588
begin retriever speed benchmarking
brandenchan Sep 24, 2020
768ad2b
Merge branch 'master' into benchmarks
brandenchan Sep 28, 2020
7611a31
Add support for retriever query index benchmarking
brandenchan Sep 28, 2020
22543be
improve reader eval, retriever speed benchmarking
brandenchan Sep 29, 2020
8f2c3a6
improve retriever speed benchmarking
brandenchan Oct 1, 2020
8b45c3e
Add retriever accuracy benchmark
brandenchan Oct 1, 2020
c238b4f
merge latest master
tholor Oct 2, 2020
03b80f6
Add neg doc shuffling
brandenchan Oct 2, 2020
acaab3c
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Oct 2, 2020
8b0dc46
Add top_n
brandenchan Oct 5, 2020
5b5f44a
3x speedup of SQL. add postgres docker run. make shuffle neg a param.…
tholor Oct 5, 2020
b90b3c6
Add models to sweep
brandenchan Oct 6, 2020
5eb7aa5
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Oct 6, 2020
7f70cdb
merge latest master
tholor Oct 6, 2020
331ddf8
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 6, 2020
cc44d0d
add option for faiss index type
tholor Oct 6, 2020
009eeb9
remove unneeded line
brandenchan Oct 6, 2020
c2862a9
change faiss to faiss_flat
brandenchan Oct 6, 2020
93090ee
begin automatic benchmark script
brandenchan Oct 6, 2020
7088b57
remove existing postgres docker for benchmarking
tholor Oct 6, 2020
ce055bd
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 6, 2020
44376cb
Add data processing scripts
brandenchan Oct 7, 2020
f20e226
Remove shuffle in script bc data already shuffled
brandenchan Oct 7, 2020
dcc79e3
switch hnsw setup from 256 to 128
tholor Oct 7, 2020
0f4a11d
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 7, 2020
cabf0e1
change es similarity to dot product by default
brandenchan Oct 7, 2020
af127db
Error includes stack trace
brandenchan Oct 7, 2020
ce2da2a
Change ES default timeout
brandenchan Oct 7, 2020
4ca0cf3
remove delete_docs() from timing for indexing
tholor Oct 8, 2020
3f78eba
Merge branch 'benchmarks' of github.com:deepset-ai/haystack into benc…
tholor Oct 8, 2020
37f3c92
Add support for website export
brandenchan Oct 8, 2020
51c2df7
update website on push to benchmarks
PiffPaffM Oct 8, 2020
8b57f0e
add complete benchmarks results
brandenchan Oct 8, 2020
1e3ee45
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
brandenchan Oct 8, 2020
b9dc09f
new json format
PiffPaffM Oct 8, 2020
4f272c5
Merge branch 'benchmarks' of https://github.com/deepset-ai/haystack i…
PiffPaffM Oct 8, 2020
716cf7b
removed NaN as is not a valid json token
PiffPaffM Oct 8, 2020
a06f032
versioning for docs
PiffPaffM Oct 8, 2020
ddea3d3
merged master
PiffPaffM Oct 13, 2020
080739e
unsaved changes
PiffPaffM Oct 13, 2020
26b5fc6
cleaning
PiffPaffM Oct 13, 2020
80b61cb
cleaning
PiffPaffM Oct 13, 2020
2cb0295
Edit format of benchmarks data
brandenchan Oct 16, 2020
a6f65b0
update also jsons in v0.4.0
PiffPaffM Oct 16, 2020
Add support for retriever query index benchmarking
brandenchan committed Sep 28, 2020
commit 7611a318a96e00c9ff7229014aa221affce39d67
120 changes: 86 additions & 34 deletions test/benchmarks/retriever.py
@@ -3,53 +3,47 @@
 from time import perf_counter
 from utils import get_document_store, get_retriever
 from haystack.preprocessor.utils import eval_data_from_file
+import pickle


 retriever_doc_stores = [("dpr", "faiss"), ("elastic", "elasticsearch")]
-n_docs_options = [1000]
+n_docs_options = [1000, 5000, 10000]

 data_dir = Path("../../data/retriever")
 filename_gold = "nq2squad-dev.json" # Found at s3://ext-haystack-retriever-eval
 filename_negative = "psgs_w100_minus_gold.tsv" # Found at s3://ext-haystack-retriever-eval
+embeddings_dir = Path("data/retriever_results/nq/single")
+embeddings_filenames = [f"wikipedia_passages_{i}.pkl" for i in range(50)]

 doc_index = "eval_document"
 label_index = "label"


-def benchmark_speed():
-    retriever_results = []
-    for retriever_name, doc_store_name in retriever_doc_stores:
-        for n_docs in n_docs_options:
-            # try:
-            doc_store = get_document_store(doc_store_name)
-            retriever = get_retriever(retriever_name, doc_store)
-            docs, labels = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs)
-            benchmark_indexing_speed()
-            # benchmark_querying_speed()
-
-            doc_store, indexing_time = benchmark_indexing_speed(doc_store, docs, labels, retriever)
-            print(indexing_time)
-
-            results = retriever.eval()
-            # results["indexing_time"] = indexing_time
-            # results["retriever"] = retriever_name
-            # results["doc_store"] = doc_store_name
-            # print(results)
-            # retriever_results.append(results)
-            # except Exception as e:
-            #     retriever_results.append(str(e))
-
-    retriever_df = pd.DataFrame.from_records(retriever_results)
-    retriever_df.to_csv("retriever_results.csv")
-
-def prepare_data(data_dir, filename_gold, filename_negative, n_docs=None):
+def prepare_data(data_dir, filename_gold, filename_negative, n_docs=None, add_precomputed=False):
     """
     filename_gold points to a squad format file.
     filename_negative points to a csv file where the first column is doc_id and second is document text.
     """

     gold_docs, labels = eval_data_from_file(data_dir / filename_gold)

+    # Reduce number of docs and remove labels whose gold docs have been removed
+    gold_docs = gold_docs[:n_docs]
+    doc_ids = [x.id for x in gold_docs]
+    labels = [x for x in labels if x.document_id in doc_ids]
+
+    n_neg_docs = max(0, n_docs - len(gold_docs))
+    neg_docs = prepare_negative_passages(data_dir, filename_negative, n_neg_docs)
+    docs = gold_docs + neg_docs
+
+    if add_precomputed:
+        docs = add_precomputed_embeddings(data_dir / embeddings_dir, embeddings_filenames, docs)
+
     return docs, labels

 def prepare_negative_passages(data_dir, filename_negative, n_docs):
@@ -72,22 +66,80 @@ def prepare_negative_passages(data_dir, filename_negative, n_docs):
         docs.append(d)
     return docs

-def benchmark_indexing_speed(doc_store, docs, labels, retriever):
-    tic = perf_counter()
-    index_to_doc_store(doc_store, docs, labels, retriever)
-    toc = perf_counter()
-    time = toc - tic
-    return doc_store, time
+def benchmark_indexing_speed():
+
+    retriever_results = []
+    for retriever_name, doc_store_name in retriever_doc_stores:
+        for n_docs in n_docs_options:
+            # try:
+
+            doc_store = get_document_store(doc_store_name)
+            retriever = get_retriever(retriever_name, doc_store)
+
+            docs, _ = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs)
+
+            tic = perf_counter()
+            index_to_doc_store(doc_store, docs, retriever)
+            toc = perf_counter()
+            indexing_time = toc - tic
+
+            print(indexing_time)
+
+            # results = retriever.eval()
+            # results["indexing_time"] = indexing_time
+            # results["retriever"] = retriever_name
+            # results["doc_store"] = doc_store_name
+            # print(results)
+            # retriever_results.append(results)
+            # # except Exception as e:
+            # #     retriever_results.append(str(e))
+
+            retriever_results.append({
+                "retriever": retriever_name,
+                "doc_store": doc_store_name,
+                "n_docs": n_docs,
+                "indexing_time": indexing_time})
+            retriever_df = pd.DataFrame.from_records(retriever_results)
+            retriever_df.to_csv("retriever_index_results.csv")
+
+            del doc_store
+            del retriever
+
+def benchmark_querying_speed():
+    """ Benchmark the time it takes to perform querying. Doc embeddings are loaded from file."""
+    retriever_results = []
+    for retriever_name, doc_store_name in retriever_doc_stores:
+        doc_store = get_document_store(doc_store_name)
+        retriever = get_retriever(retriever_name, doc_store)
+        for n_docs in n_docs_options:
+            # try:
+            add_precomputed = retriever_name in ["dpr"]
+            docs, labels = prepare_data(data_dir, filename_gold, filename_negative, n_docs=n_docs, add_precomputed=add_precomputed)
+
+            index_to_doc_store(doc_store, docs, retriever, labels)
+
+def add_precomputed_embeddings(embeddings_dir, embeddings_filenames, docs):
+    ret = []
+    id_to_doc = {x.id: x for x in docs}
+
+    for ef in embeddings_filenames:
+        filename = embeddings_dir / ef
+        print(filename)
+        data = pickle.load(open(filename, "rb"))
+        for i, vec in data:
+            if i in id_to_doc:
+                id_to_doc[i].embedding = vec
+
+    return id_to_doc.values()

-def index_to_doc_store(doc_store, docs, labels, retriever):
+def index_to_doc_store(doc_store, docs, retriever, labels=None):
     doc_store.delete_all_documents(index=doc_index)
     doc_store.delete_all_documents(index=label_index)
     doc_store.write_documents(docs, doc_index)
-    doc_store.write_labels(labels, index=label_index)
-    if callable(getattr(retriever, "embed_passages", None)):
+    if labels:
+        doc_store.write_labels(labels, index=label_index)
+    elif callable(getattr(retriever, "embed_passages", None)) and docs[0].embedding is None:
         doc_store.update_embeddings(retriever, index=doc_index)
     else:
         pass


 if __name__ == "__main__":
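
The diff view is cut off before the body of the `__main__` block. For orientation, a minimal driver for the functions this commit adds might look like the sketch below. It is an illustrative assumption, not the commit's actual entry point, and it presumes the backends wired up in `utils.get_document_store` (e.g. a local Elasticsearch instance and a FAISS store) are reachable:

    # Hypothetical driver (not part of this commit): exercise the new
    # indexing benchmark. benchmark_indexing_speed() loops over every
    # (retriever, doc_store) pair and corpus size defined at the top of
    # retriever.py, times index_to_doc_store() with perf_counter, and
    # writes the results to retriever_index_results.csv.
    benchmark_indexing_speed()

    # benchmark_querying_speed() is still WIP in this commit: it indexes
    # docs and labels (attaching precomputed DPR embeddings from the pickle
    # files in embeddings_dir, each assumed to unpickle to (doc_id, vector)
    # pairs) but does not yet time or record any queries.

Note the changed contract of `index_to_doc_store`: labels are now optional, and embeddings are only computed at indexing time when the retriever can embed passages and the documents do not already carry precomputed vectors.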