Cohere vector benchmarks #444

Merged
51 changes: 51 additions & 0 deletions cohere_vector/README.md
@@ -0,0 +1,51 @@
## Cohere vector track

This track benchmarks the dataset from [Cohere/miracl-en-corpus-22-12](https://huggingface.co/datasets/Cohere/miracl-en-corpus-22-12).

Given the size of this dataset (32.8M documents with 768-dimension vectors), you
need a cluster with at least 103GB of total RAM available to run performant HNSW queries.
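
As a rough back-of-envelope check, 32.8M vectors × 768 dimensions × 4 bytes per float comes to about 101GB for the raw vector data alone; the HNSW graph structures add a few more GB on top, which is likely where the 103GB figure comes from.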

### Generating the document dataset

To rebuild the dataset, run the following commands:

```shell
$ python _tools/parse_documents.py
# Create a test file for each page of documents
$ for file in cohere-documents-*; do
    head -n 1000 "$file" > "${file%.*}-1k.json"
done
# Compress each document file with bzip2 for uploading
$ for file in cohere-documents-*; do
    pv "$file" | bzip2 > "$file.bz2"
done
```

This will build 11 `cohere-documents-XX.json` files covering the entire dataset of 32.8M documents and then compress them with bzip2. Note that this script depends on the libraries listed in `_tools/requirements.txt`, and it takes a few hours to download and parse all the documents. The script normalizes each embedding vector to unit length so that the vectors can be indexed into an Elasticsearch index with the `dot_product` similarity.

### Example Document

```json
{
"docid": "31958810#2",
"title": "Daybehavior",
"text": "During 1998 and 1999 they, recorded their follow-up album with Kevin Petri, engineer on Massive Attack's debut album \"Blue Lines\" (1991). NONS, dealing with financial problems, went into bankruptcy 99 and the album was locked from being released. The band in despair decided to take a break and Arell moved to Thailand.",
"emb": [0.027735009072141308, 0.014094767951423247, 0.03152555797377242, ...]
}
```

### Generating the queries

The `queries.json` file can be rebuilt using `_tools/parse_queries.py`. This script loads the queries dataset from Hugging Face, normalizes the vectors to unit length, and writes the result to the `queries.json` file.

### Parameters

This track accepts the following parameters with Rally 0.8.0+ using `--track-params` (see the example invocation after the list):

- bulk_size (default: 500)
- bulk_indexing_clients (default: 5)
- bulk_warmup (default: 40)
- ingest_percentage (default: 100)
- index_settings (default: {})
- number_of_shards (default: 1)
- number_of_replicas (default: 0)
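
For example, a run that overrides some of these defaults might look like the following sketch (the `--track` value assumes the track is registered with Rally as `cohere_vector`):

```shell
$ esrally race --track=cohere_vector \
    --track-params="bulk_size:1000,bulk_indexing_clients:8,number_of_shards:2"
```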
87 changes: 87 additions & 0 deletions cohere_vector/_tools/parse_documents.py
@@ -0,0 +1,87 @@
import json
import sys

import numpy as np
from datasets import DownloadMode, load_dataset

DATASET_NAME: str = f"Cohere/miracl-en-corpus-22-12"
DATASET_DL_PROCS: int = 6
OUTPUT_FILENAME: str = "cohere-documents"
DEFAULT_MAX_DOCS = -1
TOTAL_DOCS = 32893221
MAX_DOCS_PER_FILE = 3_000_000
TOTAL_PAGES = 11
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_pages(start_page, end_page):
    for page in range(start_page, end_page + 1):
        start_index = (page - 1) * MAX_DOCS_PER_FILE
        end_index = start_index + MAX_DOCS_PER_FILE
        if end_index > TOTAL_DOCS:
            end_index = TOTAL_DOCS
        output_filename = f"{OUTPUT_FILENAME}-{page:02d}.json"
        print(f"Outputting page {page} documents to {output_filename}")
        with open(output_filename, "w") as documents_file:
            output_documents(documents_file, start_index, end_index)


def output_documents(docs_file, start_index, end_index):
    doc_count = 0
    dataset_size = end_index - start_index
    print(f"Parsing {dataset_size} documents from {DATASET_NAME} [{start_index}:{end_index}]")
    docs = load_dataset(
        DATASET_NAME,
        split=f"train[{start_index}:{end_index}]",
        num_proc=DATASET_DL_PROCS,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

    progress_bar(doc_count, dataset_size)
    for doc in docs:
        # Normalize the embedding to unit length so it can be indexed
        # with the dot_product similarity.
        v = np.array(doc["emb"])
        v_unit = v / np.linalg.norm(v)
        docs_file.write(
            json.dumps(
                {
                    "docid": doc["docid"],
                    "title": doc["title"],
                    "text": doc["text"],
                    "emb": v_unit.tolist(),
                },
                ensure_ascii=True,
            )
        )
        docs_file.write("\n")
        doc_count += 1
        if doc_count % PROGRESS_EVERY == 0:
            progress_bar(doc_count, dataset_size)
    print(f"Wrote {doc_count} documents to output file.")


def parse_arguments():
    if len(sys.argv) >= 3:
        return (DEFAULT_MAX_DOCS, int(sys.argv[1]), int(sys.argv[2]))

    if len(sys.argv) >= 2:
        return (int(sys.argv[1]), 1, TOTAL_PAGES)
    return (DEFAULT_MAX_DOCS, 1, TOTAL_PAGES)


if __name__ == "__main__":
(max_documents, start_page, end_page) = parse_arguments()
if max_documents == DEFAULT_MAX_DOCS:
output_pages(start_page, end_page)
else:
print("Outputing documents to {}.json".format(OUTPUT_FILENAME))
with open(f"{OUTPUT_FILENAME}.json", "w") as documents_file:
output_documents(documents_file, 0, max_documents)
22 changes: 22 additions & 0 deletions cohere_vector/_tools/parse_queries.py
@@ -0,0 +1,22 @@
import json

import numpy as np
from datasets import load_dataset

DATASET_NAME: str = f"Cohere/miracl-en-queries-22-12"
DATASET_SPLIT: str = "train"
OUTPUT_FILENAME: str = "queries.json"


def output_queries(queries_file):
    queries = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
    for query in queries:
        # Normalize each query vector to unit length to match the indexed documents.
        v = np.array(query["emb"])
        v_unit = v / np.linalg.norm(v)
        queries_file.write(json.dumps(v_unit.tolist()))
        queries_file.write("\n")


if __name__ == "__main__":
with open(OUTPUT_FILENAME, "w") as queries_file:
output_queries(queries_file)
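
Each line of the resulting file is a bare JSON array holding one unit-normalized 768-dimension query vector, one query per line; the 2,863 lines added to `queries.json` in this diff match that layout:

```shell
# One query vector per line
$ wc -l queries.json
2863 queries.json
```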
1 change: 1 addition & 0 deletions cohere_vector/_tools/requirements.txt
@@ -0,0 +1 @@
datasets
59 changes: 59 additions & 0 deletions cohere_vector/challenges/default.json
@@ -0,0 +1,59 @@
{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "index-documents",
      "operation": "index-documents",
      "warmup-time-period": {{ bulk_warmup | default(40) | int }},
      "clients": {{bulk_indexing_clients | default(5)}}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    },
    {
      "name": "knn-search-10-100",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "iterations": 1000
    },
    {
      "name": "knn-search-100-1000",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "iterations": 1000
    }
  ]
}
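
A note on the schedule: the `wait-until-merges-finish-after-index` task polls the index-stats API until `_all.total.merges.current` reaches 0, so the two kNN search tasks that follow measure a quiesced index rather than one still merging segments in the background.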
22 changes: 22 additions & 0 deletions cohere_vector/files.txt
@@ -0,0 +1,22 @@
cohere-documents-01.json
cohere-documents-01-1k.json
cohere-documents-02.json
cohere-documents-02-1k.json
cohere-documents-03.json
cohere-documents-03-1k.json
cohere-documents-04.json
cohere-documents-04-1k.json
cohere-documents-05.json
cohere-documents-05-1k.json
cohere-documents-06.json
cohere-documents-06-1k.json
cohere-documents-07.json
cohere-documents-07-1k.json
cohere-documents-08.json
cohere-documents-08-1k.json
cohere-documents-09.json
cohere-documents-09-1k.json
cohere-documents-10.json
cohere-documents-10-1k.json
cohere-documents-11.json
cohere-documents-11-1k.json
28 changes: 28 additions & 0 deletions cohere_vector/index.json
@@ -0,0 +1,28 @@
{
  "settings": {
    "index": {
      "number_of_shards": {{number_of_shards | default(1)}},
      "number_of_replicas": {{number_of_replicas | default(0)}}
    }
  },
  "mappings": {
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "title": {
        "type": "text"
      },
      "text": {
        "type": "text"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 768,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
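
Note that the `dot_product` similarity requires vectors to be unit length at index time; for unit vectors the dot product equals cosine similarity while being cheaper to compute, which is why both `_tools` scripts normalize the embeddings before writing them out.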
33 changes: 33 additions & 0 deletions cohere_vector/operations/default.json
@@ -0,0 +1,33 @@
{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{index_settings | default({}) | tojson}}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "index-documents",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(500)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
},
{
  "name": "knn-search-10-100",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 10,
  "num-candidates": 100
},
{
  "name": "knn-search-100-1000",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 100,
  "num-candidates": 1000
}
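
The `knn-param-source` above is a custom Rally parameter source whose implementation lives in the track's `track.py`, which is not part of this diff. Assuming it issues a standard Elasticsearch kNN search with a vector drawn from `queries.json`, the request behind `knn-search-10-100` would look roughly like this sketch (the index name and vector values are placeholders):

```shell
$ curl -s -XPOST 'localhost:9200/<index-name>/_search' \
    -H 'Content-Type: application/json' -d '
{
  "knn": {
    "field": "emb",
    "query_vector": [0.0277, 0.0141, ...],
    "k": 10,
    "num_candidates": 100
  }
}'
```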
2,863 changes: 2,863 additions & 0 deletions cohere_vector/queries.json

Large diffs are not rendered by default.
