Cohere vector benchmarks #444

Merged
51 changes: 51 additions & 0 deletions cohere_vector/README.md
@@ -0,0 +1,51 @@
## Cohere vector track

This track benchmarks the dataset from [Cohere/miracl-en-corpus-22-12](https://huggingface.co/datasets/Cohere/miracl-en-corpus-22-12).

Given the size of this dataset (32.8M documents with 768-dimension vectors), you
need a cluster with at least 103GB of total RAM available to run performant HNSW queries.
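
As a rough back-of-envelope check, 32.8M vectors × 768 dimensions × 4 bytes per float comes to about 101GB for the raw vector data alone; the HNSW graph structures add a few more GB on top, which is likely where the 103GB figure comes from.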

### Generating the document dataset

To rebuild the dataset, run the following commands:

```shell
$ python _tools/parse_documents.py
# Create a test file for each page of documents
$ for file in cohere-documents-*; do
    head -n 1000 "$file" > "${file%.*}-1k.json"
done
# Compress each document file with bzip2 for uploading
$ for file in cohere-documents-*; do
    pv "$file" | bzip2 > "$file.bz2"
done
```

This will build 11 `cohere-documents-XX.json` files covering the entire dataset of 32.8M documents and then compress them with bzip2. Note that this script depends on the libraries listed in `_tools/requirements.txt`, and it takes a few hours to download and parse all the documents. The script normalizes each embedding vector to unit length so that the vectors can be indexed into an Elasticsearch index with the `dot_product` similarity.

### Example Document

```json
{
"docid": "31958810#2",
"title": "Daybehavior",
"text": "During 1998 and 1999 they, recorded their follow-up album with Kevin Petri, engineer on Massive Attack's debut album \"Blue Lines\" (1991). NONS, dealing with financial problems, went into bankruptcy 99 and the album was locked from being released. The band in despair decided to take a break and Arell moved to Thailand.",
"emb": [0.027735009072141308, 0.014094767951423247, 0.03152555797377242, ...]
}
```

### Generating the queries

The `queries.json` file can be rebuilt using `_tools/parse_queries.py`. This script loads the queries dataset from Hugging Face, normalizes the vectors to unit length, and writes the result to the `queries.json` file.

### Parameters

This track accepts the following parameters with Rally 0.8.0+ using `--track-params` (see the example invocation after the list):

- bulk_size (default: 500)
- bulk_indexing_clients (default: 5)
- bulk_warmup (default: 40)
- ingest_percentage (default: 100)
- index_settings (default: {})
- number_of_shards (default: 1)
- number_of_replicas (default: 0)
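
For example, a run that overrides some of these defaults might look like the following sketch (the `--track` value assumes the track is registered with Rally as `cohere_vector`):

```shell
$ esrally race --track=cohere_vector \
    --track-params="bulk_size:1000,bulk_indexing_clients:8,number_of_shards:2"
```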
87 changes: 87 additions & 0 deletions cohere_vector/_tools/parse_documents.py
@@ -0,0 +1,87 @@
import json
import sys

import numpy as np
from datasets import DownloadMode, load_dataset

DATASET_NAME: str = f"Cohere/miracl-en-corpus-22-12"
DATASET_DL_PROCS: int = 6
OUTPUT_FILENAME: str = "cohere-documents"
DEFAULT_MAX_DOCS = -1
TOTAL_DOCS = 32893221
MAX_DOCS_PER_FILE = 3_000_000
TOTAL_PAGES = 11
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_pages(start_page, end_page):
    for page in range(start_page, end_page + 1):
        start_index = (page - 1) * MAX_DOCS_PER_FILE
        end_index = start_index + MAX_DOCS_PER_FILE
        if end_index > TOTAL_DOCS:
            end_index = TOTAL_DOCS
        output_filename = f"{OUTPUT_FILENAME}-{page:02d}.json"
        print(f"Outputting page {page} documents to {output_filename}")
        with open(output_filename, "w") as documents_file:
            output_documents(documents_file, start_index, end_index)


def output_documents(docs_file, start_index, end_index):
    doc_count = 0
    dataset_size = end_index - start_index
    print(f"Parsing {dataset_size} documents from {DATASET_NAME} [{start_index}:{end_index}]")
    docs = load_dataset(
        DATASET_NAME,
        split=f"train[{start_index}:{end_index}]",
        num_proc=DATASET_DL_PROCS,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )

    progress_bar(doc_count, dataset_size)
    for doc in docs:
        # Normalize the embedding to unit length so it can be indexed
        # with the dot_product similarity.
        v = np.array(doc["emb"])
        v_unit = v / np.linalg.norm(v)
        docs_file.write(
            json.dumps(
                {
                    "docid": doc["docid"],
                    "title": doc["title"],
                    "text": doc["text"],
                    "emb": v_unit.tolist(),
                },
                ensure_ascii=True,
            )
        )
        docs_file.write("\n")
        doc_count += 1
        if doc_count % PROGRESS_EVERY == 0:
            progress_bar(doc_count, dataset_size)
    print(f"Wrote {doc_count} documents to output file.")


def parse_arguments():
    if len(sys.argv) >= 3:
        return (DEFAULT_MAX_DOCS, int(sys.argv[1]), int(sys.argv[2]))

    if len(sys.argv) >= 2:
        return (int(sys.argv[1]), 1, TOTAL_PAGES)
    return (DEFAULT_MAX_DOCS, 1, TOTAL_PAGES)


if __name__ == "__main__":
(max_documents, start_page, end_page) = parse_arguments()
if max_documents == DEFAULT_MAX_DOCS:
output_pages(start_page, end_page)
else:
print("Outputing documents to {}.json".format(OUTPUT_FILENAME))
with open(f"{OUTPUT_FILENAME}.json", "w") as documents_file:
output_documents(documents_file, 0, max_documents)
22 changes: 22 additions & 0 deletions cohere_vector/_tools/parse_queries.py
@@ -0,0 +1,22 @@
import json

import numpy as np
from datasets import load_dataset

DATASET_NAME: str = f"Cohere/miracl-en-queries-22-12"
DATASET_SPLIT: str = "train"
OUTPUT_FILENAME: str = "queries.json"


def output_queries(queries_file):
    queries = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
    for query in queries:
        # Normalize each query vector to unit length to match the indexed documents.
        v = np.array(query["emb"])
        v_unit = v / np.linalg.norm(v)
        queries_file.write(json.dumps(v_unit.tolist()))
        queries_file.write("\n")


if __name__ == "__main__":
with open(OUTPUT_FILENAME, "w") as queries_file:
output_queries(queries_file)
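
Each line of the resulting file is a bare JSON array holding one unit-normalized 768-dimension query vector, one query per line; the 2,863 lines added to `queries.json` in this diff match that layout:

```shell
# One query vector per line
$ wc -l queries.json
2863 queries.json
```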
1 change: 1 addition & 0 deletions cohere_vector/_tools/requirements.txt
@@ -0,0 +1 @@
datasets
59 changes: 59 additions & 0 deletions cohere_vector/challenges/default.json
@@ -0,0 +1,59 @@
{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "index-documents",
      "operation": "index-documents",
      "warmup-time-period": {{ bulk_warmup | default(40) | int }},
      "clients": {{bulk_indexing_clients | default(5)}}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    },
    {
      "name": "knn-search-10-100",
      "operation": "knn-search-10-100",
      "warmup-iterations": 100,
      "iterations": 1000
    },
    {
      "name": "knn-search-100-1000",
      "operation": "knn-search-100-1000",
      "warmup-iterations": 100,
      "iterations": 1000
    }
  ]
}
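
A note on the schedule: the `wait-until-merges-finish-after-index` task polls the index-stats API until `_all.total.merges.current` reaches 0, so the two kNN search tasks that follow measure a quiesced index rather than one still merging segments in the background.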
22 changes: 22 additions & 0 deletions cohere_vector/files.txt
@@ -0,0 +1,22 @@
cohere-documents-01.json
cohere-documents-01-1k.json
cohere-documents-02.json
cohere-documents-02-1k.json
cohere-documents-03.json
cohere-documents-03-1k.json
cohere-documents-04.json
cohere-documents-04-1k.json
cohere-documents-05.json
cohere-documents-05-1k.json
cohere-documents-06.json
cohere-documents-06-1k.json
cohere-documents-07.json
cohere-documents-07-1k.json
cohere-documents-08.json
cohere-documents-08-1k.json
cohere-documents-09.json
cohere-documents-09-1k.json
cohere-documents-10.json
cohere-documents-10-1k.json
cohere-documents-11.json
cohere-documents-11-1k.json
28 changes: 28 additions & 0 deletions cohere_vector/index.json
@@ -0,0 +1,28 @@
{
  "settings": {
    "index": {
      "number_of_shards": {{number_of_shards | default(1)}},
      "number_of_replicas": {{number_of_replicas | default(0)}}
    }
  },
  "mappings": {
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "title": {
        "type": "text"
      },
      "text": {
        "type": "text"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 768,
        "index": true,
        "similarity": "dot_product"
      }
    }
  }
}
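
Note that the `dot_product` similarity requires vectors to be unit length at index time; for unit vectors the dot product equals cosine similarity while being cheaper to compute, which is why both `_tools` scripts normalize the embeddings before writing them out.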
33 changes: 33 additions & 0 deletions cohere_vector/operations/default.json
@@ -0,0 +1,33 @@
{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{index_settings | default({}) | tojson}}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "index-documents",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(500)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
},
{
  "name": "knn-search-10-100",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 10,
  "num-candidates": 100
},
{
  "name": "knn-search-100-1000",
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": 100,
  "num-candidates": 1000
}
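
The `knn-param-source` above is a custom Rally parameter source whose implementation lives in the track's `track.py`, which is not part of this diff. Assuming it issues a standard Elasticsearch kNN search with a vector drawn from `queries.json`, the request behind `knn-search-10-100` would look roughly like this sketch (the index name and vector values are placeholders):

```shell
$ curl -s -XPOST 'localhost:9200/<index-name>/_search' \
    -H 'Content-Type: application/json' -d '
{
  "knn": {
    "field": "emb",
    "query_vector": [0.0277, 0.0141, ...],
    "k": 10,
    "num_candidates": 100
  }
}'
```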
2,863 changes: 2,863 additions & 0 deletions cohere_vector/queries.json

Large diffs are not rendered by default.
