Add ANN bench scripts to generate ground truth (rapidsai#1967)
Add a command-line tool that reads a dataset and generates ground truth files with exact neighbors using RAFT brute-force knn.

Authors:
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: rapidsai#1967
tfeher authored and benfred committed Nov 8, 2023
1 parent be3292a commit 66a6af8
Showing 3 changed files with 361 additions and 1 deletion.
18 changes: 17 additions & 1 deletion docs/source/ann_benchmarks_dataset.md
@@ -44,4 +44,20 @@ Commonly used datasets can be downloaded from two websites:
# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
popd
```
Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This means we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation.

## Generate ground truth
If you have a dataset but no corresponding ground truth file, then you can generate ground truth using the `generate_groundtruth` utility. Example usage:
```bash
# With existing query file
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
# With randomly generated queries
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000
# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --rows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000
```
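The generated files use the same binary layout as the rest of the big-ann-benchmarks data: two `uint32` values holding the number of rows and columns, followed by the array data in row-major order (this is the layout written by `write_bin` below). As a minimal sketch, the produced files can be loaded back with NumPy like this; `read_bin` is an illustrative helper, not part of the package:

```python
import numpy as np

def read_bin(fname, dtype):
    # header: number of rows and columns as two uint32 values
    rows, cols = np.fromfile(fname, dtype=np.uint32, count=2)
    # data follows the 8-byte header in row-major order
    return np.fromfile(fname, dtype=dtype, offset=8).reshape(rows, cols)

neighbors = read_bin("groundtruth_dir/groundtruth.neighbors.ibin", np.uint32)
distances = read_bin("groundtruth_dir/groundtruth.distances.fbin", np.float32)
```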
241 changes: 241 additions & 0 deletions python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
@@ -0,0 +1,241 @@
#!/usr/bin/env python
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os

import cupy as cp
import numpy as np
import rmm
from pylibraft.common import DeviceResources
from pylibraft.neighbors.brute_force import knn
from rmm.allocators.cupy import rmm_cupy_allocator

from .utils import memmap_bin_file, suffix_from_dtype, write_bin


def generate_random_queries(n_queries, n_features, dtype=np.float32):
    print("Generating random queries")
    if np.issubdtype(dtype, np.integer):
        queries = cp.random.randint(
            0, 255, size=(n_queries, n_features), dtype=dtype
        )
    else:
        queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype)
    return queries


def choose_random_queries(dataset, n_queries):
    print("Choosing random vectors from the dataset as query vectors")
    query_idx = np.random.choice(
        dataset.shape[0], size=(n_queries,), replace=False
    )
    return dataset[query_idx, :]


def calc_truth(dataset, queries, k, metric="sqeuclidean"):
    handle = DeviceResources()
    n_samples = dataset.shape[0]
    n = 500000  # batch size for processing neighbors
    i = 0
    indices = None
    distances = None
    queries = cp.asarray(queries, dtype=cp.float32)

    while i < n_samples:
        print("Step {0}/{1}:".format(i // n, n_samples // n))
        n_batch = n if i + n <= n_samples else n_samples - i

        # Copy the current batch of the dataset to device memory
        X = cp.asarray(dataset[i : i + n_batch, :], cp.float32)

        D, Ind = knn(
            X,
            queries,
            k,
            metric=metric,
            handle=handle,
            global_id_offset=i,  # shift neighbor index by offset i
        )
        handle.sync()

        D, Ind = cp.asarray(D), cp.asarray(Ind)
        if distances is None:
            distances = D
            indices = Ind
        else:
            # Merge this batch with the running results and keep the
            # k smallest distances per query.
            distances = cp.concatenate([distances, D], axis=1)
            indices = cp.concatenate([indices, Ind], axis=1)
            idx = cp.argsort(distances, axis=1)[:, :k]
            distances = cp.take_along_axis(distances, idx, axis=1)
            indices = cp.take_along_axis(indices, idx, axis=1)

        i += n_batch

    return distances, indices


def main():
    pool = rmm.mr.PoolMemoryResource(
        rmm.mr.CudaMemoryResource(), initial_pool_size=2**30
    )
    rmm.mr.set_current_device_resource(pool)
    cp.cuda.set_allocator(rmm_cupy_allocator)

    parser = argparse.ArgumentParser(
        prog="generate_groundtruth",
        description="Generate true neighbors using exact NN search. "
        "The input and output files are in big-ann-benchmark's binary format.",
        epilog="""Example usage
# With existing query file
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

# With randomly generated queries
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=random --n_queries=10000

# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
fbin --rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

parser.add_argument("dataset", type=str, help="input dataset file name")
    parser.add_argument(
        "--queries",
        type=str,
        default="random",
        help="Queries file name, or one of 'random-choice' or 'random' "
        "(default). 'random-choice': select n_queries vectors from the input "
        "dataset. 'random': generate n_queries as uniform random numbers.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="output directory name (default current dir)",
    )

    parser.add_argument(
        "--n_queries",
        type=int,
        default=10000,
        help="Number of queries to generate (if no query file is given). "
        "Default: 10000.",
    )

    parser.add_argument(
        "-N",
        "--rows",
        default=None,
        type=int,
        help="use only first N rows from dataset, by default the whole "
        "dataset is used",
    )
    parser.add_argument(
        "-D",
        "--cols",
        default=None,
        type=int,
        help="number of features (dataset columns). "
        "Default: read from dataset file.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        help="Dataset dtype. When not specified, then derived from extension."
        " Supported types: 'float32', 'float16', 'uint8', 'int8'",
    )

    parser.add_argument(
        "-k",
        type=int,
        default=100,
        help="Number of neighbors (per query) to calculate",
    )
    parser.add_argument(
        "--metric",
        type=str,
        default="sqeuclidean",
        help="Metric to use while calculating distances. Valid metrics are "
        "those that are accepted by pylibraft.neighbors.brute_force.knn. Most"
        " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'",
    )

    args = parser.parse_args()

    if args.rows is not None:
        print("Reading subset of the data, nrows=", args.rows)
    else:
        print("Reading whole dataset")

    # Load input data
    dataset = memmap_bin_file(
        args.dataset, args.dtype, shape=(args.rows, args.cols)
    )
    n_features = dataset.shape[1]
    dtype = dataset.dtype

    print(
        "Dataset size {:6.1f} GB, shape {}, dtype {}".format(
            dataset.size * dataset.dtype.itemsize / 1e9,
            dataset.shape,
            np.dtype(dtype),
        )
    )

    if len(args.output) > 0:
        os.makedirs(args.output, exist_ok=True)

    if args.queries == "random" or args.queries == "random-choice":
        if args.n_queries is None:
            raise RuntimeError(
                "n_queries must be given to generate random queries"
            )
        if args.queries == "random":
            queries = generate_random_queries(
                args.n_queries, n_features, dtype
            )
        elif args.queries == "random-choice":
            queries = choose_random_queries(dataset, args.n_queries)

        queries_filename = os.path.join(
            args.output, "queries" + suffix_from_dtype(dtype)
        )
        print("Writing queries file", queries_filename)
        write_bin(queries_filename, queries)
    else:
        print("Reading queries from file", args.queries)
        queries = memmap_bin_file(args.queries, dtype)

    print("Calculating true nearest neighbors")
    distances, indices = calc_truth(dataset, queries, args.k, args.metric)

    write_bin(
        os.path.join(args.output, "groundtruth.neighbors.ibin"),
        indices.astype(np.uint32),
    )
    write_bin(
        os.path.join(args.output, "groundtruth.distances.fbin"),
        distances.astype(np.float32),
    )


if __name__ == "__main__":
    main()
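For context, the exact search that `calc_truth` batches over can also be called in a single step when the dataset fits in GPU memory. A minimal sketch mirroring the call used above (array sizes are illustrative):

```python
import cupy as cp
from pylibraft.neighbors.brute_force import knn

# Small random dataset and queries that fit in GPU memory in one batch
dataset = cp.random.uniform(size=(10000, 128)).astype(cp.float32)
queries = cp.random.uniform(size=(100, 128)).astype(cp.float32)

# Exact k-NN search; calc_truth adds batching and index offsets so the
# same call scales to datasets larger than GPU memory.
distances, indices = knn(dataset, queries, 10, metric="sqeuclidean")
distances, indices = cp.asarray(distances), cp.asarray(indices)
```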
103 changes: 103 additions & 0 deletions python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py
@@ -0,0 +1,103 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import numpy as np


def dtype_from_filename(filename):
    ext = os.path.splitext(filename)[1]
    if ext == ".fbin":
        return np.float32
    if ext == ".hbin":
        return np.float16
    elif ext == ".ibin":
        return np.int32
    elif ext == ".u8bin":
        return np.ubyte
    elif ext == ".i8bin":
        return np.byte
    else:
        raise RuntimeError("Not supported file extension: " + ext)


def suffix_from_dtype(dtype):
    if dtype == np.float32:
        return ".fbin"
    if dtype == np.float16:
        return ".hbin"
    elif dtype == np.int32:
        return ".ibin"
    elif dtype == np.ubyte:
        return ".u8bin"
    elif dtype == np.byte:
        return ".i8bin"
    else:
        raise RuntimeError("Not supported dtype {}".format(dtype))


def memmap_bin_file(
    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
):
    extent_itemsize = np.dtype(size_dtype).itemsize
    offset = int(extent_itemsize) * 2
    if bin_file is None:
        return None
    if dtype is None:
        dtype = dtype_from_filename(bin_file)

    if mode[0] == "r":
        # The first two size_dtype values (uint32 by default) store the
        # number of rows and columns.
        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        if shape is None:
            shape = (a[0], a[1])
        else:
            shape = tuple(
                [
                    aval if sval is None else sval
                    for aval, sval in zip(a, shape)
                ]
            )

        return np.memmap(
            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
        )
    elif mode[0] == "w":
        if shape is None:
            raise ValueError("Need to specify shape to map file in write mode")

        print("creating file", bin_file)
        dirname = os.path.dirname(bin_file)
        if len(dirname) > 0:
            os.makedirs(dirname, exist_ok=True)
        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        a[0] = shape[0]
        a[1] = shape[1]
        a.flush()
        del a
        fp = np.memmap(
            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
        )
        return fp

    # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype))


def write_bin(fname, data):
    print("writing", fname, data.shape, data.dtype, "...")
    with open(fname, "wb") as f:
        np.asarray(data.shape, dtype=np.uint32).tofile(f)
        data.tofile(f)

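The neighbors file written here is what the benchmark scripts compare ANN results against. For reference, a recall@k check against it might look like the following sketch, where `ann_indices` is a hypothetical `(n_queries, k)` array returned by an ANN search and `true_indices` is the loaded `groundtruth.neighbors.ibin`:

```python
import numpy as np

def recall_at_k(ann_indices, true_indices, k):
    # fraction of the true k nearest neighbors recovered, averaged over queries
    hits = 0
    for found, truth in zip(ann_indices[:, :k], true_indices[:, :k]):
        hits += len(np.intersect1d(found, truth))
    return hits / (ann_indices.shape[0] * k)
```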