rapidsai · rapids-bot · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023 · Nov 7, 2023
diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import rmm
+
+pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**30)
+rmm.mr.set_current_device_resource(pool)  # make the pooled resource the current one
+from rmm.allocators.cupy import rmm_cupy_allocator
+import cupy as cp
+# Have CuPy allocate device memory through RMM's allocator hook.
+cp.cuda.set_allocator(rmm_cupy_allocator)
+
+import argparse
+import os
+import cupy as cp
+import numpy as np
+import math
+from timeit import default_timer as timer
+from pylibraft.neighbors.brute_force import knn
+from pylibraft.common import DeviceResources
+
+from utils import dtype_from_filename, suffix_from_dtype, memmap_bin_file, write_bin
+
+
+def generate_random_queries(n_queries, n_features, dtype=np.float32):
+    """Return a random (n_queries, n_features) CuPy array of the given dtype."""
+    print("Generating random queries")
+    if not np.issubdtype(dtype, np.integer):
+        return cp.random.uniform(size=(n_queries, n_features)).astype(dtype)
+    high = min(255, int(np.iinfo(dtype).max) + 1)  # exclusive bound, clamped for int8
+    return cp.random.randint(0, high, size=(n_queries, n_features), dtype=dtype)
+
+
+def choose_random_queries(dataset, n_queries):  # -> n_queries distinct dataset rows
+    print("Choosing random vector from dataset as query vectors")
+    rng = np.random.default_rng()  # can sample without permuting every row
+    return dataset[rng.choice(dataset.shape[0], size=n_queries, replace=False), :]
+
+
+def calc_truth(dataset, queries, k, metric="sqeuclidean", batch_size=500000):
+    """Compute the k nearest dataset rows for every query with brute-force search.
+
+    The dataset is processed in slices of at most batch_size rows. Each slice is
+    copied to the GPU as float32 and searched with knn; the candidates found so
+    far are then merged with the new ones and the k smallest distances are kept.
+
+    Returns (distances, indices) as CuPy arrays, or (None, None) if the dataset
+    has no rows.
+    """
+    handle = DeviceResources()
+    n_samples = dataset.shape[0]
+    n_steps = -(-n_samples // batch_size)  # ceiling division
+    distances = indices = None
+    queries = cp.asarray(queries, dtype=cp.float32)
+
+    for step, start in enumerate(range(0, n_samples, batch_size), 1):
+        print("Step {0}/{1}:".format(step, n_steps))
+        stop = min(start + batch_size, n_samples)
+        X = cp.asarray(dataset[start:stop, :], cp.float32)
+
+        # global_id_offset shifts the returned neighbor ids by the slice start
+        D, I = knn(
+            X, queries, k, metric=metric, handle=handle, global_id_offset=start
+        )
+        handle.sync()
+
+        D, I = cp.asarray(D), cp.asarray(I)
+        if distances is None:
+            distances, indices = D, I
+            continue
+        # Merge with the running result and keep the k best per query
+        distances = cp.concatenate([distances, D], axis=1)
+        indices = cp.concatenate([indices, I], axis=1)
+        idx = cp.argsort(distances, axis=1)[:, :k]
+        distances = cp.take_along_axis(distances, idx, axis=1)
+        indices = cp.take_along_axis(indices, idx, axis=1)
+
+    return distances, indices
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate_groundtruth",
+        description="Generate true neighbors using exact NN search. "
+        "The input and output files are in big-ann-benchmark's binary format.",
+        epilog="""Example usage
+    # With existing query file
+    python generate_groundtruth.py /dataset/base.1B.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
+
+    # With randomly generated queries
+    python generate_groundtruth.py /dataset/base.1B.fbin --output=groundtruth_dir --queries=random --n_queries=10000
+
+    # Using only a subset of the dataset. Define queries by randomly selecting vectors from the (subset of the) dataset.
+    python generate_groundtruth.py /dataset/base.1B.fbin --rows=2000000 --cols=128 --output=groundtruth_dir --queries=random-choice --n_queries=10000
+    """,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    parser.add_argument("dataset", type=str, help="input dataset file name")
+    parser.add_argument(
+        "--queries",
+        type=str,
+        default="random",
+        help="Queries file name, or one of 'random-choice' or 'random' (default). "
+        "'random-choice': select n_queries vectors from the input dataset. "
+        "'random': generate n_queries as uniform random numbers.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="/tmp/groundtruth_dir",
+        help="output directory name",
+    )
+
+    parser.add_argument(
+        "--n_queries",
+        type=int,
+        default=10000,
+        help="Number of queries to generate (if no query file is given). Default: 10000.",
+    )
+
+    parser.add_argument(
+        "-N",
+        "--rows",
+        default=0,
+        type=int,
+        help="use only first N rows from dataset, by default the whole dataset is used",
+    )
+    parser.add_argument(
+        "-D",
+        "--cols",
+        default=0,
+        type=int,
+        help="number of features (dataset columns). Must be specified if --rows is used. Default: read from dataset file.",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        help="Dataset dtype. If not given, then derived from filename extension.",
+    )
+
+    parser.add_argument(
+        "-k", type=int, default=100, help="Number of neighbors (per query) to calculate"
+    )
+    parser.add_argument(
+        "--metric",
+        type=str,
+        default="sqeuclidean",
+        help="Metric to use while calculating distances.",
+    )
+
+    args = parser.parse_args()
+    if args.rows != 0 and args.cols == 0:
+        raise RuntimeError(
+            "Number of columns has to be specified with the --cols argument"
+        )
+    # --rows=0 (the default) maps the whole file; its shape is read from the header
+    n_samples = args.rows
+    n_features = args.cols
+
+    if n_samples != 0:
+        shape = (n_samples, n_features)
+        print("Reading subset of the data, shape=", shape)
+    else:
+        print("Reading whole dataset")
+        shape = None
+
+    # Load input data
+    dataset = memmap_bin_file(args.dataset, args.dtype, shape)
+    n_samples = dataset.shape[0]
+    n_features = dataset.shape[1]
+    dtype = dataset.dtype
+
+    print(dataset.shape)
+    print(
+        "Dataset size {:6.1f} GB, dtype {}".format(
+            dataset.size * dataset.dtype.itemsize / 1e9, np.dtype(dtype)
+        )
+    )
+
+    os.makedirs(args.output, exist_ok=True)
+    # Queries are either generated here (and saved to the output dir) or read from a file
+    if args.queries == "random" or args.queries == "random-choice":
+        if args.n_queries is None:
+            raise RuntimeError("n_queries must be given to generate random queries")
+        if args.queries == "random":
+            queries = generate_random_queries(args.n_queries, n_features, dtype)
+        elif args.queries == "random-choice":
+            queries = choose_random_queries(dataset, args.n_queries)
+
+        queries_filename = os.path.join(
+            args.output, "queries" + suffix_from_dtype(dtype)
+        )
+        print("Writing queries file", queries_filename)
+        write_bin(queries_filename, queries)
+    else:
+        print("Reading queries from file", args.queries)
+        queries = memmap_bin_file(args.queries, dtype)
+
+    print("Calculating true nearest neighbors")
+    distances, indices = calc_truth(dataset, queries, args.k, args.metric)
+    # Neighbor ids are written as uint32 and distances as float32
+    write_bin(
+        os.path.join(args.output, "groundtruth.neighbors.ibin"),
+        indices.astype(np.uint32),
+    )
+    write_bin(
+        os.path.join(args.output, "groundtruth.distances.fbin"),
+        distances.astype(np.float32),
+    )
diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py
@@ -0,0 +1,128 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import cupy as cp
+import time
+import os
+
+
+def dtype_from_filename(filename):
+    """Return the NumPy scalar type implied by the extension of filename."""
+    ext = os.path.splitext(filename)[1]
+    dtypes = {
+        ".fbin": np.float32,
+        ".ibin": np.int32,
+        ".u8bin": np.ubyte,
+        ".i8bin": np.byte,
+    }
+    if ext not in dtypes:
+        raise RuntimeError("Not supported file extension " + ext)
+    return dtypes[ext]
+
+
+def suffix_from_dtype(dtype):
+    """Return the binary-file suffix (e.g. ".fbin") used for arrays of dtype."""
+    for known, suffix in (
+        (np.float32, ".fbin"),
+        (np.int32, ".ibin"),
+        (np.ubyte, ".u8bin"),
+        (np.byte, ".i8bin"),
+    ):
+        if dtype == known:
+            return suffix
+    raise RuntimeError("Not supported dtype extension {}".format(dtype))
+
+
+def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32):
+    """Memory-map the array stored in a binary file behind a two-value shape header.
+
+    The header holds the 2-D shape as two size_dtype values; the array data
+    follows immediately. dtype=None derives the dtype from the file extension.
+    Read modes ("r...") take shape from the header unless one is given; write
+    modes ("w...") require shape, create the file and return a writable map.
+
+    Returns None if bin_file is None; raises ValueError for unsupported modes.
+    """
+    if bin_file is None:
+        return None
+    if dtype is None:
+        dtype = dtype_from_filename(bin_file)
+    offset = 2 * np.dtype(size_dtype).itemsize  # payload starts after the header
+
+    if mode[0] == "r":
+        if shape is None:
+            header = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
+            shape = (int(header[0]), int(header[1]))
+            print("Read shape from file", shape)
+        return np.memmap(bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape)
+    if mode[0] != "w":
+        raise ValueError("Unsupported mode {!r}".format(mode))
+    if shape is None:
+        raise ValueError("Need to specify shape to map file in write mode")
+    print("creating file", bin_file)
+    os.makedirs(os.path.dirname(bin_file) or ".", exist_ok=True)
+    # np.memmap rejects a plain "w" mode: write the header directly, then map "r+"
+    np.asarray([shape[0], shape[1]], dtype=size_dtype).tofile(bin_file)
+    return np.memmap(bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape)
+
+
+def write_bin(fname, data):  # file layout: uint32 shape header, then the raw array
+    print("writing", fname, data.shape, data.dtype, "...")
+    with open(fname, "wb") as out:
+        for chunk in (np.asarray(data.shape, dtype=np.uint32), data):
+            chunk.tofile(out)
+
+
+def calc_recall(ann_idx, true_nn_idx):
+    """Return recall: per-row id overlap with true_nn_idx, summed, over ann_idx.size."""
+    found = cp.asnumpy(ann_idx)
+    if found.shape != true_nn_idx.shape:
+        raise RuntimeError(
+            "Incompatible shapes {} vs {}".format(found.shape, true_nn_idx.shape)
+        )
+    hits = sum(
+        np.intersect1d(found[r, :], true_nn_idx[r, :]).size
+        for r in range(found.shape[0])
+    )
+    return hits / found.size
+
+
+class BenchmarkTimer:
+    """Provides a generator that runs a code block `reps` times (after `warmup`
+    unrecorded runs) and records results to the instance variable `timings`. Use like:
+    .. code-block:: python
+        timer = BenchmarkTimer(reps=5)
+        for _ in timer.benchmark_runs():
+            ... do something ...
+        print(np.min(timer.timings))
+
+        This class is part of the rapids/cuml benchmark suite
+    """
+
+    def __init__(self, reps=1, warmup=0):
+        self.warmup = warmup  # leading runs that are executed but not recorded
+        self.reps = reps  # number of recorded runs
+        self.timings = []  # elapsed seconds of each recorded run
+
+    def benchmark_runs(self):
+        """Yield the run index for warmup + reps runs, timing each loop body."""
+        for r in range(self.reps + self.warmup):
+            t0 = time.time()
+            yield r
+            t1 = time.time()
+            if r >= self.warmup:  # warmup runs are timed but deliberately dropped
+                self.timings.append(t1 - t0)