From b6e8f5fb0cde7c347d545eef7fed274b3189acc1 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 13:13:02 +0100 Subject: [PATCH 1/9] Add ANN bench scripts to generate ground truth --- .../scripts/generate_groundtruth.py | 194 ++++++++++++++++++ .../src/raft-ann-bench/scripts/utils.py | 129 ++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py create mode 100644 python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py new file mode 100644 index 0000000000..f6857258e4 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + +import rmm + +pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**30) +rmm.mr.set_current_device_resource(pool) +from rmm.allocators.cupy import rmm_cupy_allocator +import cupy as cp + +cp.cuda.set_allocator(rmm_cupy_allocator) + +import argparse +import os +import cupy as cp +import numpy as np +import math +from timeit import default_timer as timer +from pylibraft.neighbors.brute_force import knn +from pylibraft.common import DeviceResources + +from utils import dtype_from_filename, suffix_from_dtype, memmap_bin_file, write_bin + + +def generate_random_queries(n_queries, n_features, dtype=np.float32): + print("Generating random queries") + if np.issubdtype(dtype, np.integer): + queries = cp.random.randint(0, 255, size=(n_queries, n_features), dtype=dtype) + else: + queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) + return queries + + +def choose_random_queries(dataset, n_queries): + print("Choosing random vector from dataset as query vectors") + query_idx = np.random.choice(dataset.shape[0], size=(n_queries,), 
replace=False) + return dataset[query_idx, :] + + +def calc_truth(dataset, queries, k, metric="sqeuclidean"): + handle = DeviceResources() + n_samples = dataset.shape[0] + n = 500000 # batch size for processing neighbors + i = 0 + indices = None + distances = None + queries = cp.asarray(queries, dtype=cp.float32) + + while i < n_samples: + print("Step {0}/{1}:".format(i // n, n_samples // n)) + n_batch = n if i + n <= n_samples else n_samples - i + + X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) + + D, I = knn( + X, + queries, + k, + metric=metric, + handle=handle, + global_id_offset=i, # shift neighbor index by offset i + ) + handle.sync() + + D, I = cp.asarray(D), cp.asarray(I) + if distances is None: + distances = D + indices = I + else: + distances = cp.concatenate([distances, D], axis=1) + indices = cp.concatenate([indices, I], axis=1) + idx = cp.argsort(distances, axis=1)[:, :k] + distances = cp.take_along_axis(distances, idx, axis=1) + indices = cp.take_along_axis(indices, idx, axis=1) + + i += n_batch + + return distances, indices + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate_groundtruth", + description="Generate true neighbors using exact NN search. " + "The input and output files are in big-ann-benchmark's binary format.", + ) + + parser.add_argument("--dataset", type=str, help="input dataset file name") + parser.add_argument( + "--queries", + type=str, + default="random-choice", + help="Queries file name, or one of 'random-choice' or 'random'. " + "'random-choice': select n_queries vectors from the input dataset. 
" + "'random': generate n_queries as uniform random numbers", + ) + parser.add_argument("--output", type=str, help="output directory name") + + parser.add_argument( + "--n_queries", + type=int, + default=None, + ) + + parser.add_argument( + "-N", + "--rows", + default=0, + type=int, + help="use only first N rows from dataset, by default the whole dataset is used", + ) + parser.add_argument( + "-D", + "--cols", + default=0, + type=int, + help="number of features (dataset columns)", + ) + parser.add_argument( + "--dtype", type=str, help="Dataset dtype. If not given, then derived from " + ) + parser.add_argument("--input_type", type=str, default="float32") + parser.add_argument("--output_type", type=str, default="float32") + parser.add_argument( + "-k", type=int, default=100, help="Number of neighbors (pre query) to calculate" + ) + parser.add_argument( + "--metric", + type=str, + default="sqeuclidean", + help="Metric to use while calculating distances.", + ) + + args = parser.parse_args() + if args.rows != 0 and args.cols == 0: + raise RuntimeError( + "Number of columns has to be specified with the --cols argument" + ) + + n_samples = args.rows + n_features = args.cols + + if n_samples != 0: + shape = (n_samples, n_features) + print("Reading subset of the data, shape=", shape) + else: + print("Reading whole dataset") + shape = None + + # Load input data + dataset = memmap_bin_file(args.dataset, args.dtype, shape) + n_samples = dataset.shape[0] + n_features = dataset.shape[1] + dtype = dataset.dtype + + print(dataset.shape) + print( + "Dataset size {:6.1f} GB, dtype {}".format( + dataset.size * dataset.dtype.itemsize / 1e9, np.dtype(dtype) + ) + ) + + os.makedirs(args.output, exist_ok=True) + + if args.queries == "random" or args.queries == "random-choice": + if args.n_queries is None: + raise RuntimeError("n_queries must be given to generate random queries") + if args.queries == "random": + queries = generate_random_queries(args.n_queries, n_features, dtype) + elif 
args.queries == "random-choice": + queries = choose_random_queries(dataset, args.n_queries) + + queries_filename = os.path.join( + args.output, "queries" + suffix_from_dtype(dtype) + ) + print("Writing queries file", queries_filename) + write_bin(queries_filename, queries) + else: + print("Reading queries from file", args.queries) + queries = memmap_bin_file(args.queries, dtype) + + print("Calculating true nearest neighbors") + distances, indices = calc_truth(dataset, queries, args.k, args.metric) + + write_bin( + os.path.join(args.output, "groundtruth.neighbors.ibin"), + indices.astype(np.uint32), + ) + write_bin( + os.path.join(args.output, "groundtruth.distances.fbin"), + distances.astype(np.float32), + ) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py new file mode 100644 index 0000000000..e5a9057db0 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py @@ -0,0 +1,129 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# import cupy as cp + +import numpy as np +import cupy as cp +import time +import os + + +def dtype_from_filename(filename): + ext = os.path.splitext(filename)[1] + if ext == ".fbin": + return np.float32 + elif ext == ".ibin": + return np.int32 + elif ext == ".u8bin": + return np.ubyte + elif ext == ".i8bin": + return np.byte + else: + raise RuntimeError("Not supported file extension" + ext) + + +def suffix_from_dtype(dtype): + if dtype == np.float32: + return ".fbin" + elif dtype == np.int32: + return ".ibin" + elif dtype == np.ubyte: + return ".u8bin" + elif dtype == np.byte: + return ".i8bin" + else: + raise RuntimeError("Not supported dtype extension" + dtype) + + +def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32): + extent_itemsize = np.dtype(size_dtype).itemsize + offset = int(extent_itemsize) * 2 + if bin_file is None: + return None + if dtype is None: + dtype = dtype_from_filename(bin_file) + + if mode[0] == "r": + a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) + if shape is None: + shape = (a[0], a[1]) + print("Read shape from file", shape) + return np.memmap(bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape) + elif mode[0] == "w": + if shape is None: + raise ValueError("Need to specify shape to map file in write mode") + + print("creating file", bin_file) + dirname = os.path.dirname(bin_file) + if len(dirname) > 0: + os.makedirs(dirname, exist_ok=True) + a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) + a[0] = shape[0] + a[1] = shape[1] + a.flush() + del a + fp = np.memmap(bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape) + return fp + + # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype)) + + +def write_bin(fname, data): + print("writing", fname, data.shape, data.dtype, "...") + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +def calc_recall(ann_idx, true_nn_idx): + # ann_idx = 
np.asarray(ann_idx) + ann_idx = cp.asnumpy(ann_idx) + if ann_idx.shape != true_nn_idx.shape: + raise RuntimeError( + "Incompatible shapes {} vs {}".format(ann_idx.shape, true_nn_idx.shape) + ) + n = 0 + for i in range(ann_idx.shape[0]): + n += np.intersect1d(ann_idx[i, :], true_nn_idx[i, :]).size + recall = n / ann_idx.size + return recall + + +class BenchmarkTimer: + """Provides a context manager that runs a code block `reps` times + and records results to the instance variable `timings`. Use like: + .. code-block:: python + timer = BenchmarkTimer(rep=5) + for _ in timer.benchmark_runs(): + ... do something ... + print(np.min(timer.timings)) + + This class is part of the rapids/cuml benchmark suite + """ + + def __init__(self, reps=1, warmup=0): + self.warmup = warmup + self.reps = reps + self.timings = [] + + def benchmark_runs(self): + for r in range(self.reps + self.warmup): + t0 = time.time() + yield r + t1 = time.time() + self.timings.append(t1 - t0) + if r >= self.warmup: + self.timings.append(t1 - t0) From 5528997499f6489639f39577ef15680b00b6ba36 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 13:43:30 +0100 Subject: [PATCH 2/9] Improve help string --- .../scripts/generate_groundtruth.py | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py index f6857258e4..2f9ab99c31 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py @@ -85,23 +85,40 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): prog="generate_groundtruth", description="Generate true neighbors using exact NN search. 
" "The input and output files are in big-ann-benchmark's binary format.", + epilog="""Example usage + # With existing query file + python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + + # With randomly generated queries + python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=random --n_queries=10000 + + # Using only a subset of the dataset. Define queries by randomly selecting vectors from the (subset of the) dataset. + python generate_groundtruth.py --dataset /dataset/base.1B.fbin --nrows=2000000 --cols=128 --output=groundtruth_dir --queries=random-choice --n_queries=10000 + """, + formatter_class=argparse.RawDescriptionHelpFormatter, ) - parser.add_argument("--dataset", type=str, help="input dataset file name") + parser.add_argument("dataset", type=str, help="input dataset file name") parser.add_argument( "--queries", type=str, - default="random-choice", - help="Queries file name, or one of 'random-choice' or 'random'. " + default="random", + help="Queries file name, or one of 'random-choice' or 'random' (default). " "'random-choice': select n_queries vectors from the input dataset. " - "'random': generate n_queries as uniform random numbers", + "'random': generate n_queries as uniform random numbers.", + ) + parser.add_argument( + "--output", + type=str, + default="/tmp/groundtruth_dir", + help="output directory name", ) - parser.add_argument("--output", type=str, help="output directory name") parser.add_argument( "--n_queries", type=int, - default=None, + default=10000, + help="Number of quries to generate (if no query file is given). Default: 10000.", ) parser.add_argument( @@ -116,15 +133,16 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): "--cols", default=0, type=int, - help="number of features (dataset columns)", + help="number of features (dataset columns). Must be specified if --rows is used. 
Default: read from dataset file.", ) parser.add_argument( - "--dtype", type=str, help="Dataset dtype. If not given, then derived from " + "--dtype", + type=str, + help="Dataset dtype. If not given, then derived from filename extension.", ) - parser.add_argument("--input_type", type=str, default="float32") - parser.add_argument("--output_type", type=str, default="float32") + parser.add_argument( - "-k", type=int, default=100, help="Number of neighbors (pre query) to calculate" + "-k", type=int, default=100, help="Number of neighbors (per query) to calculate" ) parser.add_argument( "--metric", From 1d9bbb3482e2484399c5bcf3cd8f26110a9c5145 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 13:47:30 +0100 Subject: [PATCH 3/9] Fix license --- .../scripts/generate_groundtruth.py | 19 +++++++++++++++---- .../src/raft-ann-bench/scripts/utils.py | 1 - 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py index 2f9ab99c31..062660ddeb 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py @@ -1,8 +1,19 @@ #!/usr/bin/env python -# coding: utf-8 - -# In[1]: - +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# import rmm pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**30) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py index e5a9057db0..bd05e84e05 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# import cupy as cp import numpy as np import cupy as cp From 1b73fc7f3112c05dc2a0e3a5222f4f5d4dd07944 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 15:14:38 +0100 Subject: [PATCH 4/9] Fix style --- .../scripts/generate_groundtruth.py | 86 +++++++++++-------- .../src/raft-ann-bench/scripts/utils.py | 23 +++-- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py index 062660ddeb..2b1e612b82 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py @@ -14,31 +14,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -import rmm - -pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**30) -rmm.mr.set_current_device_resource(pool) -from rmm.allocators.cupy import rmm_cupy_allocator -import cupy as cp - -cp.cuda.set_allocator(rmm_cupy_allocator) - import argparse import os + import cupy as cp import numpy as np -import math -from timeit import default_timer as timer -from pylibraft.neighbors.brute_force import knn +import rmm from pylibraft.common import DeviceResources - -from utils import dtype_from_filename, suffix_from_dtype, memmap_bin_file, write_bin +from pylibraft.neighbors.brute_force import knn +from rmm.allocators.cupy import rmm_cupy_allocator +from utils import memmap_bin_file, suffix_from_dtype, write_bin def generate_random_queries(n_queries, n_features, dtype=np.float32): print("Generating random queries") if np.issubdtype(dtype, np.integer): - queries = cp.random.randint(0, 255, size=(n_queries, n_features), dtype=dtype) + queries = cp.random.randint( + 0, 255, size=(n_queries, n_features), dtype=dtype + ) else: queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) return queries @@ -46,7 +39,9 @@ def generate_random_queries(n_queries, n_features, dtype=np.float32): def choose_random_queries(dataset, n_queries): print("Choosing random vector from dataset as query vectors") - query_idx = np.random.choice(dataset.shape[0], size=(n_queries,), replace=False) + query_idx = np.random.choice( + dataset.shape[0], size=(n_queries,), replace=False + ) return dataset[query_idx, :] @@ -65,7 +60,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) - D, I = knn( + D, Ind = knn( X, queries, k, @@ -75,13 +70,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): ) handle.sync() - D, I = cp.asarray(D), cp.asarray(I) + D, Ind = cp.asarray(D), cp.asarray(Ind) if distances is None: distances = D - indices = I + indices = Ind else: distances = 
cp.concatenate([distances, D], axis=1) - indices = cp.concatenate([indices, I], axis=1) + indices = cp.concatenate([indices, Ind], axis=1) idx = cp.argsort(distances, axis=1)[:, :k] distances = cp.take_along_axis(distances, idx, axis=1) indices = cp.take_along_axis(indices, idx, axis=1) @@ -92,19 +87,30 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): if __name__ == "__main__": + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 + ) + rmm.mr.set_current_device_resource(pool) + cp.cuda.set_allocator(rmm_cupy_allocator) + parser = argparse.ArgumentParser( prog="generate_groundtruth", description="Generate true neighbors using exact NN search. " "The input and output files are in big-ann-benchmark's binary format.", epilog="""Example usage # With existing query file - python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ + --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin # With randomly generated queries - python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=random --n_queries=10000 - - # Using only a subset of the dataset. Define queries by randomly selecting vectors from the (subset of the) dataset. - python generate_groundtruth.py --dataset /dataset/base.1B.fbin --nrows=2000000 --cols=128 --output=groundtruth_dir --queries=random-choice --n_queries=10000 + python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ + --output=groundtruth_dir --queries=random --n_queries=10000 + + # Using only a subset of the dataset. Define queries by randomly + # selecting vectors from the (subset of the) dataset. 
+ python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ + --nrows=2000000 --cols=128 --output=groundtruth_dir \ + --queries=random-choice --n_queries=10000 """, formatter_class=argparse.RawDescriptionHelpFormatter, ) @@ -114,9 +120,9 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): "--queries", type=str, default="random", - help="Queries file name, or one of 'random-choice' or 'random' (default). " - "'random-choice': select n_queries vectors from the input dataset. " - "'random': generate n_queries as uniform random numbers.", + help="Queries file name, or one of 'random-choice' or 'random' " + "(default). 'random-choice': select n_queries vectors from the input " + "dataset. 'random': generate n_queries as uniform random numbers.", ) parser.add_argument( "--output", @@ -129,7 +135,8 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): "--n_queries", type=int, default=10000, - help="Number of quries to generate (if no query file is given). Default: 10000.", + help="Number of quries to generate (if no query file is given). " + "Default: 10000.", ) parser.add_argument( @@ -137,23 +144,28 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): "--rows", default=0, type=int, - help="use only first N rows from dataset, by default the whole dataset is used", + help="use only first N rows from dataset, by default the whole " + "dataset is used", ) parser.add_argument( "-D", "--cols", default=0, type=int, - help="number of features (dataset columns). Must be specified if --rows is used. Default: read from dataset file.", + help="number of features (dataset columns). Must be specified if " + "--rows is used. Default: read from dataset file.", ) parser.add_argument( "--dtype", type=str, - help="Dataset dtype. If not given, then derived from filename extension.", + help="Dataset dtype. 
When not specified, then derived from extension.", ) parser.add_argument( - "-k", type=int, default=100, help="Number of neighbors (per query) to calculate" + "-k", + type=int, + default=100, + help="Number of neighbors (per query) to calculate", ) parser.add_argument( "--metric", @@ -195,9 +207,13 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): if args.queries == "random" or args.queries == "random-choice": if args.n_queries is None: - raise RuntimeError("n_queries must be given to generate random queries") + raise RuntimeError( + "n_queries must be given to generate random queries" + ) if args.queries == "random": - queries = generate_random_queries(args.n_queries, n_features, dtype) + queries = generate_random_queries( + args.n_queries, n_features, dtype + ) elif args.queries == "random-choice": queries = choose_random_queries(dataset, args.n_queries) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py index bd05e84e05..a27a8ec1ab 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py @@ -14,10 +14,11 @@ # limitations under the License. 
# -import numpy as np -import cupy as cp -import time import os +import time + +import cupy as cp +import numpy as np def dtype_from_filename(filename): @@ -47,7 +48,9 @@ def suffix_from_dtype(dtype): raise RuntimeError("Not supported dtype extension" + dtype) -def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32): +def memmap_bin_file( + bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 +): extent_itemsize = np.dtype(size_dtype).itemsize offset = int(extent_itemsize) * 2 if bin_file is None: @@ -60,7 +63,9 @@ def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32) if shape is None: shape = (a[0], a[1]) print("Read shape from file", shape) - return np.memmap(bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape) + return np.memmap( + bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape + ) elif mode[0] == "w": if shape is None: raise ValueError("Need to specify shape to map file in write mode") @@ -74,7 +79,9 @@ def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32) a[1] = shape[1] a.flush() del a - fp = np.memmap(bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape) + fp = np.memmap( + bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape + ) return fp # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype)) @@ -92,7 +99,9 @@ def calc_recall(ann_idx, true_nn_idx): ann_idx = cp.asnumpy(ann_idx) if ann_idx.shape != true_nn_idx.shape: raise RuntimeError( - "Incompatible shapes {} vs {}".format(ann_idx.shape, true_nn_idx.shape) + "Incompatible shapes {} vs {}".format( + ann_idx.shape, true_nn_idx.shape + ) ) n = 0 for i in range(ann_idx.shape[0]): From 22d57b4bfb47665bfc0a7b075aac796c648788ae Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 15:17:54 +0100 Subject: [PATCH 5/9] Fix style --- .../src/raft-ann-bench/scripts/generate_groundtruth.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py index 2b1e612b82..8d249d4a98 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py +++ b/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py @@ -100,17 +100,17 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): epilog="""Example usage # With existing query file python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ - --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin +--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin # With randomly generated queries python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ - --output=groundtruth_dir --queries=random --n_queries=10000 +--output=groundtruth_dir --queries=random --n_queries=10000 # Using only a subset of the dataset. Define queries by randomly # selecting vectors from the (subset of the) dataset. 
python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ - --nrows=2000000 --cols=128 --output=groundtruth_dir \ - --queries=random-choice --n_queries=10000 +--nrows=2000000 --cols=128 --output=groundtruth_dir \ +--queries=random-choice --n_queries=10000 """, formatter_class=argparse.RawDescriptionHelpFormatter, ) From cdb63d8e7cf1bd28edf1cde6b44724469d2a5496 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 17:42:02 +0100 Subject: [PATCH 6/9] Move files to generate_groundtruth module --- .../__main__.py} | 58 ++++++++----------- .../utils.py | 26 ++++----- 2 files changed, 36 insertions(+), 48 deletions(-) rename python/raft-ann-bench/src/raft-ann-bench/{scripts/generate_groundtruth.py => generate_groundtruth/__main__.py} (83%) rename python/raft-ann-bench/src/raft-ann-bench/{scripts => generate_groundtruth}/utils.py (86%) diff --git a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py similarity index 83% rename from python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py rename to python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 8d249d4a98..987a0cee11 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/generate_groundtruth.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -23,15 +23,13 @@ from pylibraft.common import DeviceResources from pylibraft.neighbors.brute_force import knn from rmm.allocators.cupy import rmm_cupy_allocator -from utils import memmap_bin_file, suffix_from_dtype, write_bin +from .utils import memmap_bin_file, suffix_from_dtype, write_bin def generate_random_queries(n_queries, n_features, dtype=np.float32): print("Generating random queries") if np.issubdtype(dtype, np.integer): - queries = cp.random.randint( - 0, 255, size=(n_queries, n_features), dtype=dtype - ) + queries = cp.random.randint(0, 255, size=(n_queries, 
n_features), dtype=dtype) else: queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) return queries @@ -39,9 +37,7 @@ def generate_random_queries(n_queries, n_features, dtype=np.float32): def choose_random_queries(dataset, n_queries): print("Choosing random vector from dataset as query vectors") - query_idx = np.random.choice( - dataset.shape[0], size=(n_queries,), replace=False - ) + query_idx = np.random.choice(dataset.shape[0], size=(n_queries,), replace=False) return dataset[query_idx, :] @@ -86,7 +82,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): return distances, indices -if __name__ == "__main__": +def main(): pool = rmm.mr.PoolMemoryResource( rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 ) @@ -127,8 +123,8 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): parser.add_argument( "--output", type=str, - default="/tmp/groundtruth_dir", - help="output directory name", + default="", + help="output directory name (default current dir)", ) parser.add_argument( @@ -142,7 +138,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): parser.add_argument( "-N", "--rows", - default=0, + default=None, type=int, help="use only first N rows from dataset, by default the whole " "dataset is used", @@ -150,7 +146,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): parser.add_argument( "-D", "--cols", - default=0, + default=None, type=int, help="number of features (dataset columns). Must be specified if " "--rows is used. Default: read from dataset file.", @@ -158,7 +154,8 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): parser.add_argument( "--dtype", type=str, - help="Dataset dtype. When not specified, then derived from extension.", + help="Dataset dtype. When not specified, then derived from extension." 
+ " Supported types: 'float32', 'float16', 'uint8', 'int8'", ) parser.add_argument( @@ -175,45 +172,32 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): ) args = parser.parse_args() - if args.rows != 0 and args.cols == 0: - raise RuntimeError( - "Number of columns has to be specified with the --cols argument" - ) - n_samples = args.rows - n_features = args.cols - - if n_samples != 0: - shape = (n_samples, n_features) - print("Reading subset of the data, shape=", shape) + if args.rows is not None: + print("Reading subset of the data, nrows=", args.rows) else: print("Reading whole dataset") - shape = None # Load input data - dataset = memmap_bin_file(args.dataset, args.dtype, shape) + dataset = memmap_bin_file(args.dataset, args.dtype, shape=(args.rows, args.cols)) n_samples = dataset.shape[0] n_features = dataset.shape[1] dtype = dataset.dtype - print(dataset.shape) print( - "Dataset size {:6.1f} GB, dtype {}".format( - dataset.size * dataset.dtype.itemsize / 1e9, np.dtype(dtype) + "Dataset size {:6.1f} GB, shape {}, dtype {}".format( + dataset.size * dataset.dtype.itemsize / 1e9, dataset.shape, np.dtype(dtype) ) ) - os.makedirs(args.output, exist_ok=True) + if len(args.output) > 0: + os.makedirs(args.output, exist_ok=True) if args.queries == "random" or args.queries == "random-choice": if args.n_queries is None: - raise RuntimeError( - "n_queries must be given to generate random queries" - ) + raise RuntimeError("n_queries must be given to generate random queries") if args.queries == "random": - queries = generate_random_queries( - args.n_queries, n_features, dtype - ) + queries = generate_random_queries(args.n_queries, n_features, dtype) elif args.queries == "random-choice": queries = choose_random_queries(dataset, args.n_queries) @@ -237,3 +221,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): os.path.join(args.output, "groundtruth.distances.fbin"), distances.astype(np.float32), ) + + +if __name__ == "__main__": + main() diff --git 
a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py similarity index 86% rename from python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py rename to python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py index a27a8ec1ab..55d81cd658 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py @@ -25,6 +25,8 @@ def dtype_from_filename(filename): ext = os.path.splitext(filename)[1] if ext == ".fbin": return np.float32 + if ext == ".hbin": + return np.float16 elif ext == ".ibin": return np.int32 elif ext == ".u8bin": @@ -38,6 +40,8 @@ def dtype_from_filename(filename): def suffix_from_dtype(dtype): if dtype == np.float32: return ".fbin" + if dtype == np.float16: + return ".hbin" elif dtype == np.int32: return ".ibin" elif dtype == np.ubyte: @@ -48,9 +52,7 @@ def suffix_from_dtype(dtype): raise RuntimeError("Not supported dtype extension" + dtype) -def memmap_bin_file( - bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 -): +def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32): extent_itemsize = np.dtype(size_dtype).itemsize offset = int(extent_itemsize) * 2 if bin_file is None: @@ -62,10 +64,12 @@ def memmap_bin_file( a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) if shape is None: shape = (a[0], a[1]) - print("Read shape from file", shape) - return np.memmap( - bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape - ) + else: + shape = tuple( + [aval if sval is None else sval for aval, sval in zip(a, shape)] + ) + + return np.memmap(bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape) elif mode[0] == "w": if shape is None: raise ValueError("Need to specify shape to map file in write mode") @@ -79,9 +83,7 @@ def memmap_bin_file( a[1] = shape[1] a.flush() del a - fp = np.memmap( - bin_file, mode="r+", dtype=dtype, 
offset=offset, shape=shape - ) + fp = np.memmap(bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape) return fp # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype)) @@ -99,9 +101,7 @@ def calc_recall(ann_idx, true_nn_idx): ann_idx = cp.asnumpy(ann_idx) if ann_idx.shape != true_nn_idx.shape: raise RuntimeError( - "Incompatible shapes {} vs {}".format( - ann_idx.shape, true_nn_idx.shape - ) + "Incompatible shapes {} vs {}".format(ann_idx.shape, true_nn_idx.shape) ) n = 0 for i in range(ann_idx.shape[0]): From 64afc2e8e4005f04ba577079e650e457d6deee69 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 17:49:14 +0100 Subject: [PATCH 7/9] Update documentation --- docs/source/ann_benchmarks_dataset.md | 18 ++++++++++- .../generate_groundtruth/__main__.py | 32 +++++++++++++------ .../generate_groundtruth/utils.py | 21 +++++++++--- 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md index 99a6bfbd3a..b688232f6c 100644 --- a/docs/source/ann_benchmarks_dataset.md +++ b/docs/source/ann_benchmarks_dataset.md @@ -44,4 +44,20 @@ Commonly used datasets can be downloaded from two websites: # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced popd ``` - Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. \ No newline at end of file + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. 
To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. + +## Generate ground truth + +If you have a dataset, but no corresponding ground truth file, then you can generate ground truth using the `generate_groundtruth` utility. Example usage: + +```bash +# With existing query file +python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + +# With randomly generated queries +python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=random --n_queries=10000 + +# Using only a subset of the dataset. Define queries by randomly +# selecting vectors from the (subset of the) dataset. +python generate_groundtruth.py --dataset /dataset/base.1B.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000 +``` \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 987a0cee11..19ab4ba945 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -23,13 +23,16 @@ from pylibraft.common import DeviceResources from pylibraft.neighbors.brute_force import knn from rmm.allocators.cupy import rmm_cupy_allocator + from .utils import memmap_bin_file, suffix_from_dtype, write_bin def generate_random_queries(n_queries, n_features, dtype=np.float32): print("Generating random queries") if np.issubdtype(dtype, np.integer): - queries = cp.random.randint(0, 255, size=(n_queries, n_features), dtype=dtype) + queries = cp.random.randint( + 0, 255, size=(n_queries, n_features), dtype=dtype + ) else: queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) return queries @@ -37,7 +40,9 @@ def generate_random_queries(n_queries, 
n_features, dtype=np.float32): def choose_random_queries(dataset, n_queries): print("Choosing random vector from dataset as query vectors") - query_idx = np.random.choice(dataset.shape[0], size=(n_queries,), replace=False) + query_idx = np.random.choice( + dataset.shape[0], size=(n_queries,), replace=False + ) return dataset[query_idx, :] @@ -95,16 +100,16 @@ def main(): "The input and output files are in big-ann-benchmark's binary format.", epilog="""Example usage # With existing query file - python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ + python -m generate_groundtruth --dataset /dataset/base.1B.fbin \ --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin # With randomly generated queries - python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ + python -m generate_groundtruth --dataset /dataset/base.1B.fbin \ --output=groundtruth_dir --queries=random --n_queries=10000 # Using only a subset of the dataset. Define queries by randomly # selecting vectors from the (subset of the) dataset. 
- python generate_groundtruth.py --dataset /dataset/base.1B.fbin \ + python -m generate_groundtruth --dataset /dataset/base.1B.fbin \ --nrows=2000000 --cols=128 --output=groundtruth_dir \ --queries=random-choice --n_queries=10000 """, @@ -179,14 +184,17 @@ def main(): print("Reading whole dataset") # Load input data - dataset = memmap_bin_file(args.dataset, args.dtype, shape=(args.rows, args.cols)) - n_samples = dataset.shape[0] + dataset = memmap_bin_file( + args.dataset, args.dtype, shape=(args.rows, args.cols) + ) n_features = dataset.shape[1] dtype = dataset.dtype print( "Dataset size {:6.1f} GB, shape {}, dtype {}".format( - dataset.size * dataset.dtype.itemsize / 1e9, dataset.shape, np.dtype(dtype) + dataset.size * dataset.dtype.itemsize / 1e9, + dataset.shape, + np.dtype(dtype), ) ) @@ -195,9 +203,13 @@ def main(): if args.queries == "random" or args.queries == "random-choice": if args.n_queries is None: - raise RuntimeError("n_queries must be given to generate random queries") + raise RuntimeError( + "n_queries must be given to generate random queries" + ) if args.queries == "random": - queries = generate_random_queries(args.n_queries, n_features, dtype) + queries = generate_random_queries( + args.n_queries, n_features, dtype + ) elif args.queries == "random-choice": queries = choose_random_queries(dataset, args.n_queries) diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py index 55d81cd658..590b922b58 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py @@ -52,7 +52,9 @@ def suffix_from_dtype(dtype): raise RuntimeError("Not supported dtype extension" + dtype) -def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32): +def memmap_bin_file( + bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 +): extent_itemsize = 
np.dtype(size_dtype).itemsize offset = int(extent_itemsize) * 2 if bin_file is None: @@ -66,10 +68,15 @@ def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32) shape = (a[0], a[1]) else: shape = tuple( - [aval if sval is None else sval for aval, sval in zip(a, shape)] + [ + aval if sval is None else sval + for aval, sval in zip(a, shape) + ] ) - return np.memmap(bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape) + return np.memmap( + bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape + ) elif mode[0] == "w": if shape is None: raise ValueError("Need to specify shape to map file in write mode") @@ -83,7 +90,9 @@ def memmap_bin_file(bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32) a[1] = shape[1] a.flush() del a - fp = np.memmap(bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape) + fp = np.memmap( + bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape + ) return fp # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype)) @@ -101,7 +110,9 @@ def calc_recall(ann_idx, true_nn_idx): ann_idx = cp.asnumpy(ann_idx) if ann_idx.shape != true_nn_idx.shape: raise RuntimeError( - "Incompatible shapes {} vs {}".format(ann_idx.shape, true_nn_idx.shape) + "Incompatible shapes {} vs {}".format( + ann_idx.shape, true_nn_idx.shape + ) ) n = 0 for i in range(ann_idx.shape[0]): From e9924e40b51cd79feaf1f4341afdb3cf1d88f798 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 18:00:23 +0100 Subject: [PATCH 8/9] Update parameter help --- .../src/raft-ann-bench/generate_groundtruth/__main__.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 19ab4ba945..7c81f38103 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ 
b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -153,8 +153,8 @@ def main(): "--cols", default=None, type=int, - help="number of features (dataset columns). Must be specified if " - "--rows is used. Default: read from dataset file.", + help="number of features (dataset columns). " + "Default: read from dataset file.", ) parser.add_argument( "--dtype", @@ -173,7 +173,9 @@ def main(): "--metric", type=str, default="sqeuclidean", - help="Metric to use while calculating distances.", + help="Metric to use while calculating distances. Valid metrics are " + "those that are accepted by pylibraft.neighbors.brute_force.knn. Most" + " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'", ) args = parser.parse_args() From 9439818715d823014f51e93712ad97a357493231 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 7 Nov 2023 18:08:25 +0100 Subject: [PATCH 9/9] Remove unused code, fix doc --- docs/source/ann_benchmarks_dataset.md | 6 +-- .../generate_groundtruth/__main__.py | 12 ++--- .../generate_groundtruth/utils.py | 45 ------------------- 3 files changed, 9 insertions(+), 54 deletions(-) diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md index b688232f6c..821345b07c 100644 --- a/docs/source/ann_benchmarks_dataset.md +++ b/docs/source/ann_benchmarks_dataset.md @@ -52,12 +52,12 @@ If you have a dataset, but no corresponding ground truth file, then you can gene ```bash # With existing query file -python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin +python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin # With randomly generated queries -python generate_groundtruth.py --dataset /dataset/base.1B.fbin --output=groundtruth_dir --queries=random --n_queries=10000 +python -m raft-ann-bench.generate_groundtruth --dataset 
/dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000 # Using only a subset of the dataset. Define queries by randomly # selecting vectors from the (subset of the) dataset. -python generate_groundtruth.py --dataset /dataset/base.1B.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000 +python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000 ``` \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py index 7c81f38103..77a930f81e 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -100,17 +100,17 @@ def main(): "The input and output files are in big-ann-benchmark's binary format.", epilog="""Example usage # With existing query file - python -m generate_groundtruth --dataset /dataset/base.1B.fbin \ ---output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\ +fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin # With randomly generated queries - python -m generate_groundtruth --dataset /dataset/base.1B.fbin \ ---output=groundtruth_dir --queries=random --n_queries=10000 + python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\ +fbin --output=groundtruth_dir --queries=random --n_queries=10000 # Using only a subset of the dataset. Define queries by randomly # selecting vectors from the (subset of the) dataset. 
- python -m generate_groundtruth --dataset /dataset/base.1B.fbin \ ---nrows=2000000 --cols=128 --output=groundtruth_dir \ + python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\ +fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \ --queries=random-choice --n_queries=10000 """, formatter_class=argparse.RawDescriptionHelpFormatter, diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py index 590b922b58..3f2dd11a16 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py @@ -15,9 +15,7 @@ # import os -import time -import cupy as cp import numpy as np @@ -103,46 +101,3 @@ def write_bin(fname, data): with open(fname, "wb") as f: np.asarray(data.shape, dtype=np.uint32).tofile(f) data.tofile(f) - - -def calc_recall(ann_idx, true_nn_idx): - # ann_idx = np.asarray(ann_idx) - ann_idx = cp.asnumpy(ann_idx) - if ann_idx.shape != true_nn_idx.shape: - raise RuntimeError( - "Incompatible shapes {} vs {}".format( - ann_idx.shape, true_nn_idx.shape - ) - ) - n = 0 - for i in range(ann_idx.shape[0]): - n += np.intersect1d(ann_idx[i, :], true_nn_idx[i, :]).size - recall = n / ann_idx.size - return recall - - -class BenchmarkTimer: - """Provides a context manager that runs a code block `reps` times - and records results to the instance variable `timings`. Use like: - .. code-block:: python - timer = BenchmarkTimer(rep=5) - for _ in timer.benchmark_runs(): - ... do something ... 
- print(np.min(timer.timings)) - - This class is part of the rapids/cuml benchmark suite - """ - - def __init__(self, reps=1, warmup=0): - self.warmup = warmup - self.reps = reps - self.timings = [] - - def benchmark_runs(self): - for r in range(self.reps + self.warmup): - t0 = time.time() - yield r - t1 = time.time() - self.timings.append(t1 - t0) - if r >= self.warmup: - self.timings.append(t1 - t0)