diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md index 99a6bfbd3a..821345b07c 100644 --- a/docs/source/ann_benchmarks_dataset.md +++ b/docs/source/ann_benchmarks_dataset.md @@ -44,4 +44,20 @@ Commonly used datasets can be downloaded from two websites: # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced popd ``` - Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. \ No newline at end of file + Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This mean we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation. + +## Generate ground truth + +If you have a dataset, but no corresponding ground truth file, then you can generate ground trunth using the `generate_groundtruth` utility. Example usage: + +```bash +# With existing query file +python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + +# With randomly generated queries +python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000 + +# Using only a subset of the dataset. Define queries by randomly +# selecting vectors from the (subset of the) dataset. +python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000 +``` \ No newline at end of file diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py new file mode 100644 index 0000000000..77a930f81e --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import argparse +import os + +import cupy as cp +import numpy as np +import rmm +from pylibraft.common import DeviceResources +from pylibraft.neighbors.brute_force import knn +from rmm.allocators.cupy import rmm_cupy_allocator + +from .utils import memmap_bin_file, suffix_from_dtype, write_bin + + +def generate_random_queries(n_queries, n_features, dtype=np.float32): + print("Generating random queries") + if np.issubdtype(dtype, np.integer): + queries = cp.random.randint( + 0, 255, size=(n_queries, n_features), dtype=dtype + ) + else: + queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) + return queries + + +def choose_random_queries(dataset, n_queries): + print("Choosing random vector from dataset as query vectors") + query_idx = np.random.choice( + dataset.shape[0], size=(n_queries,), replace=False + ) + return dataset[query_idx, :] + + +def calc_truth(dataset, queries, k, metric="sqeuclidean"): + handle = DeviceResources() + n_samples = dataset.shape[0] + n = 500000 # batch size for processing neighbors + i = 0 + indices = None + distances = None + queries = cp.asarray(queries, dtype=cp.float32) + + while i < n_samples: + print("Step {0}/{1}:".format(i // n, n_samples // n)) + n_batch = n if i + n <= n_samples else n_samples - i + + X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) + + D, Ind = knn( + X, + queries, + k, + metric=metric, + handle=handle, + global_id_offset=i, # shift neighbor index by offset i + ) + handle.sync() + + D, Ind = cp.asarray(D), cp.asarray(Ind) + if distances is None: + distances = D + indices = Ind + else: + distances = cp.concatenate([distances, D], axis=1) + indices = cp.concatenate([indices, Ind], axis=1) + idx = cp.argsort(distances, axis=1)[:, :k] + distances = cp.take_along_axis(distances, idx, axis=1) + indices = cp.take_along_axis(indices, idx, axis=1) + + i += n_batch + + return distances, indices + + +def main(): + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 + ) + rmm.mr.set_current_device_resource(pool) + cp.cuda.set_allocator(rmm_cupy_allocator) + + parser = argparse.ArgumentParser( + prog="generate_groundtruth", + description="Generate true neighbors using exact NN search. " + "The input and output files are in big-ann-benchmark's binary format.", + epilog="""Example usage + # With existing query file + python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\ +fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + + # With randomly generated queries + python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\ +fbin --output=groundtruth_dir --queries=random --n_queries=10000 + + # Using only a subset of the dataset. Define queries by randomly + # selecting vectors from the (subset of the) dataset. + python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\ +fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \ +--queries=random-choice --n_queries=10000 + """, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument("dataset", type=str, help="input dataset file name") + parser.add_argument( + "--queries", + type=str, + default="random", + help="Queries file name, or one of 'random-choice' or 'random' " + "(default). 'random-choice': select n_queries vectors from the input " + "dataset. 'random': generate n_queries as uniform random numbers.", + ) + parser.add_argument( + "--output", + type=str, + default="", + help="output directory name (default current dir)", + ) + + parser.add_argument( + "--n_queries", + type=int, + default=10000, + help="Number of quries to generate (if no query file is given). " + "Default: 10000.", + ) + + parser.add_argument( + "-N", + "--rows", + default=None, + type=int, + help="use only first N rows from dataset, by default the whole " + "dataset is used", + ) + parser.add_argument( + "-D", + "--cols", + default=None, + type=int, + help="number of features (dataset columns). " + "Default: read from dataset file.", + ) + parser.add_argument( + "--dtype", + type=str, + help="Dataset dtype. When not specified, then derived from extension." + " Supported types: 'float32', 'float16', 'uint8', 'int8'", + ) + + parser.add_argument( + "-k", + type=int, + default=100, + help="Number of neighbors (per query) to calculate", + ) + parser.add_argument( + "--metric", + type=str, + default="sqeuclidean", + help="Metric to use while calculating distances. Valid metrics are " + "those that are accepted by pylibraft.neighbors.brute_force.knn. Most" + " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'", + ) + + args = parser.parse_args() + + if args.rows is not None: + print("Reading subset of the data, nrows=", args.rows) + else: + print("Reading whole dataset") + + # Load input data + dataset = memmap_bin_file( + args.dataset, args.dtype, shape=(args.rows, args.cols) + ) + n_features = dataset.shape[1] + dtype = dataset.dtype + + print( + "Dataset size {:6.1f} GB, shape {}, dtype {}".format( + dataset.size * dataset.dtype.itemsize / 1e9, + dataset.shape, + np.dtype(dtype), + ) + ) + + if len(args.output) > 0: + os.makedirs(args.output, exist_ok=True) + + if args.queries == "random" or args.queries == "random-choice": + if args.n_queries is None: + raise RuntimeError( + "n_queries must be given to generate random queries" + ) + if args.queries == "random": + queries = generate_random_queries( + args.n_queries, n_features, dtype + ) + elif args.queries == "random-choice": + queries = choose_random_queries(dataset, args.n_queries) + + queries_filename = os.path.join( + args.output, "queries" + suffix_from_dtype(dtype) + ) + print("Writing queries file", queries_filename) + write_bin(queries_filename, queries) + else: + print("Reading queries from file", args.queries) + queries = memmap_bin_file(args.queries, dtype) + + print("Calculating true nearest neighbors") + distances, indices = calc_truth(dataset, queries, args.k, args.metric) + + write_bin( + os.path.join(args.output, "groundtruth.neighbors.ibin"), + indices.astype(np.uint32), + ) + write_bin( + os.path.join(args.output, "groundtruth.distances.fbin"), + distances.astype(np.float32), + ) + + +if __name__ == "__main__": + main() diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py new file mode 100644 index 0000000000..3f2dd11a16 --- /dev/null +++ b/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py @@ -0,0 +1,103 @@ +# +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os + +import numpy as np + + +def dtype_from_filename(filename): + ext = os.path.splitext(filename)[1] + if ext == ".fbin": + return np.float32 + if ext == ".hbin": + return np.float16 + elif ext == ".ibin": + return np.int32 + elif ext == ".u8bin": + return np.ubyte + elif ext == ".i8bin": + return np.byte + else: + raise RuntimeError("Not supported file extension" + ext) + + +def suffix_from_dtype(dtype): + if dtype == np.float32: + return ".fbin" + if dtype == np.float16: + return ".hbin" + elif dtype == np.int32: + return ".ibin" + elif dtype == np.ubyte: + return ".u8bin" + elif dtype == np.byte: + return ".i8bin" + else: + raise RuntimeError("Not supported dtype extension" + dtype) + + +def memmap_bin_file( + bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 +): + extent_itemsize = np.dtype(size_dtype).itemsize + offset = int(extent_itemsize) * 2 + if bin_file is None: + return None + if dtype is None: + dtype = dtype_from_filename(bin_file) + + if mode[0] == "r": + a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) + if shape is None: + shape = (a[0], a[1]) + else: + shape = tuple( + [ + aval if sval is None else sval + for aval, sval in zip(a, shape) + ] + ) + + return np.memmap( + bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape + ) + elif mode[0] == "w": + if shape is None: + raise ValueError("Need to specify shape to map file in write mode") + + print("creating file", bin_file) + dirname = os.path.dirname(bin_file) + if len(dirname) > 0: + os.makedirs(dirname, exist_ok=True) + a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) + a[0] = shape[0] + a[1] = shape[1] + a.flush() + del a + fp = np.memmap( + bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape + ) + return fp + + # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype)) + + +def write_bin(fname, data): + print("writing", fname, data.shape, data.dtype, "...") + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f)