Add ANN bench scripts to generate ground truth (rapidsai#1967)
Add a command-line tool that reads a dataset and generates ground truth files with exact neighbors using RAFT brute-force knn.

Authors:
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: rapidsai#1967
tfeher authored and benfred committed Nov 8, 2023
1 parent be3292a commit 66a6af8
Showing 3 changed files with 361 additions and 1 deletion.
18 changes: 17 additions & 1 deletion docs/source/ann_benchmarks_dataset.md
@@ -44,4 +44,20 @@ Commonly used datasets can be downloaded from two websites:
# two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
popd
```
Besides ground truth files for the whole billion-scale datasets, this site also provides ground truth files for the first 10M or 100M vectors of the base sets. This means we can use these billion-scale datasets as million-scale datasets. To facilitate this, an optional parameter `subset_size` for dataset can be used. See the next step for further explanation.

## Generate ground truth
If you have a dataset but no corresponding ground truth file, then you can generate ground truth using the `generate_groundtruth` utility. Example usage:
```bash
# With existing query file
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
# With randomly generated queries
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000
# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --rows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000
```
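The generated files use the same binary layout as the rest of the big-ann-benchmarks data: two `uint32` values holding the number of rows and columns, followed by the array data in row-major order (this is the layout written by `write_bin` below). As a minimal sketch, the produced files can be loaded back with NumPy like this; `read_bin` is an illustrative helper, not part of the package:

```python
import numpy as np

def read_bin(fname, dtype):
    # header: number of rows and columns as two uint32 values
    rows, cols = np.fromfile(fname, dtype=np.uint32, count=2)
    # data follows the 8-byte header in row-major order
    return np.fromfile(fname, dtype=dtype, offset=8).reshape(rows, cols)

neighbors = read_bin("groundtruth_dir/groundtruth.neighbors.ibin", np.uint32)
distances = read_bin("groundtruth_dir/groundtruth.distances.fbin", np.float32)
```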
241 changes: 241 additions & 0 deletions python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
@@ -0,0 +1,241 @@
#!/usr/bin/env python
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os

import cupy as cp
import numpy as np
import rmm
from pylibraft.common import DeviceResources
from pylibraft.neighbors.brute_force import knn
from rmm.allocators.cupy import rmm_cupy_allocator

from .utils import memmap_bin_file, suffix_from_dtype, write_bin


def generate_random_queries(n_queries, n_features, dtype=np.float32):
    print("Generating random queries")
    if np.issubdtype(dtype, np.integer):
        queries = cp.random.randint(
            0, 255, size=(n_queries, n_features), dtype=dtype
        )
    else:
        queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype)
    return queries


def choose_random_queries(dataset, n_queries):
    print("Choosing random vectors from the dataset as query vectors")
    query_idx = np.random.choice(
        dataset.shape[0], size=(n_queries,), replace=False
    )
    return dataset[query_idx, :]


def calc_truth(dataset, queries, k, metric="sqeuclidean"):
    handle = DeviceResources()
    n_samples = dataset.shape[0]
    n = 500000  # batch size for processing neighbors
    i = 0
    indices = None
    distances = None
    queries = cp.asarray(queries, dtype=cp.float32)

    while i < n_samples:
        print("Step {0}/{1}:".format(i // n, n_samples // n))
        n_batch = n if i + n <= n_samples else n_samples - i

        # Copy the current batch of the dataset to device memory
        X = cp.asarray(dataset[i : i + n_batch, :], cp.float32)

        D, Ind = knn(
            X,
            queries,
            k,
            metric=metric,
            handle=handle,
            global_id_offset=i,  # shift neighbor index by offset i
        )
        handle.sync()

        D, Ind = cp.asarray(D), cp.asarray(Ind)
        if distances is None:
            distances = D
            indices = Ind
        else:
            # Merge this batch with the running results and keep the
            # k smallest distances per query.
            distances = cp.concatenate([distances, D], axis=1)
            indices = cp.concatenate([indices, Ind], axis=1)
            idx = cp.argsort(distances, axis=1)[:, :k]
            distances = cp.take_along_axis(distances, idx, axis=1)
            indices = cp.take_along_axis(indices, idx, axis=1)

        i += n_batch

    return distances, indices


def main():
    pool = rmm.mr.PoolMemoryResource(
        rmm.mr.CudaMemoryResource(), initial_pool_size=2**30
    )
    rmm.mr.set_current_device_resource(pool)
    cp.cuda.set_allocator(rmm_cupy_allocator)

    parser = argparse.ArgumentParser(
        prog="generate_groundtruth",
        description="Generate true neighbors using exact NN search. "
        "The input and output files are in big-ann-benchmark's binary format.",
        epilog="""Example usage
# With existing query file
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

# With randomly generated queries
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
fbin --output=groundtruth_dir --queries=random --n_queries=10000

# Using only a subset of the dataset. Define queries by randomly
# selecting vectors from the (subset of the) dataset.
python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
fbin --rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

parser.add_argument("dataset", type=str, help="input dataset file name")
    parser.add_argument(
        "--queries",
        type=str,
        default="random",
        help="Queries file name, or one of 'random-choice' or 'random' "
        "(default). 'random-choice': select n_queries vectors from the input "
        "dataset. 'random': generate n_queries as uniform random numbers.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="",
        help="output directory name (default current dir)",
    )

    parser.add_argument(
        "--n_queries",
        type=int,
        default=10000,
        help="Number of queries to generate (if no query file is given). "
        "Default: 10000.",
    )

    parser.add_argument(
        "-N",
        "--rows",
        default=None,
        type=int,
        help="use only first N rows from dataset, by default the whole "
        "dataset is used",
    )
    parser.add_argument(
        "-D",
        "--cols",
        default=None,
        type=int,
        help="number of features (dataset columns). "
        "Default: read from dataset file.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        help="Dataset dtype. When not specified, then derived from extension."
        " Supported types: 'float32', 'float16', 'uint8', 'int8'",
    )

    parser.add_argument(
        "-k",
        type=int,
        default=100,
        help="Number of neighbors (per query) to calculate",
    )
    parser.add_argument(
        "--metric",
        type=str,
        default="sqeuclidean",
        help="Metric to use while calculating distances. Valid metrics are "
        "those that are accepted by pylibraft.neighbors.brute_force.knn. Most"
        " commonly used with RAFT ANN are 'sqeuclidean' and 'inner_product'",
    )

    args = parser.parse_args()

    if args.rows is not None:
        print("Reading subset of the data, nrows=", args.rows)
    else:
        print("Reading whole dataset")

    # Load input data
    dataset = memmap_bin_file(
        args.dataset, args.dtype, shape=(args.rows, args.cols)
    )
    n_features = dataset.shape[1]
    dtype = dataset.dtype

    print(
        "Dataset size {:6.1f} GB, shape {}, dtype {}".format(
            dataset.size * dataset.dtype.itemsize / 1e9,
            dataset.shape,
            np.dtype(dtype),
        )
    )

    if len(args.output) > 0:
        os.makedirs(args.output, exist_ok=True)

    if args.queries == "random" or args.queries == "random-choice":
        if args.n_queries is None:
            raise RuntimeError(
                "n_queries must be given to generate random queries"
            )
        if args.queries == "random":
            queries = generate_random_queries(
                args.n_queries, n_features, dtype
            )
        elif args.queries == "random-choice":
            queries = choose_random_queries(dataset, args.n_queries)

        queries_filename = os.path.join(
            args.output, "queries" + suffix_from_dtype(dtype)
        )
        print("Writing queries file", queries_filename)
        write_bin(queries_filename, queries)
    else:
        print("Reading queries from file", args.queries)
        queries = memmap_bin_file(args.queries, dtype)

    print("Calculating true nearest neighbors")
    distances, indices = calc_truth(dataset, queries, args.k, args.metric)

    write_bin(
        os.path.join(args.output, "groundtruth.neighbors.ibin"),
        indices.astype(np.uint32),
    )
    write_bin(
        os.path.join(args.output, "groundtruth.distances.fbin"),
        distances.astype(np.float32),
    )


if __name__ == "__main__":
    main()
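For context, the exact search that `calc_truth` batches over can also be called in a single step when the dataset fits in GPU memory. A minimal sketch mirroring the call used above (array sizes are illustrative):

```python
import cupy as cp
from pylibraft.neighbors.brute_force import knn

# Small random dataset and queries that fit in GPU memory in one batch
dataset = cp.random.uniform(size=(10000, 128)).astype(cp.float32)
queries = cp.random.uniform(size=(100, 128)).astype(cp.float32)

# Exact k-NN search; calc_truth adds batching and index offsets so the
# same call scales to datasets larger than GPU memory.
distances, indices = knn(dataset, queries, 10, metric="sqeuclidean")
distances, indices = cp.asarray(distances), cp.asarray(indices)
```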
103 changes: 103 additions & 0 deletions python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py
@@ -0,0 +1,103 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

import numpy as np


def dtype_from_filename(filename):
    ext = os.path.splitext(filename)[1]
    if ext == ".fbin":
        return np.float32
    if ext == ".hbin":
        return np.float16
    elif ext == ".ibin":
        return np.int32
    elif ext == ".u8bin":
        return np.ubyte
    elif ext == ".i8bin":
        return np.byte
    else:
        raise RuntimeError("Not supported file extension: " + ext)


def suffix_from_dtype(dtype):
    if dtype == np.float32:
        return ".fbin"
    if dtype == np.float16:
        return ".hbin"
    elif dtype == np.int32:
        return ".ibin"
    elif dtype == np.ubyte:
        return ".u8bin"
    elif dtype == np.byte:
        return ".i8bin"
    else:
        raise RuntimeError("Not supported dtype {}".format(dtype))


def memmap_bin_file(
    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
):
    extent_itemsize = np.dtype(size_dtype).itemsize
    offset = int(extent_itemsize) * 2
    if bin_file is None:
        return None
    if dtype is None:
        dtype = dtype_from_filename(bin_file)

    if mode[0] == "r":
        # The first two size_dtype values (uint32 by default) store the
        # number of rows and columns.
        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        if shape is None:
            shape = (a[0], a[1])
        else:
            shape = tuple(
                [
                    aval if sval is None else sval
                    for aval, sval in zip(a, shape)
                ]
            )

        return np.memmap(
            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
        )
    elif mode[0] == "w":
        if shape is None:
            raise ValueError("Need to specify shape to map file in write mode")

        print("creating file", bin_file)
        dirname = os.path.dirname(bin_file)
        if len(dirname) > 0:
            os.makedirs(dirname, exist_ok=True)
        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        a[0] = shape[0]
        a[1] = shape[1]
        a.flush()
        del a
        fp = np.memmap(
            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
        )
        return fp

    # print('# {}: shape: {}, dtype: {}'.format(bin_file, shape, dtype))


def write_bin(fname, data):
    print("writing", fname, data.shape, data.dtype, "...")
    with open(fname, "wb") as f:
        np.asarray(data.shape, dtype=np.uint32).tofile(f)
        data.tofile(f)

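The neighbors file written here is what the benchmark scripts compare ANN results against. For reference, a recall@k check against it might look like the following sketch, where `ann_indices` is a hypothetical `(n_queries, k)` array returned by an ANN search and `true_indices` is the loaded `groundtruth.neighbors.ibin`:

```python
import numpy as np

def recall_at_k(ann_indices, true_indices, k):
    # fraction of the true k nearest neighbors recovered, averaged over queries
    hits = 0
    for found, truth in zip(ann_indices[:, :k], true_indices[:, :k]):
        hits += len(np.intersect1d(found, truth))
    return hits / (ann_indices.shape[0] * k)
```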