Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ANN bench scripts to generate ground truth #1967

Merged
merged 9 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
#!/usr/bin/env python
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os

import cupy as cp
import numpy as np
import rmm
from pylibraft.common import DeviceResources
from pylibraft.neighbors.brute_force import knn
from rmm.allocators.cupy import rmm_cupy_allocator
from utils import memmap_bin_file, suffix_from_dtype, write_bin


def generate_random_queries(n_queries, n_features, dtype=np.float32):
    """Generate a random query matrix on the GPU.

    Parameters
    ----------
    n_queries : int
        Number of query vectors (rows) to generate.
    n_features : int
        Number of features (columns) of each query vector.
    dtype : numpy dtype, optional
        Element type of the returned array. Integer dtypes are filled with
        uniformly distributed non-negative integers, every other dtype with
        uniform random numbers converted to `dtype`.

    Returns
    -------
    cupy.ndarray
        Array of shape (n_queries, n_features) and the requested dtype.
    """
    print("Generating random queries")
    shape = (n_queries, n_features)
    if np.issubdtype(dtype, np.integer):
        # The upper bound of randint is exclusive and high - 1 has to be
        # representable in dtype. A fixed bound of 255 does not fit into
        # int8, so clamp it to the dtype's range. Wider types keep the
        # original [0, 255) range.
        high = min(255, int(np.iinfo(dtype).max) + 1)
        return cp.random.randint(0, high, size=shape, dtype=dtype)
    return cp.random.uniform(size=shape).astype(dtype)


def choose_random_queries(dataset, n_queries):
    """Pick `n_queries` distinct rows of `dataset` to serve as queries.

    Parameters
    ----------
    dataset : array-like, two dimensional
        Source of the query vectors; indexed as ``dataset[rows, :]``.
    n_queries : int
        Number of rows to draw. Rows are drawn without replacement.

    Returns
    -------
    The selected rows, in the (random) order in which they were drawn.
    """
    print("Choosing random vector from dataset as query vectors")
    n_rows = dataset.shape[0]
    picked_rows = np.random.choice(n_rows, size=(n_queries,), replace=False)
    selection = dataset[picked_rows, :]
    return selection


def calc_truth(dataset, queries, k, metric="sqeuclidean"):
    """Compute the k nearest neighbors of every query by brute force.

    The dataset is processed in batches of 500000 rows. Each batch is
    copied to the GPU as float32 and searched with `knn`; the per-batch
    results are merged into a running top-k list.

    Parameters
    ----------
    dataset : array-like, two dimensional
        Dataset vectors, sliced row-wise batch by batch.
    queries : array-like
        Query vectors; converted to a float32 cupy array.
    k : int
        Number of neighbors to keep per query.
    metric : str, optional
        Metric name forwarded to `knn`.

    Returns
    -------
    (distances, indices)
        The merged neighbor distances and dataset row indices, or
        ``(None, None)`` if the dataset has no rows.
    """
    resources = DeviceResources()
    total_rows = dataset.shape[0]
    batch_rows = 500000  # batch size for processing neighbors
    best_dist = None
    best_idx = None
    queries = cp.asarray(queries, dtype=cp.float32)

    for start in range(0, total_rows, batch_rows):
        print(
            "Step {0}/{1}:".format(
                start // batch_rows, total_rows // batch_rows
            )
        )
        # The last batch may be shorter than batch_rows.
        stop = min(start + batch_rows, total_rows)
        chunk = cp.asarray(dataset[start:stop, :], cp.float32)

        batch_dist, batch_idx = knn(
            chunk,
            queries,
            k,
            metric=metric,
            handle=resources,
            global_id_offset=start,  # shift neighbor index by batch start
        )
        resources.sync()

        batch_dist = cp.asarray(batch_dist)
        batch_idx = cp.asarray(batch_idx)
        if best_dist is None:
            # First batch: nothing to merge with yet.
            best_dist, best_idx = batch_dist, batch_idx
            continue

        # Merge the running result with this batch and keep the k entries
        # with the smallest distance values per query.
        # NOTE(review): the ascending sort assumes smaller values are
        # better - confirm for similarity metrics such as inner product.
        merged_dist = cp.concatenate([best_dist, batch_dist], axis=1)
        merged_idx = cp.concatenate([best_idx, batch_idx], axis=1)
        order = cp.argsort(merged_dist, axis=1)[:, :k]
        best_dist = cp.take_along_axis(merged_dist, order, axis=1)
        best_idx = cp.take_along_axis(merged_idx, order, axis=1)

    return best_dist, best_idx


if __name__ == "__main__":
    # Route all device allocations (including cupy's) through an RMM pool
    # so that the repeated per-batch allocations are served from the pool.
    pool = rmm.mr.PoolMemoryResource(
        rmm.mr.CudaMemoryResource(), initial_pool_size=2**30
    )
    rmm.mr.set_current_device_resource(pool)
    cp.cuda.set_allocator(rmm_cupy_allocator)

    # Backslash-newline inside the epilog joins the two source lines, so
    # each example command is printed on a single line.
    parser = argparse.ArgumentParser(
        prog="generate_groundtruth",
        description="Generate true neighbors using exact NN search. "
        "The input and output files are in big-ann-benchmark's binary format.",
        epilog="""Example usage
    # With existing query file
    python generate_groundtruth.py /dataset/base.1B.fbin \
--output=groundtruth_dir --queries=/dataset/query.public.10K.fbin

    # With randomly generated queries
    python generate_groundtruth.py /dataset/base.1B.fbin \
--output=groundtruth_dir --queries=random --n_queries=10000

    # Using only a subset of the dataset. Define queries by randomly
    # selecting vectors from the (subset of the) dataset.
    python generate_groundtruth.py /dataset/base.1B.fbin \
--rows=2000000 --cols=128 --output=groundtruth_dir \
--queries=random-choice --n_queries=10000
    """,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument("dataset", type=str, help="input dataset file name")
    parser.add_argument(
        "--queries",
        type=str,
        default="random",
        help="Queries file name, or one of 'random-choice' or 'random' "
        "(default). 'random-choice': select n_queries vectors from the input "
        "dataset. 'random': generate n_queries as uniform random numbers.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="/tmp/groundtruth_dir",
        help="output directory name",
    )

    parser.add_argument(
        "--n_queries",
        type=int,
        default=10000,
        help="Number of queries to generate (if no query file is given). "
        "Default: 10000.",
    )

    parser.add_argument(
        "-N",
        "--rows",
        default=0,
        type=int,
        help="use only first N rows from dataset, by default the whole "
        "dataset is used",
    )
    parser.add_argument(
        "-D",
        "--cols",
        default=0,
        type=int,
        help="number of features (dataset columns). Must be specified if "
        "--rows is used. Default: read from dataset file.",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        help="Dataset dtype. When not specified, then derived from extension.",
    )

    parser.add_argument(
        "-k",
        type=int,
        default=100,
        help="Number of neighbors (per query) to calculate",
    )
    parser.add_argument(
        "--metric",
        type=str,
        default="sqeuclidean",
        help="Metric to use while calculating distances.",
    )

    args = parser.parse_args()
    # A row subset overrides the shape stored in the file header, so the
    # column count cannot be read from the file and must be given.
    if args.rows != 0 and args.cols == 0:
        raise RuntimeError(
            "Number of columns has to be specified with the --cols argument"
        )

    n_samples = args.rows
    n_features = args.cols

    if n_samples != 0:
        shape = (n_samples, n_features)
        print("Reading subset of the data, shape=", shape)
    else:
        print("Reading whole dataset")
        shape = None

    # Load input data
    dataset = memmap_bin_file(args.dataset, args.dtype, shape)
    n_samples = dataset.shape[0]
    n_features = dataset.shape[1]
    dtype = dataset.dtype

    print(dataset.shape)
    print(
        "Dataset size {:6.1f} GB, dtype {}".format(
            dataset.size * dataset.dtype.itemsize / 1e9, np.dtype(dtype)
        )
    )

    os.makedirs(args.output, exist_ok=True)

    if args.queries == "random" or args.queries == "random-choice":
        if args.n_queries is None:
            raise RuntimeError(
                "n_queries must be given to generate random queries"
            )
        if args.queries == "random":
            queries = generate_random_queries(
                args.n_queries, n_features, dtype
            )
        elif args.queries == "random-choice":
            queries = choose_random_queries(dataset, args.n_queries)

        # Store the generated queries next to the ground truth so that the
        # results can be reproduced with the same query set.
        queries_filename = os.path.join(
            args.output, "queries" + suffix_from_dtype(dtype)
        )
        print("Writing queries file", queries_filename)
        write_bin(queries_filename, queries)
    else:
        print("Reading queries from file", args.queries)
        queries = memmap_bin_file(args.queries, dtype)

    print("Calculating true nearest neighbors")
    distances, indices = calc_truth(dataset, queries, args.k, args.metric)

    write_bin(
        os.path.join(args.output, "groundtruth.neighbors.ibin"),
        indices.astype(np.uint32),
    )
    write_bin(
        os.path.join(args.output, "groundtruth.distances.fbin"),
        distances.astype(np.float32),
    )
137 changes: 137 additions & 0 deletions python/raft-ann-bench/src/raft-ann-bench/scripts/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import time

import cupy as cp
import numpy as np


def dtype_from_filename(filename):
    """Return the numpy element type implied by a file name's extension.

    Supported extensions are ``.fbin`` (float32), ``.ibin`` (int32),
    ``.u8bin`` (unsigned byte) and ``.i8bin`` (signed byte).

    Raises
    ------
    RuntimeError
        If the extension is not one of the supported ones.
    """
    known_types = {
        ".fbin": np.float32,
        ".ibin": np.int32,
        ".u8bin": np.ubyte,
        ".i8bin": np.byte,
    }
    _, extension = os.path.splitext(filename)
    element_type = known_types.get(extension)
    if element_type is None:
        raise RuntimeError("Not supported file extension" + extension)
    return element_type


def suffix_from_dtype(dtype):
    """Return the binary file extension that corresponds to `dtype`.

    Parameters
    ----------
    dtype : numpy dtype, scalar type, or dtype string
        Element type to look up. Supported are float32 (``.fbin``),
        int32 (``.ibin``), unsigned byte (``.u8bin``) and signed byte
        (``.i8bin``).

    Returns
    -------
    str
        The file extension, including the leading dot.

    Raises
    ------
    RuntimeError
        If there is no file extension defined for `dtype`.
    """
    suffixes = {
        np.dtype(np.float32): ".fbin",
        np.dtype(np.int32): ".ibin",
        np.dtype(np.ubyte): ".u8bin",
        np.dtype(np.byte): ".i8bin",
    }
    try:
        # Normalize so that np.float32, np.dtype("float32") and "float32"
        # are all treated the same way.
        key = np.dtype(dtype)
    except TypeError:
        key = None
    suffix = suffixes.get(key)
    if suffix is None:
        # Format the dtype explicitly: concatenating it to a str would
        # raise TypeError instead of the intended RuntimeError.
        raise RuntimeError("Not supported dtype extension {}".format(dtype))
    return suffix


def memmap_bin_file(
    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
):
    """Memory-map a binary file that has a two-element shape header.

    The file starts with two values of type `size_dtype` (number of rows
    and number of columns), followed by the array data.

    Parameters
    ----------
    bin_file : str or None
        File name. If None, None is returned.
    dtype : numpy dtype or None
        Element type of the data. If None, it is derived from the file
        extension with `dtype_from_filename`.
    shape : tuple of two ints, optional
        Shape of the mapped array. In read mode it defaults to the shape
        stored in the file header; passing a shape allows mapping only
        the first rows of the file. Required in write mode.
    mode : str, optional
        A mode starting with "r" maps an existing file; a mode starting
        with "w" (e.g. "w+") creates the file, including missing parent
        directories, and writes the header.
    size_dtype : numpy dtype, optional
        Type of the two header values.

    Returns
    -------
    numpy.memmap or None
        The mapped data array, or None if `bin_file` is None.

    Raises
    ------
    ValueError
        If `shape` is missing in write mode, or if `mode` starts with
        neither "r" nor "w".
    """
    if bin_file is None:
        return None
    if dtype is None:
        dtype = dtype_from_filename(bin_file)

    # The data starts right after the two header values.
    offset = 2 * int(np.dtype(size_dtype).itemsize)

    if mode.startswith("r"):
        if shape is None:
            header = np.memmap(
                bin_file, mode=mode, dtype=size_dtype, shape=(2,)
            )
            # Use Python ints so that no fixed-width numpy scalars (which
            # can overflow in later arithmetic) end up in the shape.
            shape = (int(header[0]), int(header[1]))
            del header
            print("Read shape from file", shape)
        return np.memmap(
            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
        )

    if mode.startswith("w"):
        if shape is None:
            raise ValueError("Need to specify shape to map file in write mode")

        print("creating file", bin_file)
        dirname = os.path.dirname(bin_file)
        if len(dirname) > 0:
            os.makedirs(dirname, exist_ok=True)
        header = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        header[0] = shape[0]
        header[1] = shape[1]
        header.flush()
        del header
        # Reopen with "r+" so that the header written above is preserved
        # while the data region is mapped.
        return np.memmap(
            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
        )

    # Fail loudly instead of implicitly returning None.
    raise ValueError("Unsupported mode {!r}".format(mode))


def write_bin(fname, data):
    """Write `data` to `fname` preceded by its shape.

    The shape of `data` is stored first as unsigned 32 bit integers,
    followed by the raw array content as produced by ``data.tofile``.
    An existing file is overwritten.
    """
    print("writing", fname, data.shape, data.dtype, "...")
    shape_header = np.array(data.shape, dtype=np.uint32)
    with open(fname, "wb") as out_file:
        shape_header.tofile(out_file)
        data.tofile(out_file)


def calc_recall(ann_idx, true_nn_idx):
    """Return the fraction of `ann_idx` entries found in `true_nn_idx`.

    `ann_idx` is copied to host memory first. For every row, the number of
    unique values shared by the two index arrays is counted; the total is
    divided by the number of elements of `ann_idx`.

    Raises
    ------
    RuntimeError
        If the two index arrays do not have the same shape.
    """
    found = cp.asnumpy(ann_idx)
    if found.shape != true_nn_idx.shape:
        raise RuntimeError(
            "Incompatible shapes {} vs {}".format(
                found.shape, true_nn_idx.shape
            )
        )
    n_hits = sum(
        np.intersect1d(found[row, :], true_nn_idx[row, :]).size
        for row in range(found.shape[0])
    )
    return n_hits / found.size


class BenchmarkTimer:
    """Provides a generator that runs a code block `reps` times
    and records results to the instance variable `timings`. Use like:

    .. code-block:: python

        timer = BenchmarkTimer(reps=5)
        for _ in timer.benchmark_runs():
            ... do something ...
        print(np.min(timer.timings))

    The first `warmup` iterations are executed but not recorded.

    This class is part of the rapids/cuml benchmark suite
    """

    def __init__(self, reps=1, warmup=0):
        self.warmup = warmup  # number of leading iterations to discard
        self.reps = reps  # number of recorded iterations
        self.timings = []  # elapsed seconds of each recorded iteration

    def benchmark_runs(self):
        """Yield the iteration index for `warmup + reps` iterations.

        The time the caller spends between two consecutive resumptions of
        the generator is measured; it is appended to `timings` exactly
        once per iteration, and only for non-warm-up iterations.
        """
        for r in range(self.reps + self.warmup):
            # perf_counter is monotonic, unlike the wall clock.
            t0 = time.perf_counter()
            yield r
            t1 = time.perf_counter()
            if r >= self.warmup:
                self.timings.append(t1 - t0)