Allow host dataset for IVF-PQ #1114

Merged
merged 4 commits, Jan 10, 2023
Changes from 3 commits
3 changes: 2 additions & 1 deletion python/pylibraft/pylibraft/common/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
# limitations under the License.
#

from .ai_wrapper import ai_wrapper
from .cai_wrapper import cai_wrapper
from .cuda import Stream
from .device_ndarray import device_ndarray
89 changes: 89 additions & 0 deletions python/pylibraft/pylibraft/common/ai_wrapper.py
@@ -0,0 +1,89 @@
#
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np

from pylibraft.common import input_validation


class ai_wrapper:
"""
Simple wrapper around an array interface object to reduce
boilerplate for extracting common information from the underlying
dictionary.
"""

def __init__(self, ai_arr):
"""
Constructor accepts an array interface compliant array

Parameters
----------
ai_arr : array interface array
"""
self.ai_ = ai_arr.__array_interface__

@property
def dtype(self):
"""
Returns the dtype of the underlying array interface
"""
return np.dtype(self.ai_["typestr"])

@property
def shape(self):
"""
Returns the shape of the underlying array interface
"""
return self.ai_["shape"]

@property
def c_contiguous(self):
"""
Returns whether the underlying array interface has
c-ordered (row-major) layout
"""
return input_validation.is_c_contiguous(self.ai_)

@property
def f_contiguous(self):
"""
Returns whether the underlying array interface has
f-ordered (column-major) layout
"""
return not input_validation.is_c_contiguous(self.ai_)

@property
def data(self):
"""
Returns the data pointer of the underlying array interface
"""
return self.ai_["data"][0]

def validate_shape_dtype(self, expected_dims=None, expected_dtype=None):
"""Checks to see if the shape, dtype, and strides match expectations"""
if expected_dims is not None and len(self.shape) != expected_dims:
raise ValueError(
f"unexpected shape {self.shape} - "
f"expected {expected_dims} dimensions"
)

if expected_dtype is not None and self.dtype != expected_dtype:
raise ValueError(
f"invalid dtype {self.dtype}: expected " f"{expected_dtype}"
)

if not self.c_contiguous:
raise ValueError("input must be c-contiguous")
69 changes: 13 additions & 56 deletions python/pylibraft/pylibraft/common/cai_wrapper.py
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,12 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from types import SimpleNamespace

from pylibraft.common import input_validation
from pylibraft.common.ai_wrapper import ai_wrapper


class cai_wrapper:
class cai_wrapper(ai_wrapper):
"""
Simple wrapper around a CUDA array interface object to reduce
boilerplate for extracting common information from the underlying
@@ -33,57 +33,14 @@ def __init__(self, cai_arr):
----------
cai_arr : CUDA array interface array
"""
self.cai_ = cai_arr.__cuda_array_interface__
helper = SimpleNamespace(
__array_interface__=cai_arr.__cuda_array_interface__
)
super().__init__(helper)

@property
def dtype(self):
"""
Returns the dtype of the underlying CUDA array interface
"""
return np.dtype(self.cai_["typestr"])

@property
def shape(self):
"""
Returns the shape of the underlying CUDA array interface
"""
return self.cai_["shape"]

@property
def c_contiguous(self):
"""
Returns whether the underlying CUDA array interface has
c-ordered (row-major) layout
"""
return input_validation.is_c_contiguous(self.cai_)

@property
def f_contiguous(self):
"""
Returns whether the underlying CUDA array interface has
f-ordered (column-major) layout
"""
return not input_validation.is_c_contiguous(self.cai_)

@property
def data(self):
"""
Returns the data pointer of the underlying CUDA array interface
"""
return self.cai_["data"][0]

def validate_shape_dtype(self, expected_dims=None, expected_dtype=None):
"""Checks to see if the shape, dtype, and strides match expectations"""
if expected_dims is not None and len(self.shape) != expected_dims:
raise ValueError(
f"unexpected shape {self.shape} - "
f"expected {expected_dims} dimensions"
)

if expected_dtype is not None and self.dtype != expected_dtype:
raise ValueError(
f"invalid dtype {self.dtype}: expected " f"{expected_dtype}"
)

if not self.c_contiguous:
raise ValueError("input must be c-contiguous")
def wrap_array(array):
try:
return cai_wrapper(array)
except AttributeError:
return ai_wrapper(array)
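
The wrap_array helper selects the wrapper by duck typing: host arrays lack __cuda_array_interface__, so the cai_wrapper constructor raises AttributeError and wrap_array falls back to ai_wrapper. A small sketch of the dispatch (the example arrays are hypothetical, and the device branch needs a GPU):

import numpy as np

from pylibraft.common import device_ndarray
from pylibraft.common.cai_wrapper import wrap_array

host_arr = np.zeros((10, 4), dtype=np.float32)
device_arr = device_ndarray(host_arr)

type(wrap_array(host_arr)).__name__    # 'ai_wrapper', host path
type(wrap_array(device_arr)).__name__  # 'cai_wrapper', device path

The SimpleNamespace shim lets cai_wrapper reuse the ai_wrapper accessors by exposing __cuda_array_interface__ under the __array_interface__ attribute name; the two interface dictionaries share the fields the wrappers read.
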
21 changes: 14 additions & 7 deletions python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -36,10 +36,12 @@ from pylibraft.distance.distance_type cimport DistanceType

from pylibraft.common import (
Handle,
ai_wrapper,
auto_convert_output,
cai_wrapper,
device_ndarray,
)
from pylibraft.common.cai_wrapper import wrap_array
from pylibraft.common.interruptible import cuda_interruptible

from pylibraft.common.handle cimport handle_t
@@ -313,10 +315,13 @@ def build(IndexParams index_params, dataset, handle=None):
"""
Builds an IVF-PQ index that can be later used for nearest neighbor search.

The input array can be either a CUDA array interface compliant matrix or
an array interface compliant matrix in host memory.

Parameters
----------
index_params : IndexParams object
dataset : CUDA array interface compliant matrix shape (n_samples, dim)
dataset : array interface compliant matrix shape (n_samples, dim)
Supported dtype [float, int8, uint8]
{handle_docstring}

@@ -359,7 +364,7 @@
>>> # handle needs to be explicitly synchronized
>>> handle.sync()
"""
dataset_cai = cai_wrapper(dataset)
dataset_cai = wrap_array(dataset)
dataset_dt = dataset_cai.dtype
_check_input_array(dataset_cai, [np.dtype('float32'), np.dtype('byte'),
np.dtype('ubyte')])
@@ -413,14 +418,16 @@ def extend(Index index, new_vectors, new_indices, handle=None):
"""
Extend an existing index with new vectors.

The input array can be either a CUDA array interface compliant matrix or
an array interface compliant matrix in host memory.

Parameters
----------
index : ivf_pq.Index
Trained ivf_pq object.
new_vectors : CUDA array interface compliant matrix shape (n_samples, dim)
new_vectors : array interface compliant matrix shape (n_samples, dim)
Supported dtype [float, int8, uint8]
new_indices : CUDA array interface compliant matrix shape (n_samples, dim)
new_indices : array interface compliant matrix shape (n_samples, dim)
Supported dtype [uint64]
{handle_docstring}

@@ -473,7 +480,7 @@
handle = Handle()
cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()

vecs_cai = cai_wrapper(new_vectors)
vecs_cai = wrap_array(new_vectors)
vecs_dt = vecs_cai.dtype
cdef uint64_t n_rows = vecs_cai.shape[0]
cdef uint32_t dim = vecs_cai.shape[1]
@@ -482,7 +489,7 @@
np.dtype('ubyte')],
exp_cols=index.dim)

idx_cai = cai_wrapper(new_indices)
idx_cai = wrap_array(new_indices)
_check_input_array(idx_cai, [np.dtype('uint64')], exp_rows=n_rows)
if len(idx_cai.shape)!=1:
raise ValueError("Indices array is expected to be 1D")
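
Taken together, these changes let build and extend consume host memory directly. A minimal sketch of the host-input path, with hypothetical sizes and index parameters:

import numpy as np

from pylibraft.neighbors import ivf_pq

n_samples, dim = 5000, 16
dataset = np.random.random_sample((n_samples, dim)).astype(np.float32)

# Host (numpy) input is accepted directly; no device copy is required.
index_params = ivf_pq.IndexParams(n_lists=100, metric="l2_expanded")
index = ivf_pq.build(index_params, dataset)

# extend likewise accepts host arrays for both the vectors and the indices.
more_vectors = np.random.random_sample((100, dim)).astype(np.float32)
more_indices = np.arange(n_samples, n_samples + 100, dtype=np.uint64)
index = ivf_pq.extend(index, more_vectors, more_indices)
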
36 changes: 26 additions & 10 deletions python/pylibraft/pylibraft/test/test_ivf_pq.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -97,6 +97,7 @@ def run_ivf_pq_build_search_test(
kmeans_n_iters=20,
compare=True,
inplace=True,
array_type="device",
):
dataset = generate_data((n_rows, n_cols), dtype)
if metric == "inner_product":
@@ -115,7 +116,10 @@
add_data_on_build=add_data_on_build,
)

index = ivf_pq.build(build_params, dataset_device)
if array_type == "device":
index = ivf_pq.build(build_params, dataset_device)
else:
index = ivf_pq.build(build_params, dataset)

assert index.trained
if pq_dim != 0:
@@ -125,14 +129,20 @@
assert index.n_lists == build_params.n_lists

if not add_data_on_build:
dataset_1_device = device_ndarray(dataset[: n_rows // 2, :])
dataset_2_device = device_ndarray(dataset[n_rows // 2 :, :])
dataset_1 = dataset[: n_rows // 2, :]
dataset_2 = dataset[n_rows // 2 :, :]
indices_1 = np.arange(n_rows // 2, dtype=np.uint64)
indices_1_device = device_ndarray(indices_1)
indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.uint64)
indices_2_device = device_ndarray(indices_2)
index = ivf_pq.extend(index, dataset_1_device, indices_1_device)
index = ivf_pq.extend(index, dataset_2_device, indices_2_device)
if array_type == "device":
dataset_1_device = device_ndarray(dataset_1)
dataset_2_device = device_ndarray(dataset_2)
indices_1_device = device_ndarray(indices_1)
indices_2_device = device_ndarray(indices_2)
index = ivf_pq.extend(index, dataset_1_device, indices_1_device)
index = ivf_pq.extend(index, dataset_2_device, indices_2_device)
else:
index = ivf_pq.extend(index, dataset_1, indices_1)
index = ivf_pq.extend(index, dataset_2, indices_2)

assert index.size >= n_rows

@@ -190,7 +200,10 @@ def run_ivf_pq_build_search_test(
@pytest.mark.parametrize("n_queries", [100])
@pytest.mark.parametrize("n_lists", [100])
@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace):
@pytest.mark.parametrize("array_type", ["host", "device"])
def test_ivf_pq_dtypes(
n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type
):
# Note that inner_product tests use normalized input which we cannot
# represent in int8, therefore we test only l2_expanded metric here.
run_ivf_pq_build_search_test(
Expand All @@ -202,6 +215,7 @@ def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace):
metric="l2_expanded",
dtype=dtype,
inplace=inplace,
array_type=array_type,
)


@@ -337,7 +351,8 @@ def test_ivf_pq_search_params(params):


@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
def test_extend(dtype):
@pytest.mark.parametrize("array_type", ["host", "device"])
def test_extend(dtype, array_type):
run_ivf_pq_build_search_test(
n_rows=10000,
n_cols=10,
Expand All @@ -347,6 +362,7 @@ def test_extend(dtype):
metric="l2_expanded",
dtype=dtype,
add_data_on_build=False,
array_type=array_type,
)


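
With the new array_type parametrization, the host-input path can be exercised on its own via pytest's -k filter, for example (run from the repository root, GPU required):

pytest python/pylibraft/pylibraft/test/test_ivf_pq.py -k "host"
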