Skip to content

Commit

Permalink
Allow host dataset for IVF-PQ
Browse files Browse the repository at this point in the history
  • Loading branch information
tfeher committed Dec 21, 2022
1 parent 107f6e3 commit 10be8a6
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 16 deletions.
1 change: 1 addition & 0 deletions python/pylibraft/pylibraft/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
#

from .ai_wrapper import ai_wrapper
from .cai_wrapper import cai_wrapper
from .cuda import Stream
from .device_ndarray import device_ndarray
Expand Down
89 changes: 89 additions & 0 deletions python/pylibraft/pylibraft/common/ai_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np

from pylibraft.common import input_validation


class ai_wrapper:
    """
    Simple wrapper around an array interface object to reduce
    boilerplate for extracting common information from the underlying
    dictionary.

    The wrapped object must expose ``__array_interface__`` (the host-side
    array interface), not ``__cuda_array_interface__``.
    """

    def __init__(self, ai_arr):
        """
        Constructor accepts an array interface compliant array

        Parameters
        ----------
        ai_arr : array interface compliant array
            Any object exposing an ``__array_interface__`` attribute.

        Raises
        ------
        AttributeError
            If ``ai_arr`` has no ``__array_interface__`` attribute.
        """
        # Only the interface dictionary is stored.
        # NOTE(review): the array object itself is not referenced here, so
        # callers presumably need to keep it alive while `data` is in use.
        self.ai_ = ai_arr.__array_interface__

    @property
    def dtype(self):
        """
        Returns the dtype of the underlying array interface
        (built from its ``typestr`` entry)
        """
        return np.dtype(self.ai_["typestr"])

    @property
    def shape(self):
        """
        Returns the shape of the underlying array interface
        """
        return self.ai_["shape"]

    @property
    def c_contiguous(self):
        """
        Returns whether the underlying array interface has
        c-ordered (row-major) layout, as reported by
        ``input_validation.is_c_contiguous``
        """
        return input_validation.is_c_contiguous(self.ai_)

    @property
    def f_contiguous(self):
        """
        Returns the negation of ``c_contiguous``; any layout that is not
        c-ordered (row-major) is reported as f-ordered (column-major)
        """
        return not input_validation.is_c_contiguous(self.ai_)

    @property
    def data(self):
        """
        Returns the data pointer of the underlying array interface
        (first element of its ``data`` entry)
        """
        return self.ai_["data"][0]

    def validate_shape_dtype(self, expected_dims=None, expected_dtype=None):
        """
        Checks to see if the shape, dtype, and layout match expectations

        Parameters
        ----------
        expected_dims : int, optional
            Required number of dimensions; not checked when None.
        expected_dtype : dtype, optional
            Required dtype; not checked when None.

        Raises
        ------
        ValueError
            If the number of dimensions or the dtype differs from the
            expected value, or if the array is not c-contiguous.
        """
        if expected_dims is not None and len(self.shape) != expected_dims:
            raise ValueError(
                f"unexpected shape {self.shape} - "
                f"expected {expected_dims} dimensions"
            )

        if expected_dtype is not None and self.dtype != expected_dtype:
            raise ValueError(
                f"invalid dtype {self.dtype}: expected {expected_dtype}"
            )

        # The layout check is unconditional, independent of the arguments.
        if not self.c_contiguous:
            raise ValueError("input must be c-contiguous")
30 changes: 23 additions & 7 deletions python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ from libcpp cimport bool, nullptr

from pylibraft.distance.distance_type cimport DistanceType

from pylibraft.common import Handle, cai_wrapper, device_ndarray
from pylibraft.common import Handle, ai_wrapper, cai_wrapper, device_ndarray
from pylibraft.common.interruptible import cuda_interruptible

from pylibraft.common.handle cimport handle_t
Expand Down Expand Up @@ -306,10 +306,13 @@ def build(IndexParams index_params, dataset, handle=None):
"""
Builds an IVF-PQ index that can be later used for nearest neighbor search.
The input array can be either a CUDA array interface compliant matrix or
an array interface compliant matrix in host memory.
Parameters
----------
index_params : IndexParams object
dataset : CUDA array interface compliant matrix shape (n_samples, dim)
dataset : array interface compliant matrix shape (n_samples, dim)
Supported dtype [float, int8, uint8]
{handle_docstring}
Expand Down Expand Up @@ -352,7 +355,11 @@ def build(IndexParams index_params, dataset, handle=None):
>>> # handle needs to be explicitly synchronized
>>> handle.sync()
"""
dataset_cai = cai_wrapper(dataset)
try:
dataset_cai = cai_wrapper(dataset)
except AttributeError:
dataset_cai = ai_wrapper(dataset)

dataset_dt = dataset_cai.dtype
_check_input_array(dataset_cai, [np.dtype('float32'), np.dtype('byte'),
np.dtype('ubyte')])
Expand Down Expand Up @@ -405,14 +412,16 @@ def extend(Index index, new_vectors, new_indices, handle=None):
"""
Extend an existing index with new vectors.
The input array can be either a CUDA array interface compliant matrix or
an array interface compliant matrix in host memory.
Parameters
----------
index : ivf_pq.Index
Trained ivf_pq object.
new_vectors : CUDA array interface compliant matrix shape (n_samples, dim)
new_vectors : array interface compliant matrix shape (n_samples, dim)
Supported dtype [float, int8, uint8]
new_indices : CUDA array interface compliant matrix shape (n_samples, dim)
new_indices : array interface compliant vector shape (n_samples,)
Supported dtype [uint64]
{handle_docstring}
Expand Down Expand Up @@ -465,16 +474,23 @@ def extend(Index index, new_vectors, new_indices, handle=None):
handle = Handle()
cdef handle_t* handle_ = <handle_t*><size_t>handle.getHandle()

vecs_cai = cai_wrapper(new_vectors)
try:
vecs_cai = cai_wrapper(new_vectors)
except AttributeError:
vecs_cai = ai_wrapper(new_vectors)

vecs_dt = vecs_cai.dtype
cdef uint64_t n_rows = vecs_cai.shape[0]
cdef uint32_t dim = vecs_cai.shape[1]

_check_input_array(vecs_cai, [np.dtype('float32'), np.dtype('byte'),
np.dtype('ubyte')],
exp_cols=index.dim)
try:
idx_cai = cai_wrapper(new_indices)
except AttributeError:
idx_cai = ai_wrapper(new_indices)

idx_cai = cai_wrapper(new_indices)
_check_input_array(idx_cai, [np.dtype('uint64')], exp_rows=n_rows)
if len(idx_cai.shape)!=1:
raise ValueError("Indices array is expected to be 1D")
Expand Down
34 changes: 25 additions & 9 deletions python/pylibraft/pylibraft/test/test_ivf_pq.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def run_ivf_pq_build_search_test(
kmeans_n_iters=20,
compare=True,
inplace=True,
array_type="device",
):
dataset = generate_data((n_rows, n_cols), dtype)
if metric == "inner_product":
Expand All @@ -115,7 +116,10 @@ def run_ivf_pq_build_search_test(
add_data_on_build=add_data_on_build,
)

index = ivf_pq.build(build_params, dataset_device)
if array_type == "device":
index = ivf_pq.build(build_params, dataset_device)
else:
index = ivf_pq.build(build_params, dataset)

assert index.trained
if pq_dim != 0:
Expand All @@ -125,14 +129,20 @@ def run_ivf_pq_build_search_test(
assert index.n_lists == build_params.n_lists

if not add_data_on_build:
dataset_1_device = device_ndarray(dataset[: n_rows // 2, :])
dataset_2_device = device_ndarray(dataset[n_rows // 2 :, :])
dataset_1 = dataset[: n_rows // 2, :]
dataset_2 = dataset[n_rows // 2 :, :]
indices_1 = np.arange(n_rows // 2, dtype=np.uint64)
indices_1_device = device_ndarray(indices_1)
indices_2 = np.arange(n_rows // 2, n_rows, dtype=np.uint64)
indices_2_device = device_ndarray(indices_2)
index = ivf_pq.extend(index, dataset_1_device, indices_1_device)
index = ivf_pq.extend(index, dataset_2_device, indices_2_device)
if array_type == "device":
dataset_1_device = device_ndarray(dataset_1)
dataset_2_device = device_ndarray(dataset_2)
indices_1_device = device_ndarray(indices_1)
indices_2_device = device_ndarray(indices_2)
index = ivf_pq.extend(index, dataset_1_device, indices_1_device)
index = ivf_pq.extend(index, dataset_2_device, indices_2_device)
else:
index = ivf_pq.extend(index, dataset_1, indices_1)
index = ivf_pq.extend(index, dataset_2, indices_2)

assert index.size >= n_rows

Expand Down Expand Up @@ -190,7 +200,10 @@ def run_ivf_pq_build_search_test(
@pytest.mark.parametrize("n_queries", [100])
@pytest.mark.parametrize("n_lists", [100])
@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace):
@pytest.mark.parametrize("array_type", ["host", "device"])
def test_ivf_pq_dtypes(
n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type
):
# Note that inner_product tests use normalized input which we cannot
# represent in int8, therefore we test only l2_expanded metric here.
run_ivf_pq_build_search_test(
Expand All @@ -202,6 +215,7 @@ def test_ivf_pq_dtypes(n_rows, n_cols, n_queries, n_lists, dtype, inplace):
metric="l2_expanded",
dtype=dtype,
inplace=inplace,
array_type=array_type,
)


Expand Down Expand Up @@ -337,7 +351,8 @@ def test_ivf_pq_search_params(params):


@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
def test_extend(dtype):
@pytest.mark.parametrize("array_type", ["host", "device"])
def test_extend(dtype, array_type):
run_ivf_pq_build_search_test(
n_rows=10000,
n_cols=10,
Expand All @@ -347,6 +362,7 @@ def test_extend(dtype):
metric="l2_expanded",
dtype=dtype,
add_data_on_build=False,
array_type=array_type,
)


Expand Down

0 comments on commit 10be8a6

Please sign in to comment.