diff --git a/docs/source/cpp_api/neighbors_cagra.rst b/docs/source/cpp_api/neighbors_cagra.rst index 6613b0b06d..99ecd3a985 100644 --- a/docs/source/cpp_api/neighbors_cagra.rst +++ b/docs/source/cpp_api/neighbors_cagra.rst @@ -19,3 +19,13 @@ namespace *raft::neighbors::cagra* :content-only: +Serializer Methods +------------------ +``#include `` + +namespace *raft::neighbors::cagra* + +.. doxygengroup:: cagra_serialize + :project: RAFT + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_ivf_pq.rst b/docs/source/cpp_api/neighbors_ivf_pq.rst index 348928d719..17948a37fe 100644 --- a/docs/source/cpp_api/neighbors_ivf_pq.rst +++ b/docs/source/cpp_api/neighbors_ivf_pq.rst @@ -21,6 +21,17 @@ Serializer Methods namespace *raft::neighbors::ivf_pq* .. doxygengroup:: ivf_pq_serialize + :project: RAFT + :members: + :content-only: + +Candidate Refinement +-------------------- +``#include `` + +namespace *raft::neighbors* + +.. doxygengroup:: ann_refine :project: RAFT :members: :content-only: \ No newline at end of file diff --git a/docs/source/pylibraft_api/cluster.rst b/docs/source/pylibraft_api/cluster.rst index 59e53e7d4c..085297fe34 100644 --- a/docs/source/pylibraft_api/cluster.rst +++ b/docs/source/pylibraft_api/cluster.rst @@ -7,6 +7,9 @@ This page provides pylibraft class references for the publicly-exposed elements :language: python :class: highlight +KMeans +###### + .. autoclass:: pylibraft.cluster.kmeans.KMeansParams :members: @@ -14,8 +17,4 @@ This page provides pylibraft class references for the publicly-exposed elements .. autofunction:: pylibraft.cluster.kmeans.cluster_cost -.. autofunction:: pylibraft.cluster.compute_new_centroids - - - - +.. autofunction:: pylibraft.cluster.kmeans.compute_new_centroids diff --git a/docs/source/pylibraft_api/neighbors.rst b/docs/source/pylibraft_api/neighbors.rst index ca89c25ed4..680a2982cb 100644 --- a/docs/source/pylibraft_api/neighbors.rst +++ b/docs/source/pylibraft_api/neighbors.rst @@ -27,6 +27,11 @@ CAGRA .. autofunction:: pylibraft.neighbors.cagra.search +Serializer Methods +------------------ +.. autofunction:: pylibraft.neighbors.cagra.save + +.. autofunction:: pylibraft.neighbors.cagra.load IVF-Flat ######## @@ -43,6 +48,12 @@ IVF-Flat .. autofunction:: pylibraft.neighbors.ivf_flat.search +Serializer Methods +------------------ + +.. autofunction:: pylibraft.neighbors.ivf_flat.save + +.. autofunction:: pylibraft.neighbors.ivf_flat.load IVF-PQ ###### @@ -59,8 +70,14 @@ IVF-PQ .. autofunction:: pylibraft.neighbors.ivf_pq.search +Serializer Methods +------------------ + +.. autofunction:: pylibraft.neighbors.ivf_pq.save + +.. autofunction:: pylibraft.neighbors.ivf_pq.load Candidate Refinement -#################### +-------------------- .. autofunction:: pylibraft.neighbors.refine diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 3a3db0f3ea..6fd7523c64 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -182,6 +182,10 @@ options: All algorithms present in the CSV file supplied to this script with parameter `result_csv` will appear in the plot. +The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall. 
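+As a rough illustration of how such a figure can be regenerated from the CSV supplied via `result_csv`, the sketch below is an editor's addition, not part of the patch; the `algo`, `recall`, and `qps` column names are assumptions about the CSV schema, not a documented format.
+
+```python
+# Illustrative only: column names below are assumptions, not the
+# benchmark scripts' documented output schema.
+import pandas as pd
+import matplotlib.pyplot as plt
+
+df = pd.read_csv("result.csv")
+for algo, group in df.groupby("algo"):
+    g = group.sort_values("recall")
+    plt.plot(g["recall"], g["qps"], marker="o", label=algo)
+plt.xlabel("Recall")
+plt.ylabel("Queries per second")
+plt.yscale("log")
+plt.legend()
+plt.savefig("throughput-vs-recall.png")
+```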
+ +![Throughput vs recall plot comparing popular ANN algorithms with RAFT's at batch size 10](../../img/raft-vector-search-batch-10.png) + ## Adding a new ANN algorithm ### Implementation and Configuration Implementation of a new algorithm should be a C++ class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. diff --git a/img/raft-vector-search-batch-10.png b/img/raft-vector-search-batch-10.png new file mode 100644 index 0000000000..5416e611f2 Binary files /dev/null and b/img/raft-vector-search-batch-10.png differ diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index b61fb4ab02..f4af519dc1 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -85,33 +85,26 @@ def compute_new_centroids(X, -------- >>> import cupy as cp - >>> from pylibraft.common import Handle >>> from pylibraft.cluster.kmeans import compute_new_centroids - >>> # A single RAFT handle can optionally be reused across >>> # pylibraft functions. >>> handle = Handle() - >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> centroids = cp.random.random_sample((n_clusters, n_features), ... dtype=cp.float32) ... >>> labels = cp.random.randint(0, high=n_clusters, size=n_samples, ... dtype=cp.int32) - - >>> new_centroids = cp.empty((n_clusters, n_features), dtype=cp.float32) - + >>> new_centroids = cp.empty((n_clusters, n_features), + ... dtype=cp.float32) >>> compute_new_centroids( ... X, centroids, labels, new_centroids, handle=handle ... ) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -221,11 +214,9 @@ def init_plus_plus(X, n_clusters=None, seed=None, handle=None, centroids=None): >>> import cupy as cp >>> from pylibraft.cluster.kmeans import init_plus_plus - >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) @@ -301,19 +292,14 @@ def cluster_cost(X, centroids, handle=None): -------- >>> import cupy as cp - >>> >>> from pylibraft.cluster.kmeans import cluster_cost - >>> >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> centroids = cp.random.random_sample((n_clusters, n_features), ... dtype=cp.float32) - >>> inertia = cluster_cost(X, centroids) """ x_cai = X.__cuda_array_interface__ @@ -524,13 +510,10 @@ def fit( -------- >>> import cupy as cp - >>> >>> from pylibraft.cluster.kmeans import fit, KMeansParams - >>> >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index b4cdb9b0c1..7e3dc289e0 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -197,8 +197,8 @@ cdef class Handle(DeviceResources): _HANDLE_PARAM_DOCSTRING = """ - handle : Optional RAFT resource handle for reusing expensive CUDA - resources. If a handle isn't supplied, CUDA resources will be + handle : Optional RAFT resource handle for reusing CUDA resources. 
+        If a handle isn't supplied, CUDA resources will be
         allocated inside this function and synchronized before the
         function exits. If a handle is supplied, you will need to
         explicitly synchronize yourself by calling `handle.sync()`
diff --git a/python/pylibraft/pylibraft/neighbors/brute_force.pyx b/python/pylibraft/pylibraft/neighbors/brute_force.pyx
index 2d118072ab..4aa47b8a18 100644
--- a/python/pylibraft/pylibraft/neighbors/brute_force.pyx
+++ b/python/pylibraft/pylibraft/neighbors/brute_force.pyx
@@ -95,7 +95,6 @@ def knn(dataset, queries, k=None, indices=None, distances=None,
    distances : Optional array interface compliant matrix shape
                (n_queries, k), dtype float. If supplied, neighbor
                indices will be written here in-place. (default None)
-
    {handle_docstring}

    Returns
@@ -108,16 +107,12 @@ def knn(dataset, queries, k=None, indices=None, distances=None,
    Examples
    --------
-
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors.brute_force import knn
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
-
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
    >>> # Search using the built index
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
index 7d758a32ef..fbc1623cac 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
@@ -85,6 +85,25 @@ from pylibraft.neighbors.common cimport _get_metric_string


 cdef class IndexParams:
+    """
+    Parameters to build index for CAGRA nearest neighbor search
+
+    Parameters
+    ----------
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2
+    intermediate_graph_degree : int, default = 128
+
+    graph_degree : int, default = 64
+
+    add_data_on_build : bool, default = True
+        After training the coarse and fine quantizers, we will populate
+        the index with the dataset if add_data_on_build == True, otherwise
+        the index is left empty, and the extend method can be used
+        to add new vectors to the index.
+    """
     cdef c_cagra.index_params params

     def __init__(self, *,
                  metric="sqeuclidean",
                  intermediate_graph_degree=128,
                  graph_degree=64,
                  add_data_on_build=True):
-        """"
-        Parameters to build index for CAGRA nearest neighbor search
-
-        Parameters
-        ----------
-        metric : string denoting the metric type, default="sqeuclidean"
-            Valid values for metric: ["sqeuclidean", "inner_product",
-            "euclidean"], where
-            - sqeuclidean is the euclidean distance without the square root
-              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
-            - euclidean is the euclidean distance
-            - inner product distance is defined as
-              distance(a, b) = \\sum_i a_i * b_i.
-        intermediate_graph_degree : int, default = 128
-
-        graph_degree : int, default = 64
-
-        add_data_on_build : bool, default = True
-            After training the coarse and fine quantizers, we will populate
-            the index with the dataset if add_data_on_build == True, otherwise
-            the index is left empty, and the extend method can be used
-            to add new vectors to the index.
- """ self.params.metric = _get_metric(metric) self.params.metric_arg = 0 self.params.intermediate_graph_degree = intermediate_graph_degree @@ -163,7 +159,7 @@ cdef class IndexFloat(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -203,7 +199,7 @@ cdef class IndexInt8(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -243,7 +239,7 @@ cdef class IndexUint8(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -280,8 +276,8 @@ def build(IndexParams index_params, dataset, handle=None): It is required that both the dataset and the optimized graph fit the GPU memory. - The following distance metrics are supported: - - L2 + The following distance metrics are supported: + - L2 Parameters ---------- @@ -298,31 +294,23 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> k = 10 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> handle = DeviceResources() >>> build_params = cagra.IndexParams(metric="sqeuclidean") - >>> index = cagra.build(build_params, dataset, handle=handle) - >>> distances, neighbors = cagra.search(cagra.SearchParams(), ... index, dataset, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) """ @@ -415,6 +403,54 @@ def build(IndexParams index_params, dataset, handle=None): cdef class SearchParams: + """ + CAGRA search parameters + + Parameters + ---------- + max_queries: int, default = 0 + Maximum number of queries to search at the same time (batch size). + Auto select when 0. + itopk_size: int, default = 64 + Number of intermediate search results retained during the search. + This is the main knob to adjust trade off between accuracy and + search speed. Higher values improve the search accuracy. + max_iterations: int, default = 0 + Upper limit of search iterations. Auto select when 0. + algo: string denoting the search algorithm to use, default = "auto" + Valid values for algo: ["auto", "single_cta", "multi_cta"], where + - auto will automatically select the best value based on query size + - single_cta is better when query contains larger number of + vectors (e.g >10) + - multi_cta is better when query contains only a few vectors + team_size: int, default = 0 + Number of threads used to calculate a single distance. 4, 8, 16, + or 32. + search_width: int, default = 1 + Number of graph nodes to select as the starting point for the + search in each iteration. + min_iterations: int, default = 0 + Lower limit of search iterations. 
+    thread_block_size: int, default = 0
+        Thread block size. 0, 64, 128, 256, 512, 1024.
+        Auto selection when 0.
+    hashmap_mode: string denoting the type of hash map to use,
+        default = "auto". It's usually better to allow the algorithm
+        to select this value.
+        Valid values for hashmap_mode: ["auto", "small", "hash"], where
+            - auto will automatically select the best value based on algo
+            - small will use the small shared memory hash table with resetting.
+            - hash will use a single hash table in global memory.
+    hashmap_min_bitlen: int, default = 0
+        Lower limit of hash map bit length.
+    hashmap_max_fill_rate: float, default = 0.5
+        Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    num_random_samplings: int, default = 1
+        Number of iterations of initial random seed node selection. 1 or
+        more.
+    rand_xor_mask: int, default = 0x128394
+        Bit mask used for initial random seed node selection.
+    """
     cdef c_cagra.search_params params

     def __init__(self, *,
                  max_queries=0,
                  itopk_size=64,
                  max_iterations=0,
                  algo="auto",
                  team_size=0,
                  search_width=1,
                  min_iterations=0,
                  thread_block_size=0,
                  hashmap_mode="auto",
                  hashmap_min_bitlen=0,
                  hashmap_max_fill_rate=0.5,
                  num_random_samplings=1,
                  rand_xor_mask=0x128394):
-        """
-        CAGRA search parameters
-
-        Parameters
-        ----------
-        max_queries: int, default = 0
-            Maximum number of queries to search at the same time (batch size).
-            Auto select when 0.
-        itopk_size: int, default = 64
-            Number of intermediate search results retained during the search.
-            This is the main knob to adjust trade off between accuracy and
-            search speed. Higher values improve the search accuracy.
-        max_iterations: int, default = 0
-            Upper limit of search iterations. Auto select when 0.
-        algo: string denoting the search algorithm to use, default = "auto"
-            Valid values for algo: ["auto", "single_cta", "multi_cta"], where
-                - auto will automatically select the best value based on query size
-                - single_cta is better when query contains larger number of
-                  vectors (e.g >10)
-                - multi_cta is better when query contains only a few vectors
-        team_size: int, default = 0
-            Number of threads used to calculate a single distance. 4, 8, 16,
-            or 32.
-        search_width: int, default = 1
-            Number of graph nodes to select as the starting point for the
-            search in each iteration.
-        min_iterations: int, default = 0
-            Lower limit of search iterations.
-        thread_block_size: int, default = 0
-            Thread block size. 0, 64, 128, 256, 512, 1024.
-            Auto selection when 0.
-        hashmap_mode: string denoting the type of hash map to use. It's
-            usually better to allow the algorithm to select this value.,
-            default = "auto"
-            Valid values for hashmap_mode: ["auto", "small", "hash"], where
-                - auto will automatically select the best value based on algo
-                - small will use the small shared memory hash table with resetting.
-                - hash will use a single hash table in global memory.
-        hashmap_min_bitlen: int, default = 0
-            Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
-        hashmap_max_fill_rate: float, default = 0.5
-            Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
-        num_random_samplings: int, default = 1
-            Number of iterations of initial random seed node selection. 1 or
-            more.
-        rand_xor_mask: int, default = 0x128394
-            Bit mask used for initial random seed node selection.
- - - """ self.params.max_queries = max_queries self.params.itopk_size = itopk_size self.params.max_iterations = max_iterations @@ -514,9 +500,13 @@ cdef class SearchParams: self.params.rand_xor_mask = rand_xor_mask def __repr__(self): - # todo(dantegd): add all relevant attrs attr_str = [attr + "=" + str(getattr(self, attr)) - for attr in ["max_queries"]] + for attr in [ + "max_queries", "itopk_size", "max_iterations", "algo", + "team_size", "search_width", "min_iterations", + "thread_block_size", "hashmap_mode", + "hashmap_min_bitlen", "hashmap_max_fill_rate", + "num_random_samplings", "rand_xor_mask"]] return "SearchParams(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -604,20 +594,16 @@ def search(SearchParams search_params, Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -626,17 +612,14 @@ def search(SearchParams search_params, ... max_queries=100, ... itopk_size=64 ... ) - >>> # Using a pooling allocator reduces overhead of temporary array >>> # creation during search. This is useful if multiple searches >>> # are performad with same query size. >>> distances, neighbors = cagra.search(search_params, index, queries, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> neighbors = cp.asarray(neighbors) >>> distances = cp.asarray(distances) """ @@ -712,9 +695,9 @@ def search(SearchParams search_params, @auto_sync_handle def save(filename, Index index, handle=None): """ - Saves the index to file. + Saves the index to a file. - Saving / loading the index is. The serialization format is + Saving / loading the index is experimental. The serialization format is subject to change. Parameters @@ -728,19 +711,18 @@ def save(filename, Index index, handle=None): Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) + >>> # Serialize and deserialize the cagra index built >>> cagra.save("my_index.bin", index, handle=handle) + >>> index_loaded = cagra.load("my_index.bin", handle=handle) """ if not index.trained: raise ValueError("Index need to be built before saving it.") @@ -778,7 +760,7 @@ def load(filename, handle=None): """ Loads index from file. - Saving / loading the index is. The serialization format is + Saving / loading the index is experimental. The serialization format is subject to change, therefore loading an index saved with a previous version of raft is not guaranteed to work. 
@@ -792,13 +774,6 @@ def load(filename, handle=None):
    -------
    index : Index

-    Examples
-    --------
-    >>> import cupy as cp
-
-    >>> from pylibraft.common import DeviceResources
-    >>> from pylibraft.neighbors import cagra
-
    """
    if handle is None:
        handle = DeviceResources()
@@ -810,11 +785,12 @@ def load(filename, handle=None):
    cdef IndexInt8 idx_int8
    cdef IndexUint8 idx_uint8

-    # we extract the dtype from the arrai interfaces in the file
+    # we extract the dtype from the array interfaces in the file
    with open(filename, 'rb') as f:
        type_str = f.read(700).decode("utf-8", errors='ignore')

-    dataset_dt = np.dtype(type_str[673:676])
+    # Read description of the 6th element to get the datatype
+    dataset_dt = np.dtype(type_str.split('descr')[6][5:7])

    if dataset_dt == np.float32:
        idx_float = IndexFloat(handle)
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
index e265bee23b..d8fbdc74da 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
@@ -75,6 +75,45 @@ from pylibraft.neighbors.ivf_flat.cpp.c_ivf_flat cimport (


 cdef class IndexParams:
+    """
+    Parameters to build index for IVF-FLAT nearest neighbor search
+
+    Parameters
+    ----------
+    n_lists : int, default = 1024
+        The number of clusters used in the coarse quantizer.
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean", "inner_product",
+        "euclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
+            - euclidean is the euclidean distance
+            - inner product distance is defined as
+              distance(a, b) = \\sum_i a_i * b_i.
+    kmeans_n_iters : int, default = 20
+        The number of iterations searching for kmeans centers during index
+        building.
+    kmeans_trainset_fraction : float, default = 0.5
+        If kmeans_trainset_fraction is less than 1, then the dataset is
+        subsampled, and only n_samples * kmeans_trainset_fraction rows
+        are used for training.
+    add_data_on_build : bool, default = True
+        After training the coarse and fine quantizers, we will populate
+        the index with the dataset if add_data_on_build == True, otherwise
+        the index is left empty, and the extend method can be used
+        to add new vectors to the index.
+    adaptive_centers : bool, default = False
+        By default (adaptive_centers = False), the cluster centers are
+        trained in `ivf_flat::build`, and never modified in
+        `ivf_flat::extend`. The alternative behavior (adaptive_centers
+        = true) is to update the cluster centers for new data when it is
+        added. In this case, `index.centers()` are always exactly the
+        centroids of the data in the corresponding clusters. The drawback
+        of this behavior is that the centroids depend on the order of
+        adding new data (through the classification of the added data);
+        that is, `index.centers()` "drift" together with the changing
+        distribution of the newly added data.
+    """
     cdef c_ivf_flat.index_params params

     def __init__(self, *,
@@ -84,45 +123,6 @@ cdef class IndexParams:
                  kmeans_trainset_fraction=0.5,
                  add_data_on_build=True,
                  bool adaptive_centers=False):
-        """"
-        Parameters to build index for IVF-FLAT nearest neighbor search
-
-        Parameters
-        ----------
-        n_list : int, default = 1024
-            The number of clusters used in the coarse quantizer.
- metric : string denoting the metric type, default="sqeuclidean" - Valid values for metric: ["sqeuclidean", "inner_product", - "euclidean"], where - - sqeuclidean is the euclidean distance without the square root - operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - - euclidean is the euclidean distance - - inner product distance is defined as - distance(a, b) = \\sum_i a_i * b_i. - kmeans_n_iters : int, default = 20 - The number of iterations searching for kmeans centers during index - building. - kmeans_trainset_fraction : int, default = 0.5 - If kmeans_trainset_fraction is less than 1, then the dataset is - subsampled, and only n_samples * kmeans_trainset_fraction rows - are used for training. - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - adaptive_centers : bool, default = False - By default (adaptive_centers = False), the cluster centers are - trained in `ivf_flat::build`, and and never modified in - `ivf_flat::extend`. The alternative behavior (adaptive_centers - = true) is to update the cluster centers for new data when it is - added. In this case, `index.centers()` are always exactly the - centroids of the data in the corresponding clusters. The drawback - of this behavior is that the centroids depend on the order of - adding new data (through the classification of the added data); - that is, `index.centers()` "drift" together with the changing - distribution of the newly added data. - """ self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -333,33 +333,27 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index_params = ivf_flat.IndexParams( ... n_lists=1024, ... metric="sqeuclidean") - >>> index = ivf_flat.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 10 - >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index, - ... queries, k, handle=handle) - + >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), + ... index, queries, k, + ... handle=handle) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -439,25 +433,21 @@ def extend(Index index, new_vectors, new_indices, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() - >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle) - + >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, + ... handle=handle) >>> n_rows = 100 >>> more_data = cp.random.random_sample((n_rows, n_features), ... 
dtype=cp.float32)
    >>> indices = index.size + cp.arange(n_rows, dtype=cp.int64)
    >>> index = ivf_flat.extend(index, more_data, indices)
-
    >>> # Search using the built index
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
    >>> k = 10
@@ -465,7 +455,6 @@ def extend(Index index, new_vectors, new_indices, handle=None):
    >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(),
    ...                                        index, queries,
    ...                                        k, handle=handle)
-
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
@@ -540,17 +529,17 @@ def extend(Index index, new_vectors, new_indices, handle=None):


 cdef class SearchParams:
+    """
+    IVF-FLAT search parameters
+
+    Parameters
+    ----------
+    n_probes: int, default = 20
+        The number of coarse clusters to select for the fine search.
+    """
     cdef c_ivf_flat.search_params params

     def __init__(self, *, n_probes=20):
-        """
-        IVF-FLAT search parameters
-
-        Parameters
-        ----------
-        n_probes: int, default = 1024
-            The number of course clusters to select for the fine search.
-        """
        self.params.n_probes = n_probes

    def __repr__(self):
@@ -595,20 +584,17 @@ def search(SearchParams search_params,
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_flat
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
-    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle)
-
+    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset,
+    ...                        handle=handle)
    >>> # Search using the built index
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
@@ -616,13 +602,11 @@ def search(SearchParams search_params,
    >>> search_params = ivf_flat.SearchParams(
    ...     n_probes=20
    ... )
-    >>> distances, neighbors = ivf_flat.search(search_params, index, queries,
-    ...                                        k, handle=handle)
-
+    >>> distances, neighbors = ivf_flat.search(search_params, index,
+    ...                                        queries, k, handle=handle)
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
-
    >>> neighbors = cp.asarray(neighbors)
    >>> distances = cp.asarray(distances)
    """
@@ -697,7 +681,7 @@ def search(SearchParams search_params,
 @auto_sync_handle
 def save(filename, Index index, handle=None):
    """
-    Saves the index to file.
+    Saves the index to a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change.
@@ -713,18 +697,16 @@ def save(filename, Index index, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_flat
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
-    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle)
+    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset,
+    ...                        handle=handle)
    >>> ivf_flat.save("my_index.bin", index, handle=handle)
    """
    if not index.trained:
@@ -761,7 +743,7 @@ def save(filename, Index index, handle=None):
 @auto_sync_handle
 def load(filename, handle=None):
    """
-    Loads index from file.
+    Loads index from a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change, therefore loading an index saved with a previous
@@ -780,29 +762,26 @@ def load(filename, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_flat
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build and save index
    >>> handle = DeviceResources()
-    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle)
+    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset,
+    ...                        handle=handle)
    >>> ivf_flat.save("my_index.bin", index, handle=handle)
    >>> del index
-
    >>> n_queries = 100
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
    >>> handle = DeviceResources()
    >>> index = ivf_flat.load("my_index.bin", handle=handle)
-
-    >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index,
-    ...                                        queries, k=10, handle=handle)
+    >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(),
+    ...                                        index, queries, k=10,
+    ...                                        handle=handle)
    """
    if handle is None:
        handle = DeviceResources()
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
index 413a9a1d4b..0c1bbf6b9c 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -95,7 +95,68 @@ cdef _get_dtype_string(dtype):


 cdef class IndexParams:
+    """
+    Parameters to build index for IVF-PQ nearest neighbor search
+
+    Parameters
+    ----------
+    n_lists : int, default = 1024
+        The number of clusters used in the coarse quantizer.
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean", "inner_product",
+        "euclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
+            - euclidean is the euclidean distance
+            - inner product distance is defined as
+              distance(a, b) = \\sum_i a_i * b_i.
+    kmeans_n_iters : int, default = 20
+        The number of iterations searching for kmeans centers during index
+        building.
+    kmeans_trainset_fraction : float, default = 0.5
+        If kmeans_trainset_fraction is less than 1, then the dataset is
+        subsampled, and only n_samples * kmeans_trainset_fraction rows
+        are used for training.
+    pq_bits : int, default = 8
+        The bit length of the vector element after quantization.
+    pq_dim : int, default = 0
+        The dimensionality of the vector after product quantization.
+        When zero, an optimal value is selected using a heuristic. Note
+        pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim'
+        results in a smaller index size and better search performance, but
+        lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number,
+        but multiples of 8 are desirable for good performance. If 'pq_bits'
+        is not 8, 'pq_dim' should be a multiple of 8. For good performance,
+        it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+        'pq_dim' should also be a divisor of the dataset dim.
+    codebook_kind : string, default = "subspace"
+        Valid values ["subspace", "cluster"]
+    force_random_rotation : bool, default = False
+        Apply a random rotation matrix on the input data and queries even
+        if `dim % pq_dim == 0`. Note: if `dim` is not a multiple of
+        `pq_dim`, a random rotation is always applied to the input data
+        and queries to transform the working space from `dim` to
+        `rot_dim`, which may be slightly larger than the original space
+        and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`). However,
+        this transform is not necessary when `dim` is a multiple of
+        `pq_dim` (`dim == rot_dim`, hence no need in adding "extra" data
+        columns / features). By default, if `dim == rot_dim`, the rotation
+        transform is initialized with the identity matrix. When
+        `force_random_rotation == True`, a random orthogonal transform
+        matrix is generated regardless of the values of `dim` and `pq_dim`.
+    add_data_on_build : bool, default = True
+        After training the coarse and fine quantizers, we will populate
+        the index with the dataset if add_data_on_build == True, otherwise
+        the index is left empty, and the extend method can be used
+        to add new vectors to the index.
+    conservative_memory_allocation : bool, default = False
+        By default, the algorithm allocates more space than necessary for
+        individual clusters (`list_data`). This allows amortizing the cost
+        of memory allocation and reduces the number of data copies during
+        repeated calls to `extend` (extending the database).
+        To disable this behavior and use as little GPU memory for the
+        database as possible, set this flag to `True`.
+    """
     def __init__(self, *,
                  n_lists=1024,
                  metric="sqeuclidean",
@@ -107,69 +168,6 @@ cdef class IndexParams:
                  force_random_rotation=False,
                  add_data_on_build=True,
                  conservative_memory_allocation=False):
-        """"
-        Parameters to build index for IVF-PQ nearest neighbor search
-
-        Parameters
-        ----------
-        n_list : int, default = 1024
-            The number of clusters used in the coarse quantizer.
-        metric : string denoting the metric type, default="sqeuclidean"
-            Valid values for metric: ["sqeuclidean", "inner_product",
-            "euclidean"], where
-            - sqeuclidean is the euclidean distance without the square root
-              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
-            - euclidean is the euclidean distance
-            - inner product distance is defined as
-              distance(a, b) = \\sum_i a_i * b_i.
-        kmeans_n_iters : int, default = 20
-            The number of iterations searching for kmeans centers during index
-            building.
-        kmeans_trainset_fraction : int, default = 0.5
-            If kmeans_trainset_fraction is less than 1, then the dataset is
-            subsampled, and only n_samples * kmeans_trainset_fraction rows
-            are used for training.
-        pq_bits : int, default = 8
-            The bit length of the vector element after quantization.
-        pq_dim : int, default = 0
-            The dimensionality of a the vector after product quantization.
-            When zero, an optimal value is selected using a heuristic. Note
-            pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim'
-            results in a smaller index size and better search performance, but
-            lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number,
-            but multiple of 8 are desirable for good performance. If 'pq_bits'
-            is not 8, 'pq_dim' should be a multiple of 8. For good performance,
-            it is desirable that 'pq_dim' is a multiple of 32. Ideally,
-            'pq_dim' should be also a divisor of the dataset dim.
-        codebook_kind : string, default = "subspace"
-            Valid values ["subspace", "cluster"]
-        force_random_rotation : bool, default = False
-            Apply a random rotation matrix on the input data and queries even
-            if `dim % pq_dim == 0`.
Note: if `dim` is not multiple of `pq_dim`, - a random rotation is always applied to the input data and queries - to transform the working space from `dim` to `rot_dim`, which may - be slightly larger than the original space and and is a multiple - of `pq_dim` (`rot_dim % pq_dim == 0`). However, this transform is - not necessary when `dim` is multiple of `pq_dim` (`dim == rot_dim`, - hence no need in adding "extra" data columns / features). By - default, if `dim == rot_dim`, the rotation transform is - initialized with the identity matrix. When - `force_random_rotation == True`, a random orthogonal transform - matrix is generated regardless of the values of `dim` and `pq_dim`. - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - conservative_memory_allocation : bool, default = True - By default, the algorithm allocates more space than necessary for - individual clusters (`list_data`). This allows to amortize the cost - of memory allocation and reduce the number of data copies during - repeated calls to `extend` (extending the database). - To disable this behavior and use as little GPU memory for the - database as possible, set this flat to `True`. - - """ self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -333,14 +331,11 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() @@ -349,17 +344,14 @@ def build(IndexParams index_params, dataset, handle=None): ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 10 >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index, ... queries, k, handle=handle) - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -433,25 +425,20 @@ def extend(Index index, new_vectors, new_indices, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) - >>> n_rows = 100 >>> more_data = cp.random.random_sample((n_rows, n_features), ... dtype=cp.float32) >>> indices = index.size + cp.arange(n_rows, dtype=cp.int64) >>> index = ivf_pq.extend(index, more_data, indices) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -459,11 +446,9 @@ def extend(Index index, new_vectors, new_indices, handle=None): >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), ... index, queries, ... 
k, handle=handle)
-
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
-
    >>> distances = cp.asarray(distances)
    >>> neighbors = cp.asarray(neighbors)
    """
@@ -520,29 +505,27 @@ def extend(Index index, new_vectors, new_indices, handle=None):


 cdef class SearchParams:
+    """
+    IVF-PQ search parameters
+
+    Parameters
+    ----------
+    n_probes: int, default = 20
+        The number of coarse clusters to select for the fine search.
+    lut_dtype: default = np.float32
+        Data type of look-up table to be created dynamically at search
+        time. The use of low-precision types reduces the amount of shared
+        memory required at search time, so fast shared memory kernels can
+        be used even for datasets with large dimensionality. Note that
+        the recall is slightly degraded when low-precision type is
+        selected. Possible values [np.float32, np.float16, np.uint8]
+    internal_distance_dtype: default = np.float32
+        Storage data type for distance/similarity computation.
+        Possible values [np.float32, np.float16]
+    """
     def __init__(self, *,
                  n_probes=20,
                  lut_dtype=np.float32,
                  internal_distance_dtype=np.float32):
-        """
-        IVF-PQ search parameters
-
-        Parameters
-        ----------
-        n_probes: int, default = 1024
-            The number of course clusters to select for the fine search.
-        lut_dtype: default = np.float32
-            Data type of look up table to be created dynamically at search
-            time. The use of low-precision types reduces the amount of shared
-            memory required at search time, so fast shared memory kernels can
-            be used even for datasets with large dimansionality. Note that
-            the recall is slightly degraded when low-precision type is
-            selected. Possible values [np.float32, np.float16, np.uint8]
-        internal_distance_dtype: default = np.float32
-            Storage data type for distance/similarity computation.
-            Possible values [np.float32, np.float16]
-        """
-
        self.params.n_probes = n_probes
        self.params.lut_dtype = _map_dtype_np_to_cuda(lut_dtype)
        self.params.internal_distance_dtype = \
            _map_dtype_np_to_cuda(internal_distance_dtype)
@@ -611,20 +594,16 @@ def search(SearchParams search_params,
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
-
    >>> # Search using the built index
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
@@ -634,7 +613,6 @@ def search(SearchParams search_params,
    ...     lut_dtype=cp.float16,
    ...     internal_distance_dtype=cp.float32
    ... )
-
    >>> # Using a pooling allocator reduces overhead of temporary array
    >>> # creation during search. This is useful if multiple searches
    >>> # are performad with same query size.
@@ -647,11 +625,9 @@ def search(SearchParams search_params,
    >>> distances, neighbors = ivf_pq.search(search_params, index, queries,
    ...                                      k, memory_resource=mr,
    ...                                      handle=handle)
-
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
-
    >>> neighbors = cp.asarray(neighbors)
    >>> distances = cp.asarray(distances)
    """
@@ -728,7 +704,7 @@ def search(SearchParams search_params,
 @auto_sync_handle
 def save(filename, Index index, handle=None):
    """
-    Saves the index to file.
+    Saves the index to a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change.
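[Editor's note: the `pq_dim`/`pq_bits` constraints documented in `IndexParams` above are easy to get wrong, so here is a small sketch that encodes only the rules stated in that docstring. `check_pq_params` is a hypothetical helper for illustration, not a pylibraft API and not part of the patch.]

```python
# Editor's sketch: encodes the pq_dim/pq_bits rules from the IndexParams
# docstring above. `check_pq_params` is hypothetical, not a pylibraft API.
def check_pq_params(pq_dim: int, pq_bits: int) -> None:
    # pq_dim * pq_bits must be a multiple of 8.
    if (pq_dim * pq_bits) % 8 != 0:
        raise ValueError("pq_dim * pq_bits must be a multiple of 8")
    # If pq_bits is not 8, pq_dim should be a multiple of 8.
    if pq_bits != 8 and pq_dim % 8 != 0:
        raise ValueError("pq_dim should be a multiple of 8 when pq_bits != 8")
    # For good performance, a multiple of 32 is desirable.
    if pq_dim % 32 != 0:
        print("hint: a pq_dim that is a multiple of 32 usually performs best")

check_pq_params(pq_dim=10, pq_bits=8)  # valid: 10 * 8 = 80 is a multiple of 8
```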
@@ -744,15 +720,12 @@ def save(filename, Index index, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
@@ -774,7 +747,7 @@ def save(filename, Index index, handle=None):
 @auto_sync_handle
 def load(filename, handle=None):
    """
-    Loads index from file.
+    Loads index from a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change, therefore loading an index saved with a previous
@@ -793,27 +766,22 @@ def load(filename, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build and save index
    >>> handle = DeviceResources()
    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
    >>> ivf_pq.save("my_index.bin", index, handle=handle)
    >>> del index
-
    >>> n_queries = 100
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
    >>> handle = DeviceResources()
    >>> index = ivf_pq.load("my_index.bin", handle=handle)
-
    >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index,
    ...                                      queries, k=10, handle=handle)
    """
diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx
index 5e57da713c..a9bf811c9f 100644
--- a/python/pylibraft/pylibraft/neighbors/refine.pyx
+++ b/python/pylibraft/pylibraft/neighbors/refine.pyx
@@ -192,19 +192,19 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
    queries : array interface compliant matrix, shape (n_queries, dim)
        Supported dtype [float, int8, uint8]
    candidates : array interface compliant matrix, shape (n_queries, k0)
-        dtype int64
+        Supported dtype int64
    k : int
        Number of neighbors to search (k <= k0). Optional if indices or
        distances arrays are given (in which case their second dimension
        is k).
-    indices : Optional array interface compliant matrix shape
-              (n_queries, k), dtype int64. If supplied, neighbor
-              indices will be written here in-place. (default None)
-        Supported dtype int64
-    distances : Optional array interface compliant matrix shape
-                (n_queries, k), dtype float. If supplied, neighbor
-                indices will be written here in-place. (default None)
-
+    indices : Optional array interface compliant matrix shape \
+        (n_queries, k).
+        If supplied, neighbor indices will be written here in-place.
+        (default None). Supported dtype int64.
+    distances : Optional array interface compliant matrix shape \
+        (n_queries, k).
+        If supplied, neighbor distances will be written here in-place.
+        (default None). Supported dtype float.
    {handle_docstring}

    Returns
@@ -213,36 +213,30 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
    Examples
    --------
-
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq, refine
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
-
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...
dtype=cp.float32) >>> handle = DeviceResources() - >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean", + >>> index_params = ivf_pq.IndexParams(n_lists=1024, + ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 40 >>> _, candidates = ivf_pq.search(ivf_pq.SearchParams(), index, ... queries, k, handle=handle) - >>> k = 10 >>> distances, neighbors = refine(dataset, queries, candidates, k, ... handle=handle) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync()
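[Editor's note: the `indices`/`distances` parameter docs above mention only in passing that refine can write into preallocated output arrays, in which case `k` is inferred from their second dimension. The sketch below is an editor's illustration of that mode, built only from the APIs shown in this patch; it is not part of the diff.]

```python
# Editor's sketch (not part of the patch): preallocating refine's outputs.
import cupy as cp
from pylibraft.common import DeviceResources
from pylibraft.neighbors import ivf_pq, refine

dataset = cp.random.random_sample((50000, 50), dtype=cp.float32)
queries = cp.random.random_sample((1000, 50), dtype=cp.float32)
handle = DeviceResources()
index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
# Over-fetch k0=40 candidates with IVF-PQ, then refine down to k=10.
_, candidates = ivf_pq.search(ivf_pq.SearchParams(), index, queries, 40,
                              handle=handle)
indices = cp.empty((1000, 10), dtype=cp.int64)
distances = cp.empty((1000, 10), dtype=cp.float32)
# k is omitted: it is inferred from the preallocated arrays' second dimension,
# and the results are written in-place into `indices` and `distances`.
refine(dataset, queries, candidates, indices=indices, distances=distances,
       handle=handle)
handle.sync()
```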