From d233a2cba9108b37727440e88d0ad6e406d28d5f Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Thu, 19 Jan 2023 19:34:52 -0800 Subject: [PATCH] Use sqeuclidean for metric name in ivf_pq python bindings (#1160) Use sqeuclidean instead of l2_expanded for the distance name in the ivf_pq python bindings. This matches both sklearn and the RAFT pairwise_distance API, and should be less confusing for our users. Authors: - Ben Frederickson (https://github.com/benfred) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/1160 --- .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 23 ++++++++++------ .../pylibraft/pylibraft/neighbors/refine.pyx | 6 ++--- .../pylibraft/pylibraft/test/test_ivf_pq.py | 26 +++++++++---------- .../pylibraft/pylibraft/test/test_refine.py | 8 +++--- 4 files changed, 35 insertions(+), 28 deletions(-) diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index ee30864193..8f8a49fb63 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -18,6 +18,8 @@ # cython: embedsignature = True # cython: language_level = 3 +import warnings + import numpy as np from cython.operator cimport dereference as deref @@ -63,17 +65,22 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport ( def _get_metric(metric): SUPPORTED_DISTANCES = { - "l2_expanded": DistanceType.L2Expanded, + "sqeuclidean": DistanceType.L2Expanded, "euclidean": DistanceType.L2SqrtExpanded, "inner_product": DistanceType.InnerProduct } if metric not in SUPPORTED_DISTANCES: + if metric == "l2_expanded": + warnings.warn("Using l2_expanded as a metric name is deprecated," + " use sqeuclidean instead", FutureWarning) + return DistanceType.L2Expanded + raise ValueError("metric %s is not supported" % metric) return SUPPORTED_DISTANCES[metric] cdef _get_metric_string(DistanceType metric): - return 
{DistanceType.L2Expanded : "l2_expanded", + return {DistanceType.L2Expanded : "sqeuclidean", DistanceType.InnerProduct: "inner_product", DistanceType.L2SqrtExpanded: "euclidean"}[metric] @@ -118,7 +125,7 @@ cdef class IndexParams: def __init__(self, *, n_lists=1024, - metric="l2_expanded", + metric="sqeuclidean", kmeans_n_iters=20, kmeans_trainset_fraction=0.5, pq_bits=8, @@ -133,10 +140,10 @@ cdef class IndexParams: ---------- n_list : int, default = 1024 The number of clusters used in the coarse quantizer. - metric : string denoting the metric type, default="l2_expanded" - Valid values for metric: ["l2_expanded", "inner_product", + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean", "inner_product", "euclidean"], where - - l2_expanded is the euclidean distance without the square root + - sqeuclidean is the euclidean distance without the square root operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - euclidean is the euclidean distance - inner product distance is defined as @@ -251,7 +258,7 @@ cdef class Index: # We create a placeholder object. The actual parameter values do # not matter, it will be replaced with a built index object later. self.index = new c_ivf_pq.index[uint64_t]( - deref(handle_), _get_metric("l2_expanded"), + deref(handle_), _get_metric("sqeuclidean"), c_ivf_pq.codebook_gen.PER_SUBSPACE, 1, 4, @@ -347,7 +354,7 @@ def build(IndexParams index_params, dataset, handle=None): >>> handle = Handle() >>> index_params = ivf_pq.IndexParams( ... n_lists=1024, - ... metric="l2_expanded", + ... metric="sqeuclidean", ... 
pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx index 37ef69e7b5..b8f1bd0caa 100644 --- a/python/pylibraft/pylibraft/neighbors/refine.pyx +++ b/python/pylibraft/pylibraft/neighbors/refine.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -215,7 +215,7 @@ cdef host_matrix_view[int8_t, uint64_t, row_major] \ @auto_sync_handle @auto_convert_output def refine(dataset, queries, candidates, k=None, indices=None, distances=None, - metric="l2_expanded", handle=None): + metric="sqeuclidean", handle=None): """ Refine nearest neighbor search. @@ -271,7 +271,7 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None, >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = Handle() - >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="l2_expanded", + >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean", ... 
pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py index db1389c6cd..6952408c02 100644 --- a/python/pylibraft/pylibraft/test/test_ivf_pq.py +++ b/python/pylibraft/pylibraft/test/test_ivf_pq.py @@ -58,7 +58,7 @@ def check_distances(dataset, queries, metric, out_idx, out_dist, eps=None): for i in range(queries.shape[0]): X = queries[np.newaxis, i, :] Y = dataset[out_idx[i, :], :] - if metric == "l2_expanded": + if metric == "sqeuclidean": dist[i, :] = pairwise_distances(X, Y, "sqeuclidean") elif metric == "euclidean": dist[i, :] = pairwise_distances(X, Y, "euclidean") @@ -177,7 +177,7 @@ def run_ivf_pq_build_search_test( # Calculate reference values with sklearn skl_metric = { - "l2_expanded": "sqeuclidean", + "sqeuclidean": "sqeuclidean", "inner_product": "cosine", "euclidean": "euclidean", }[metric] @@ -204,14 +204,14 @@ def test_ivf_pq_dtypes( n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type ): # Note that inner_product tests use normalized input which we cannot - # represent in int8, therefore we test only l2_expanded metric here. + # represent in int8, therefore we test only sqeuclidean metric here. 
run_ivf_pq_build_search_test( n_rows=n_rows, n_cols=n_cols, n_queries=n_queries, k=10, n_lists=n_lists, - metric="l2_expanded", + metric="sqeuclidean", dtype=dtype, inplace=inplace, array_type=array_type, @@ -246,14 +246,14 @@ def test_ivf_pq_n(params): n_queries=params["n_queries"], k=params["k"], n_lists=params["n_lists"], - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float32, compare=False, ) @pytest.mark.parametrize( - "metric", ["l2_expanded", "inner_product", "euclidean"] + "metric", ["sqeuclidean", "inner_product", "euclidean"] ) @pytest.mark.parametrize("dtype", [np.float32]) @pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"]) @@ -298,7 +298,7 @@ def test_ivf_pq_params(params): n_queries=1000, k=10, n_lists=params["n_lists"], - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float32, pq_bits=params["pq_bits"], pq_dim=params["pq_dims"], @@ -344,7 +344,7 @@ def test_ivf_pq_search_params(params): k=params["k"], n_lists=100, n_probes=params["n_probes"], - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float32, lut_dtype=params["lut"], internal_distance_dtype=params["idd"], @@ -360,7 +360,7 @@ def test_extend(dtype, array_type): n_queries=100, k=10, n_lists=100, - metric="l2_expanded", + metric="sqeuclidean", dtype=dtype, add_data_on_build=False, array_type=array_type, @@ -375,7 +375,7 @@ def test_build_assertions(): n_queries=100, k=10, n_lists=100, - metric="l2_expanded", + metric="sqeuclidean", dtype=np.float64, ) @@ -388,7 +388,7 @@ def test_build_assertions(): index_params = ivf_pq.IndexParams( n_lists=50, - metric="l2_expanded", + metric="sqeuclidean", kmeans_n_iters=20, kmeans_trainset_fraction=1, add_data_on_build=False, @@ -482,7 +482,7 @@ def test_search_inputs(params): out_dist_device = device_ndarray(out_dist) index_params = ivf_pq.IndexParams( - n_lists=50, metric="l2_expanded", add_data_on_build=True + n_lists=50, metric="sqeuclidean", add_data_on_build=True ) dataset = generate_data((n_rows, n_cols), dtype) 
@@ -511,7 +511,7 @@ def test_save_load(): dataset = generate_data((n_rows, n_cols), dtype) dataset_device = device_ndarray(dataset) - build_params = ivf_pq.IndexParams(n_lists=100, metric="l2_expanded") + build_params = ivf_pq.IndexParams(n_lists=100, metric="sqeuclidean") index = ivf_pq.build(build_params, dataset_device) assert index.trained diff --git a/python/pylibraft/pylibraft/test/test_refine.py b/python/pylibraft/pylibraft/test/test_refine.py index 2f3bef2e0c..8502d0575c 100644 --- a/python/pylibraft/pylibraft/test/test_refine.py +++ b/python/pylibraft/pylibraft/test/test_refine.py @@ -27,7 +27,7 @@ def run_refine( n_rows=500, n_cols=50, n_queries=100, - metric="l2_expanded", + metric="sqeuclidean", k0=40, k=10, inplace=False, @@ -49,7 +49,7 @@ def run_refine( queries_device = device_ndarray(queries) # Calculate reference values with sklearn - skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[ + skl_metric = {"sqeuclidean": "euclidean", "inner_product": "cosine"}[ metric ] nn_skl = NearestNeighbors( @@ -106,7 +106,7 @@ def run_refine( if recall <= 0.999: # We did not find the same neighbor indices. # We could have found other neighbor with same distance. - if metric == "l2_expanded": + if metric == "sqeuclidean": skl_dist = np.power(skl_dist[:, :k], 2) elif metric == "inner_product": skl_dist = 1 - skl_dist[:, :k] @@ -120,7 +120,7 @@ def run_refine( @pytest.mark.parametrize("n_queries", [100, 1024, 37]) @pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"]) +@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"]) @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) @pytest.mark.parametrize("memory_type", ["device", "host"]) def test_refine_dtypes(n_queries, dtype, inplace, metric, memory_type):