Use sqeuclidean for metric name in ivf_pq python bindings (#1160)

Use sqeuclidean instead of l2_expanded for the distance name in the ivf_pq python bindings. This matches both sklearn and the RAFT pairwise_distance API, and should be less confusing for our users.
Authors: - Ben Frederickson (https://github.com/benfred)
Approvers: - Corey J. Nolet (https://github.com/cjnolet)
URL: #1160
rapidsai · Jan 20, 2023 · d233a2c · d233a2c
1 parent f2bc24d
commit d233a2c
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 28 deletions.
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -18,6 +18,8 @@
 # cython: embedsignature = True
 # cython: language_level = 3
 
+import warnings
+
 import numpy as np
 
 from cython.operator cimport dereference as deref
@@ -63,17 +65,22 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport (
 
 def _get_metric(metric):
     SUPPORTED_DISTANCES = {
-        "l2_expanded": DistanceType.L2Expanded,
+        "sqeuclidean": DistanceType.L2Expanded,
         "euclidean": DistanceType.L2SqrtExpanded,
         "inner_product": DistanceType.InnerProduct
     }
     if metric not in SUPPORTED_DISTANCES:
+        if metric == "l2_expanded":
+            warnings.warn("Using l2_expanded as a metric name is deprecated,"
+                          " use sqeuclidean instead", FutureWarning)
+            return DistanceType.L2Expanded
+
         raise ValueError("metric %s is not supported" % metric)
     return SUPPORTED_DISTANCES[metric]
 
 
 cdef _get_metric_string(DistanceType metric):
-    return {DistanceType.L2Expanded : "l2_expanded",
+    return {DistanceType.L2Expanded : "sqeuclidean",
             DistanceType.InnerProduct: "inner_product",
             DistanceType.L2SqrtExpanded: "euclidean"}[metric]
 
@@ -118,7 +125,7 @@ cdef class IndexParams:
 
     def __init__(self, *,
                  n_lists=1024,
-                 metric="l2_expanded",
+                 metric="sqeuclidean",
                  kmeans_n_iters=20,
                  kmeans_trainset_fraction=0.5,
                  pq_bits=8,
@@ -133,10 +140,10 @@ cdef class IndexParams:
         ----------
         n_list : int, default = 1024
             The number of clusters used in the coarse quantizer.
-        metric : string denoting the metric type, default="l2_expanded"
-            Valid values for metric: ["l2_expanded", "inner_product",
+        metric : string denoting the metric type, default="sqeuclidean"
+            Valid values for metric: ["sqeuclidean", "inner_product",
             "euclidean"], where
-            - l2_expanded is the euclidean distance without the square root
+            - sqeuclidean is the euclidean distance without the square root
               operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
             - euclidean is the euclidean distance
             - inner product distance is defined as
@@ -251,7 +258,7 @@ cdef class Index:
         # We create a placeholder object. The actual parameter values do
         # not matter, it will be replaced with a built index object later.
         self.index = new c_ivf_pq.index[uint64_t](
-            deref(handle_), _get_metric("l2_expanded"),
+            deref(handle_), _get_metric("sqeuclidean"),
             c_ivf_pq.codebook_gen.PER_SUBSPACE,
             <uint32_t>1,
             <uint32_t>4,
@@ -347,7 +354,7 @@ def build(IndexParams index_params, dataset, handle=None):
     >>> handle = Handle()
     >>> index_params = ivf_pq.IndexParams(
     ...     n_lists=1024,
-    ...     metric="l2_expanded",
+    ...     metric="sqeuclidean",
     ...     pq_dim=10)
     >>> index = ivf_pq.build(index_params, dataset, handle=handle)
 

diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -215,7 +215,7 @@ cdef host_matrix_view[int8_t, uint64_t, row_major] \
 @auto_sync_handle
 @auto_convert_output
 def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
-           metric="l2_expanded", handle=None):
+           metric="sqeuclidean", handle=None):
     """
     Refine nearest neighbor search.
 
@@ -271,7 +271,7 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
     >>> dataset = cp.random.random_sample((n_samples, n_features),
     ...                                   dtype=cp.float32)
     >>> handle = Handle()
-    >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="l2_expanded",
+    >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean",
     ...                                   pq_dim=10)
     >>> index = ivf_pq.build(index_params, dataset, handle=handle)
 

diff --git a/python/pylibraft/pylibraft/test/test_ivf_pq.py b/python/pylibraft/pylibraft/test/test_ivf_pq.py
@@ -58,7 +58,7 @@ def check_distances(dataset, queries, metric, out_idx, out_dist, eps=None):
     for i in range(queries.shape[0]):
         X = queries[np.newaxis, i, :]
         Y = dataset[out_idx[i, :], :]
-        if metric == "l2_expanded":
+        if metric == "sqeuclidean":
             dist[i, :] = pairwise_distances(X, Y, "sqeuclidean")
         elif metric == "euclidean":
             dist[i, :] = pairwise_distances(X, Y, "euclidean")
@@ -177,7 +177,7 @@ def run_ivf_pq_build_search_test(
 
     # Calculate reference values with sklearn
     skl_metric = {
-        "l2_expanded": "sqeuclidean",
+        "sqeuclidean": "sqeuclidean",
         "inner_product": "cosine",
         "euclidean": "euclidean",
     }[metric]
@@ -204,14 +204,14 @@ def test_ivf_pq_dtypes(
     n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type
 ):
     # Note that inner_product tests use normalized input which we cannot
-    # represent in int8, therefore we test only l2_expanded metric here.
+    # represent in int8, therefore we test only sqeuclidean metric here.
     run_ivf_pq_build_search_test(
         n_rows=n_rows,
         n_cols=n_cols,
         n_queries=n_queries,
         k=10,
         n_lists=n_lists,
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=dtype,
         inplace=inplace,
         array_type=array_type,
@@ -246,14 +246,14 @@ def test_ivf_pq_n(params):
         n_queries=params["n_queries"],
         k=params["k"],
         n_lists=params["n_lists"],
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=np.float32,
         compare=False,
     )
 
 
 @pytest.mark.parametrize(
-    "metric", ["l2_expanded", "inner_product", "euclidean"]
+    "metric", ["sqeuclidean", "inner_product", "euclidean"]
 )
 @pytest.mark.parametrize("dtype", [np.float32])
 @pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"])
@@ -298,7 +298,7 @@ def test_ivf_pq_params(params):
         n_queries=1000,
         k=10,
         n_lists=params["n_lists"],
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=np.float32,
         pq_bits=params["pq_bits"],
         pq_dim=params["pq_dims"],
@@ -344,7 +344,7 @@ def test_ivf_pq_search_params(params):
         k=params["k"],
         n_lists=100,
         n_probes=params["n_probes"],
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=np.float32,
         lut_dtype=params["lut"],
         internal_distance_dtype=params["idd"],
@@ -360,7 +360,7 @@ def test_extend(dtype, array_type):
         n_queries=100,
         k=10,
         n_lists=100,
-        metric="l2_expanded",
+        metric="sqeuclidean",
         dtype=dtype,
         add_data_on_build=False,
         array_type=array_type,
@@ -375,7 +375,7 @@ def test_build_assertions():
             n_queries=100,
             k=10,
             n_lists=100,
-            metric="l2_expanded",
+            metric="sqeuclidean",
             dtype=np.float64,
         )
 
@@ -388,7 +388,7 @@ def test_build_assertions():
 
     index_params = ivf_pq.IndexParams(
         n_lists=50,
-        metric="l2_expanded",
+        metric="sqeuclidean",
         kmeans_n_iters=20,
         kmeans_trainset_fraction=1,
         add_data_on_build=False,
@@ -482,7 +482,7 @@ def test_search_inputs(params):
     out_dist_device = device_ndarray(out_dist)
 
     index_params = ivf_pq.IndexParams(
-        n_lists=50, metric="l2_expanded", add_data_on_build=True
+        n_lists=50, metric="sqeuclidean", add_data_on_build=True
     )
 
     dataset = generate_data((n_rows, n_cols), dtype)
@@ -511,7 +511,7 @@ def test_save_load():
     dataset = generate_data((n_rows, n_cols), dtype)
     dataset_device = device_ndarray(dataset)
 
-    build_params = ivf_pq.IndexParams(n_lists=100, metric="l2_expanded")
+    build_params = ivf_pq.IndexParams(n_lists=100, metric="sqeuclidean")
     index = ivf_pq.build(build_params, dataset_device)
 
     assert index.trained

diff --git a/python/pylibraft/pylibraft/test/test_refine.py b/python/pylibraft/pylibraft/test/test_refine.py
@@ -27,7 +27,7 @@ def run_refine(
     n_rows=500,
     n_cols=50,
     n_queries=100,
-    metric="l2_expanded",
+    metric="sqeuclidean",
     k0=40,
     k=10,
     inplace=False,
@@ -49,7 +49,7 @@ def run_refine(
     queries_device = device_ndarray(queries)
 
     # Calculate reference values with sklearn
-    skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[
+    skl_metric = {"sqeuclidean": "euclidean", "inner_product": "cosine"}[
         metric
     ]
     nn_skl = NearestNeighbors(
@@ -106,7 +106,7 @@ def run_refine(
     if recall <= 0.999:
         # We did not find the same neighbor indices.
         # We could have found other neighbor with same distance.
-        if metric == "l2_expanded":
+        if metric == "sqeuclidean":
             skl_dist = np.power(skl_dist[:, :k], 2)
         elif metric == "inner_product":
             skl_dist = 1 - skl_dist[:, :k]
@@ -120,7 +120,7 @@ def run_refine(
 
 @pytest.mark.parametrize("n_queries", [100, 1024, 37])
 @pytest.mark.parametrize("inplace", [True, False])
-@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"])
+@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
 @pytest.mark.parametrize("memory_type", ["device", "host"])
 def test_refine_dtypes(n_queries, dtype, inplace, metric, memory_type):