Skip to content

Commit

Permalink
Use sqeuclidean for metric name in ivf_pq python bindings (rapidsai#…
Browse files Browse the repository at this point in the history
…1160)

Use sqeuclidean instead of l2_expanded for the distance name in the ivf_pq python bindings. This matches both sklearn and the RAFT pairwise_distance API, and should be less confusing for our users.

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: rapidsai#1160
  • Loading branch information
benfred authored and ahendriksen committed Jan 23, 2023
1 parent 2316061 commit 165e940
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 28 deletions.
23 changes: 15 additions & 8 deletions python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
# cython: embedsignature = True
# cython: language_level = 3

import warnings

import numpy as np

from cython.operator cimport dereference as deref
Expand Down Expand Up @@ -63,17 +65,22 @@ from pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq cimport (

def _get_metric(metric):
    """
    Translate a metric name into the matching ``DistanceType`` value.

    Parameters
    ----------
    metric : string denoting the metric type
        One of "sqeuclidean", "euclidean" or "inner_product". The legacy
        name "l2_expanded" is still accepted as a deprecated alias of
        "sqeuclidean".

    Returns
    -------
    DistanceType
        The enum value registered for ``metric``.

    Raises
    ------
    ValueError
        If ``metric`` is neither a supported name nor the deprecated alias.

    Warns
    -----
    FutureWarning
        If the deprecated name "l2_expanded" is used.
    """
    # "l2_expanded" is deliberately NOT listed here: keeping it in the table
    # would make the lookup succeed silently and the deprecation branch
    # below would never run.
    SUPPORTED_DISTANCES = {
        "sqeuclidean": DistanceType.L2Expanded,
        "euclidean": DistanceType.L2SqrtExpanded,
        "inner_product": DistanceType.InnerProduct
    }
    if metric not in SUPPORTED_DISTANCES:
        if metric == "l2_expanded":
            # Backward compatibility: accept the old name, but tell the
            # caller to migrate to the new one.
            warnings.warn("Using l2_expanded as a metric name is deprecated,"
                          " use sqeuclidean instead", FutureWarning)
            return DistanceType.L2Expanded

        raise ValueError("metric %s is not supported" % metric)
    return SUPPORTED_DISTANCES[metric]


cdef _get_metric_string(DistanceType metric):
    # Map a DistanceType value back to its metric name. L2Expanded is
    # reported as "sqeuclidean" (not the legacy "l2_expanded") so that the
    # returned string is one of the names _get_metric accepts without a
    # deprecation warning. A DistanceType that is not listed here raises
    # KeyError from the dict lookup.
    return {DistanceType.L2Expanded : "sqeuclidean",
            DistanceType.InnerProduct: "inner_product",
            DistanceType.L2SqrtExpanded: "euclidean"}[metric]

Expand Down Expand Up @@ -118,7 +125,7 @@ cdef class IndexParams:

def __init__(self, *,
n_lists=1024,
metric="l2_expanded",
metric="sqeuclidean",
kmeans_n_iters=20,
kmeans_trainset_fraction=0.5,
pq_bits=8,
Expand All @@ -133,10 +140,10 @@ cdef class IndexParams:
----------
n_list : int, default = 1024
The number of clusters used in the coarse quantizer.
metric : string denoting the metric type, default="l2_expanded"
Valid values for metric: ["l2_expanded", "inner_product",
metric : string denoting the metric type, default="sqeuclidean"
Valid values for metric: ["sqeuclidean", "inner_product",
"euclidean"], where
- l2_expanded is the euclidean distance without the square root
- sqeuclidean is the euclidean distance without the square root
operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
- euclidean is the euclidean distance
- inner product distance is defined as
Expand Down Expand Up @@ -251,7 +258,7 @@ cdef class Index:
# We create a placeholder object. The actual parameter values do
# not matter, it will be replaced with a built index object later.
self.index = new c_ivf_pq.index[uint64_t](
deref(handle_), _get_metric("l2_expanded"),
deref(handle_), _get_metric("sqeuclidean"),
c_ivf_pq.codebook_gen.PER_SUBSPACE,
<uint32_t>1,
<uint32_t>4,
Expand Down Expand Up @@ -347,7 +354,7 @@ def build(IndexParams index_params, dataset, handle=None):
>>> handle = Handle()
>>> index_params = ivf_pq.IndexParams(
... n_lists=1024,
... metric="l2_expanded",
... metric="sqeuclidean",
... pq_dim=10)
>>> index = ivf_pq.build(index_params, dataset, handle=handle)
Expand Down
6 changes: 3 additions & 3 deletions python/pylibraft/pylibraft/neighbors/refine.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -215,7 +215,7 @@ cdef host_matrix_view[int8_t, uint64_t, row_major] \
@auto_sync_handle
@auto_convert_output
def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
metric="l2_expanded", handle=None):
metric="sqeuclidean", handle=None):
"""
Refine nearest neighbor search.
Expand Down Expand Up @@ -271,7 +271,7 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
>>> dataset = cp.random.random_sample((n_samples, n_features),
... dtype=cp.float32)
>>> handle = Handle()
>>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="l2_expanded",
>>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean",
... pq_dim=10)
>>> index = ivf_pq.build(index_params, dataset, handle=handle)
Expand Down
26 changes: 13 additions & 13 deletions python/pylibraft/pylibraft/test/test_ivf_pq.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def check_distances(dataset, queries, metric, out_idx, out_dist, eps=None):
for i in range(queries.shape[0]):
X = queries[np.newaxis, i, :]
Y = dataset[out_idx[i, :], :]
if metric == "l2_expanded":
if metric == "sqeuclidean":
dist[i, :] = pairwise_distances(X, Y, "sqeuclidean")
elif metric == "euclidean":
dist[i, :] = pairwise_distances(X, Y, "euclidean")
Expand Down Expand Up @@ -177,7 +177,7 @@ def run_ivf_pq_build_search_test(

# Calculate reference values with sklearn
skl_metric = {
"l2_expanded": "sqeuclidean",
"sqeuclidean": "sqeuclidean",
"inner_product": "cosine",
"euclidean": "euclidean",
}[metric]
Expand All @@ -204,14 +204,14 @@ def test_ivf_pq_dtypes(
n_rows, n_cols, n_queries, n_lists, dtype, inplace, array_type
):
# Note that inner_product tests use normalized input which we cannot
# represent in int8, therefore we test only l2_expanded metric here.
# represent in int8, therefore we test only sqeuclidean metric here.
run_ivf_pq_build_search_test(
n_rows=n_rows,
n_cols=n_cols,
n_queries=n_queries,
k=10,
n_lists=n_lists,
metric="l2_expanded",
metric="sqeuclidean",
dtype=dtype,
inplace=inplace,
array_type=array_type,
Expand Down Expand Up @@ -246,14 +246,14 @@ def test_ivf_pq_n(params):
n_queries=params["n_queries"],
k=params["k"],
n_lists=params["n_lists"],
metric="l2_expanded",
metric="sqeuclidean",
dtype=np.float32,
compare=False,
)


@pytest.mark.parametrize(
"metric", ["l2_expanded", "inner_product", "euclidean"]
"metric", ["sqeuclidean", "inner_product", "euclidean"]
)
@pytest.mark.parametrize("dtype", [np.float32])
@pytest.mark.parametrize("codebook_kind", ["subspace", "cluster"])
Expand Down Expand Up @@ -298,7 +298,7 @@ def test_ivf_pq_params(params):
n_queries=1000,
k=10,
n_lists=params["n_lists"],
metric="l2_expanded",
metric="sqeuclidean",
dtype=np.float32,
pq_bits=params["pq_bits"],
pq_dim=params["pq_dims"],
Expand Down Expand Up @@ -344,7 +344,7 @@ def test_ivf_pq_search_params(params):
k=params["k"],
n_lists=100,
n_probes=params["n_probes"],
metric="l2_expanded",
metric="sqeuclidean",
dtype=np.float32,
lut_dtype=params["lut"],
internal_distance_dtype=params["idd"],
Expand All @@ -360,7 +360,7 @@ def test_extend(dtype, array_type):
n_queries=100,
k=10,
n_lists=100,
metric="l2_expanded",
metric="sqeuclidean",
dtype=dtype,
add_data_on_build=False,
array_type=array_type,
Expand All @@ -375,7 +375,7 @@ def test_build_assertions():
n_queries=100,
k=10,
n_lists=100,
metric="l2_expanded",
metric="sqeuclidean",
dtype=np.float64,
)

Expand All @@ -388,7 +388,7 @@ def test_build_assertions():

index_params = ivf_pq.IndexParams(
n_lists=50,
metric="l2_expanded",
metric="sqeuclidean",
kmeans_n_iters=20,
kmeans_trainset_fraction=1,
add_data_on_build=False,
Expand Down Expand Up @@ -482,7 +482,7 @@ def test_search_inputs(params):
out_dist_device = device_ndarray(out_dist)

index_params = ivf_pq.IndexParams(
n_lists=50, metric="l2_expanded", add_data_on_build=True
n_lists=50, metric="sqeuclidean", add_data_on_build=True
)

dataset = generate_data((n_rows, n_cols), dtype)
Expand Down Expand Up @@ -511,7 +511,7 @@ def test_save_load():
dataset = generate_data((n_rows, n_cols), dtype)
dataset_device = device_ndarray(dataset)

build_params = ivf_pq.IndexParams(n_lists=100, metric="l2_expanded")
build_params = ivf_pq.IndexParams(n_lists=100, metric="sqeuclidean")
index = ivf_pq.build(build_params, dataset_device)

assert index.trained
Expand Down
8 changes: 4 additions & 4 deletions python/pylibraft/pylibraft/test/test_refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def run_refine(
n_rows=500,
n_cols=50,
n_queries=100,
metric="l2_expanded",
metric="sqeuclidean",
k0=40,
k=10,
inplace=False,
Expand All @@ -49,7 +49,7 @@ def run_refine(
queries_device = device_ndarray(queries)

# Calculate reference values with sklearn
skl_metric = {"l2_expanded": "euclidean", "inner_product": "cosine"}[
skl_metric = {"sqeuclidean": "euclidean", "inner_product": "cosine"}[
metric
]
nn_skl = NearestNeighbors(
Expand Down Expand Up @@ -106,7 +106,7 @@ def run_refine(
if recall <= 0.999:
# We did not find the same neighbor indices.
# We could have found other neighbor with same distance.
if metric == "l2_expanded":
if metric == "sqeuclidean":
skl_dist = np.power(skl_dist[:, :k], 2)
elif metric == "inner_product":
skl_dist = 1 - skl_dist[:, :k]
Expand All @@ -120,7 +120,7 @@ def run_refine(

@pytest.mark.parametrize("n_queries", [100, 1024, 37])
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize("metric", ["l2_expanded", "inner_product"])
@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
@pytest.mark.parametrize("memory_type", ["device", "host"])
def test_refine_dtypes(n_queries, dtype, inplace, metric, memory_type):
Expand Down

0 comments on commit 165e940

Please sign in to comment.