From d1bea11c15932882aa7d829b559ad8e42a0eea25 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 May 2022 18:33:53 -0400 Subject: [PATCH 01/10] Some fixes to pairwise distances --- .../pylibraft/distance/pairwise_distance.pyx | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index e667015ac8..a6b78a05aa 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -68,21 +68,25 @@ DISTANCE_TYPES = { "jensenshannon": DistanceType.JensenShannon, "hamming": DistanceType.HammingUnexpanded, "kl_divergence": DistanceType.KLDivergence, + "minkowski": DistanceType.LpUnexpanded, "russellrao": DistanceType.RusselRaoExpanded, "dice": DistanceType.DiceExpanded } -SUPPORTED_DISTANCES = list(DISTANCE_TYPES.keys()) +SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product", + "chebyshev", "minkowski", "canberra", "kl_divergence", + "correlation", "russellrao", "hellinger", "lp", + "hamming", "jensenshannon"] -def distance(X, Y, dists, metric="euclidean"): +def distance(X, Y, dists, metric="euclidean", p=0.0): """ Compute pairwise distances between X and Y Valid values for metric: ["euclidean", "l2", "l1", "cityblock", "inner_product", "chebyshev", "canberra", "lp", "hellinger", "jensenshannon", - "kl_divergence", "russellrao"] + "kl_divergence", "russellrao", "minkowski", "correlation"] Parameters ---------- @@ -132,6 +136,12 @@ def distance(X, Y, dists, metric="euclidean"): y_dt = np.dtype(y_cai["typestr"]) d_dt = np.dtype(dists_cai["typestr"]) + x_c_contiguous = "strides" not in x_cai or x_cai["strides"] is None + y_c_contiguous = "strides" not in y_cai or y_cai["strides"] is None + + if x_c_contiguous != y_c_contiguous: + raise ValueError("Inputs must have matching strides") + if metric not in SUPPORTED_DISTANCES: raise ValueError("metric %s is not supported" % metric) @@ -149,8 +159,8 @@ def distance(X, Y, dists, metric="euclidean"): n, k, distance_type, - True, - 0.0) + x_c_contiguous, + p) elif x_dt == np.float64: pairwise_distance(deref(h), x_ptr, @@ -160,7 +170,7 @@ def distance(X, Y, dists, metric="euclidean"): n, k, distance_type, - True, - 0.0) + x_c_contiguous, + p) else: raise ValueError("dtype %s not supported" % x_dt) From d5d6619460d75c136b4f3dee0cd880a719988192 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 May 2022 18:54:02 -0400 Subject: [PATCH 02/10] Updates --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index a6b78a05aa..181b95f0d5 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -79,7 +79,7 @@ SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product", "hamming", "jensenshannon"] -def distance(X, Y, dists, metric="euclidean", p=0.0): +def distance(X, Y, dists, metric="euclidean", p=2.0): """ Compute pairwise distances between X and Y @@ -139,6 +139,8 @@ def distance(X, Y, dists, metric="euclidean", p=0.0): x_c_contiguous = "strides" not in x_cai or x_cai["strides"] is None y_c_contiguous = "strides" not in y_cai or y_cai["strides"] is None + print(str(x_c_contiguous)) + if x_c_contiguous != y_c_contiguous: raise ValueError("Inputs must have matching strides") From dca96bf546449fe4bee1d25bb7eec108579fafb8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 May 2022 18:55:36 -0400 Subject: [PATCH 03/10] Adding additional validation --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 181b95f0d5..da1b77fb27 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -117,13 +117,16 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): pairwise_distance(in1, in2, output, metric="euclidean") """ - # TODO: Validate inputs, shapes, etc... x_cai = X.__cuda_array_interface__ y_cai = Y.__cuda_array_interface__ dists_cai = dists.__cuda_array_interface__ m = x_cai["shape"][0] n = y_cai["shape"][0] + + if x_cai["shape"][1] != y_cai["shape"][1]: + raise ValueError("Inputs must have same number of columns.") + k = x_cai["shape"][1] x_ptr = x_cai["data"][0] @@ -139,8 +142,6 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): x_c_contiguous = "strides" not in x_cai or x_cai["strides"] is None y_c_contiguous = "strides" not in y_cai or y_cai["strides"] is None - print(str(x_c_contiguous)) - if x_c_contiguous != y_c_contiguous: raise ValueError("Inputs must have matching strides") From beb0b8459f147207418e573b893bf0e750896e0c Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 May 2022 19:43:12 -0400 Subject: [PATCH 04/10] Adding cosine and sqeuclidean --- python/pylibraft/pylibraft/distance/pairwise_distance.pyx | 7 +++++-- python/pylibraft/pylibraft/test/test_distance.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index da1b77fb27..5c7fe3f992 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -54,12 +54,14 @@ cdef extern from "raft_distance/pairwise_distance.hpp" \ DISTANCE_TYPES = { "l2": DistanceType.L2SqrtUnexpanded, + "sqeuclidean": DistanceType.L2Unexpanded, "euclidean": DistanceType.L2SqrtUnexpanded, "l1": DistanceType.L1, "cityblock": DistanceType.L1, "inner_product": DistanceType.InnerProduct, "chebyshev": DistanceType.Linf, "canberra": DistanceType.Canberra, + "cosine": DistanceType.CosineExpanded, "lp": DistanceType.LpUnexpanded, "correlation": DistanceType.CorrelationExpanded, "jaccard": DistanceType.JaccardExpanded, @@ -76,7 +78,7 @@ DISTANCE_TYPES = { SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product", "chebyshev", "minkowski", "canberra", "kl_divergence", "correlation", "russellrao", "hellinger", "lp", - "hamming", "jensenshannon"] + "hamming", "jensenshannon", "cosine", "sqeuclidean"] def distance(X, Y, dists, metric="euclidean", p=2.0): @@ -86,7 +88,8 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): Valid values for metric: ["euclidean", "l2", "l1", "cityblock", "inner_product", "chebyshev", "canberra", "lp", "hellinger", "jensenshannon", - "kl_divergence", "russellrao", "minkowski", "correlation"] + "kl_divergence", "russellrao", "minkowski", "correlation", + "cosine"] Parameters ---------- diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index 594f6e2f66..bb6f872666 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -49,7 +49,7 @@ def copy_to_host(self): @pytest.mark.parametrize("n_cols", [100]) @pytest.mark.parametrize("metric", ["euclidean", "cityblock", "chebyshev", "canberra", "correlation", "hamming", - "jensenshannon", "russellrao"]) + "jensenshannon", "russellrao", "cosine"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_distance(n_rows, n_cols, metric, dtype): input1 = np.random.random_sample((n_rows, n_cols)).astype(dtype) From 91941005ad18ea7af927b2b864160f2a75bc25ad Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 May 2022 19:43:47 -0400 Subject: [PATCH 05/10] Adding test for sqeuclidean --- python/pylibraft/pylibraft/test/test_distance.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index bb6f872666..77845b539c 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -49,7 +49,8 @@ def copy_to_host(self): @pytest.mark.parametrize("n_cols", [100]) @pytest.mark.parametrize("metric", ["euclidean", "cityblock", "chebyshev", "canberra", "correlation", "hamming", - "jensenshannon", "russellrao", "cosine"]) + "jensenshannon", "russellrao", "cosine", + "sqeuclidean"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_distance(n_rows, n_cols, metric, dtype): input1 = np.random.random_sample((n_rows, n_cols)).astype(dtype) From c691e78c093a4f9239ff6ddc846c1de0e93a8b09 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 10 May 2022 20:11:00 -0400 Subject: [PATCH 06/10] Testing both row- and col- major --- python/pylibraft/pylibraft/test/test_distance.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/pylibraft/pylibraft/test/test_distance.py b/python/pylibraft/pylibraft/test/test_distance.py index 77845b539c..d4f73ecf2b 100644 --- a/python/pylibraft/pylibraft/test/test_distance.py +++ b/python/pylibraft/pylibraft/test/test_distance.py @@ -24,10 +24,10 @@ class TestDeviceBuffer: - def __init__(self, ndarray): + def __init__(self, ndarray, order): self.ndarray_ = ndarray self.device_buffer_ = \ - rmm.DeviceBuffer.to_device(ndarray.ravel(order="C").tobytes()) + rmm.DeviceBuffer.to_device(ndarray.ravel(order=order).tobytes()) @property def __cuda_array_interface__(self): @@ -51,9 +51,11 @@ def copy_to_host(self): "canberra", "correlation", "hamming", "jensenshannon", "russellrao", "cosine", "sqeuclidean"]) +@pytest.mark.parametrize("order", ["F", "C"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_distance(n_rows, n_cols, metric, dtype): - input1 = np.random.random_sample((n_rows, n_cols)).astype(dtype) +def test_distance(n_rows, n_cols, metric, order, dtype): + input1 = np.random.random_sample((n_rows, n_cols)) + input1 = np.asarray(input1, order=order).astype(dtype) # RussellRao expects boolean arrays if metric == "russellrao": @@ -71,8 +73,8 @@ def test_distance(n_rows, n_cols, metric, dtype): expected[expected <= 1e-5] = 0.0 - input1_device = TestDeviceBuffer(input1) - output_device = TestDeviceBuffer(output) + input1_device = TestDeviceBuffer(input1, order) + output_device = TestDeviceBuffer(output, order) pairwise_distance(input1_device, input1_device, output_device, metric) actual = output_device.copy_to_host() From e1673a9e3acb30c6d3be2ccb826ac7c466e94425 Mon Sep 17 00:00:00 2001 From: Vinay D Date: Wed, 11 May 2022 23:06:35 +0530 Subject: [PATCH 07/10] Syncronizing on handle.get_stream() stream as uniform is executed in that stream --- cpp/test/linalg/divide.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 914ef21269..96fdba820b 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -59,6 +59,7 @@ class DivideTest : public ::testing::TestWithParam Date: Wed, 11 May 2022 14:11:30 -0400 Subject: [PATCH 08/10] Review feedback --- .../pylibraft/pylibraft/distance/pairwise_distance.pyx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 5c7fe3f992..4aad344aac 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -27,6 +27,12 @@ from libcpp cimport bool from .distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t +def is_c_cont(cai, dt): + return "strides" not in cai or \ + cai["strides"] is None or \ + cai["strides"][1] == dt.itemsize + + cdef extern from "raft_distance/pairwise_distance.hpp" \ namespace "raft::distance::runtime": @@ -142,8 +148,8 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): y_dt = np.dtype(y_cai["typestr"]) d_dt = np.dtype(dists_cai["typestr"]) - x_c_contiguous = "strides" not in x_cai or x_cai["strides"] is None - y_c_contiguous = "strides" not in y_cai or y_cai["strides"] is None + x_c_contiguous = is_c_cont(x_cai, x_dt) + y_c_contiguous = is_c_cont(y_cai, y_dt) if x_c_contiguous != y_c_contiguous: raise ValueError("Inputs must have matching strides") From 2917f5ef9ca412113caec737df995832ddc83c34 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 11 May 2022 14:23:02 -0400 Subject: [PATCH 09/10] More review feedback --- .../pylibraft/distance/pairwise_distance.pyx | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx index 4aad344aac..8d55402e23 100644 --- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx +++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx @@ -27,10 +27,11 @@ from libcpp cimport bool from .distance_type cimport DistanceType from pylibraft.common.handle cimport handle_t + def is_c_cont(cai, dt): - return "strides" not in cai or \ - cai["strides"] is None or \ - cai["strides"][1] == dt.itemsize + return "strides" not in cai or \ + cai["strides"] is None or \ + cai["strides"][1] == dt.itemsize cdef extern from "raft_distance/pairwise_distance.hpp" \ @@ -104,6 +105,7 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): Y : CUDA array interface compliant matrix shape (n, k) dists : Writable CUDA array interface matrix shape (m, n) metric : string denoting the metric type (default="euclidean") + p : metric parameter (currently used only for "minkowski") Examples -------- @@ -133,10 +135,12 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): m = x_cai["shape"][0] n = y_cai["shape"][0] - if x_cai["shape"][1] != y_cai["shape"][1]: - raise ValueError("Inputs must have same number of columns.") + x_k = x_cai["shape"][1] + y_k = y_cai["shape"][1] - k = x_cai["shape"][1] + if x_k != y_k: + raise ValueError("Inputs must have same number of columns. " + "a=%s, b=%s" % (x_k, y_k)) x_ptr = x_cai["data"][0] y_ptr = y_cai["data"][0] @@ -169,7 +173,7 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): d_ptr, m, n, - k, + x_k, distance_type, x_c_contiguous, p) @@ -180,7 +184,7 @@ def distance(X, Y, dists, metric="euclidean", p=2.0): d_ptr, m, n, - k, + x_k, distance_type, x_c_contiguous, p) From 5c9c9a44a02572df97035b55ca6e5e6f9623f6ca Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 11 May 2022 16:47:22 -0400 Subject: [PATCH 10/10] Removing explicit stream create --- cpp/test/linalg/divide.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 96fdba820b..d620979c2f 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -57,9 +57,7 @@ class DivideTest : public ::testing::TestWithParam