Add Spearman rank correlation to DataFrame.corr and Series.corr.

DataFrame.corr and Series.corr accept method="spearman", implemented by
ranking the inputs and then computing the Pearson correlation of the
ranks. Tests are parametrized over both methods and compared to pandas.

Review notes for this revision of the patch:
  * DataFrame.corr keeps its @_cudf_nvtx_annotate decorator (the earlier
    revision deleted it without re-adding it).
  * The DataFrame.corr docstring has a blank line between the summary
    and the Parameters section.
  * The unrelated hunk that removed a blank line from the
    DataFrame.to_struct docstring has been dropped.
  * The post-image blob hash on the dataframe.py "index" line predates
    these adjustments and is stale; regenerate before using 3-way apply.
  * Whitespace was reconstructed from a collapsed copy of the patch;
    verify context lines against the target tree before applying.

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 39ae9c774e5..28ed67c9b0b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5653,9 +5653,26 @@ def cov(self, **kwargs):
         return df

     @_cudf_nvtx_annotate
-    def corr(self):
-        """Compute the correlation matrix of a DataFrame."""
-        corr = cupy.corrcoef(self.values, rowvar=False)
+    def corr(self, method="pearson"):
+        """Compute the correlation matrix of a DataFrame.
+
+        Parameters
+        ----------
+        method : {'pearson', 'spearman'}, default 'pearson'
+            The correlation method to use, one of 'pearson' or 'spearman'.
+
+        Returns
+        -------
+        DataFrame
+            The requested correlation matrix.
+        """
+        if method == "pearson":
+            values = self.values
+        elif method == "spearman":
+            values = self.rank().values
+        else:
+            raise ValueError("method must be either 'pearson', 'spearman'")
+        corr = cupy.corrcoef(values, rowvar=False)
         cols = self._data.to_pandas_index()
         df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
         df._set_column_names_like(self)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ef5850ecc17..67f999d6d2b 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2422,11 +2422,13 @@ def corr(self, other, method="pearson", min_periods=None):
         >>> import cudf
         >>> ser1 = cudf.Series([0.9, 0.13, 0.62])
         >>> ser2 = cudf.Series([0.12, 0.26, 0.51])
-        >>> ser1.corr(ser2)
+        >>> ser1.corr(ser2, method="pearson")
         -0.20454263717316112
+        >>> ser1.corr(ser2, method="spearman")
+        -0.5
         """

-        if method not in ("pearson",):
+        if method not in {"pearson", "spearman"}:
             raise ValueError(f"Unknown method {method}")

         if min_periods not in (None,):
@@ -2438,6 +2440,9 @@ def corr(self, other, method="pearson", min_periods=None):
         lhs = self.nans_to_nulls().dropna()
         rhs = other.nans_to_nulls().dropna()
         lhs, rhs = _align_indices([lhs, rhs], how="inner")
+        if method == "spearman":
+            lhs = lhs.rank()
+            rhs = rhs.rank()

         try:
             return lhs._column.corr(rhs._column)
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 7bf339d6ab7..782ec52e30d 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -408,24 +408,26 @@ def test_cov1d(data1, data2):
         cudf.Series([5]),
     ],
 )
-def test_corr1d(data1, data2):
+@pytest.mark.parametrize("method", ["spearman", "pearson"])
+def test_corr1d(data1, data2, method):
     gs1 = cudf.Series(data1)
     gs2 = cudf.Series(data2)

     ps1 = gs1.to_pandas()
     ps2 = gs2.to_pandas()

-    got = gs1.corr(gs2)
-    expected = ps1.corr(ps2)
+    got = gs1.corr(gs2, method)
+    expected = ps1.corr(ps2, method)
     np.testing.assert_approx_equal(got, expected, significant=8)


-def test_df_corr():
+@pytest.mark.parametrize("method", ["spearman", "pearson"])
+def test_df_corr(method):

     gdf = randomdata(100, {str(x): float for x in range(50)})
     pdf = gdf.to_pandas()

-    got = gdf.corr()
-    expected = pdf.corr()
+    got = gdf.corr(method)
+    expected = pdf.corr(method)

     assert_eq(got, expected)