From ce5bacb273dbcefc1206c7e40230050f5d07d36d Mon Sep 17 00:00:00 2001 From: dominicshanshan Date: Thu, 24 Mar 2022 00:10:33 +0800 Subject: [PATCH] Add 'spearman' correlation method for `dataframe.corr` (#7141) Closes #6804 Adds 'spearman' correlation method for `dataframe.corr` Authors: - https://github.com/dominicshanshan - Vyas Ramasubramani (https://github.com/vyasr) - Michael Wang (https://github.com/isVoid) Approvers: - Michael Wang (https://github.com/isVoid) - Nick Becker (https://github.com/beckernick) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/7141 --- python/cudf/cudf/core/dataframe.py | 24 +++++++++++++++++++----- python/cudf/cudf/core/series.py | 9 +++++++-- python/cudf/cudf/tests/test_stats.py | 14 ++++++++------ 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index afd087c63cf..436363aadae 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5664,10 +5664,25 @@ def cov(self, **kwargs): df._set_column_names_like(self) return df - @_cudf_nvtx_annotate - def corr(self): - """Compute the correlation matrix of a DataFrame.""" - corr = cupy.corrcoef(self.values, rowvar=False) + def corr(self, method="pearson"): + """Compute the correlation matrix of a DataFrame. + Parameters + ---------- + method : {'pearson', 'spearman'}, default 'pearson' + The correlation method to use, one of 'pearson' or 'spearman'. + + Returns + ------- + DataFrame + The requested correlation matrix. + """ + if method == "pearson": + values = self.values + elif method == "spearman": + values = self.rank().values + else: + raise ValueError("method must be either 'pearson', 'spearman'") + corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) df._set_column_names_like(self) @@ -5677,7 +5692,6 @@ def corr(self): def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. - Parameters ---------- name: optional diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5ecfc10c439..3244497933e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2418,11 +2418,13 @@ def corr(self, other, method="pearson", min_periods=None): >>> import cudf >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) - >>> ser1.corr(ser2) + >>> ser1.corr(ser2, method="pearson") -0.20454263717316112 + >>> ser1.corr(ser2, method="spearman") + -0.5 """ - if method not in ("pearson",): + if method not in {"pearson", "spearman"}: raise ValueError(f"Unknown method {method}") if min_periods not in (None,): @@ -2434,6 +2436,9 @@ def corr(self, other, method="pearson", min_periods=None): lhs = self.nans_to_nulls().dropna() rhs = other.nans_to_nulls().dropna() lhs, rhs = _align_indices([lhs, rhs], how="inner") + if method == "spearman": + lhs = lhs.rank() + rhs = rhs.rank() try: return lhs._column.corr(rhs._column) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 11ab69bda41..a6dae25dd80 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -421,24 +421,26 @@ def test_cov1d(data1, data2): cudf.Series([5]), ], ) -def test_corr1d(data1, data2): +@pytest.mark.parametrize("method", ["spearman", "pearson"]) +def test_corr1d(data1, data2, method): gs1 = cudf.Series(data1) gs2 = cudf.Series(data2) ps1 = gs1.to_pandas() ps2 = gs2.to_pandas() - got = gs1.corr(gs2) - expected = ps1.corr(ps2) + got = gs1.corr(gs2, method) + expected = ps1.corr(ps2, method) np.testing.assert_approx_equal(got, expected, significant=8) -def test_df_corr(): +@pytest.mark.parametrize("method", ["spearman", "pearson"]) +def test_df_corr(method): gdf = randomdata(100, {str(x): float for x in range(50)}) pdf = gdf.to_pandas() - got = gdf.corr() - expected = pdf.corr() + got = gdf.corr(method) + expected = pdf.corr(method) assert_eq(got, expected)