Skip to content

Commit

Permalink
Add 'spearman' correlation method for dataframe.corr (#7141)
Browse files Browse the repository at this point in the history
Closes #6804

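Adds the 'spearman' correlation method to `DataFrame.corr` and `Series.corr`; Spearman correlation is computed by ranking the values and then applying the existing Pearson computation to the ranks.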

Authors:
  - https://github.com/dominicshanshan
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Nick Becker (https://github.com/beckernick)
  - https://github.com/brandon-b-miller

URL: #7141
  • Loading branch information
dominicshanshan authored Mar 23, 2022
1 parent 18398ab commit ce5bacb
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 13 deletions.
24 changes: 19 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5664,10 +5664,25 @@ def cov(self, **kwargs):
df._set_column_names_like(self)
return df

@_cudf_nvtx_annotate
def corr(self):
"""Compute the correlation matrix of a DataFrame."""
corr = cupy.corrcoef(self.values, rowvar=False)
def corr(self, method="pearson"):
"""Compute the correlation matrix of a DataFrame.
Parameters
----------
method : {'pearson', 'spearman'}, default 'pearson'
The correlation method to use, one of 'pearson' or 'spearman'.
Returns
-------
DataFrame
The requested correlation matrix.
"""
if method == "pearson":
values = self.values
elif method == "spearman":
values = self.rank().values
else:
raise ValueError("method must be either 'pearson', 'spearman'")
corr = cupy.corrcoef(values, rowvar=False)
cols = self._data.to_pandas_index()
df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
df._set_column_names_like(self)
Expand All @@ -5677,7 +5692,6 @@ def corr(self):
def to_struct(self, name=None):
"""
Return a struct Series composed of the columns of the DataFrame.
Parameters
----------
name: optional
Expand Down
9 changes: 7 additions & 2 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2418,11 +2418,13 @@ def corr(self, other, method="pearson", min_periods=None):
>>> import cudf
>>> ser1 = cudf.Series([0.9, 0.13, 0.62])
>>> ser2 = cudf.Series([0.12, 0.26, 0.51])
>>> ser1.corr(ser2)
>>> ser1.corr(ser2, method="pearson")
-0.20454263717316112
>>> ser1.corr(ser2, method="spearman")
-0.5
"""

if method not in ("pearson",):
if method not in {"pearson", "spearman"}:
raise ValueError(f"Unknown method {method}")

if min_periods not in (None,):
Expand All @@ -2434,6 +2436,9 @@ def corr(self, other, method="pearson", min_periods=None):
lhs = self.nans_to_nulls().dropna()
rhs = other.nans_to_nulls().dropna()
lhs, rhs = _align_indices([lhs, rhs], how="inner")
if method == "spearman":
lhs = lhs.rank()
rhs = rhs.rank()

try:
return lhs._column.corr(rhs._column)
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,24 +421,26 @@ def test_cov1d(data1, data2):
cudf.Series([5]),
],
)
def test_corr1d(data1, data2):
@pytest.mark.parametrize("method", ["spearman", "pearson"])
def test_corr1d(data1, data2, method):
gs1 = cudf.Series(data1)
gs2 = cudf.Series(data2)

ps1 = gs1.to_pandas()
ps2 = gs2.to_pandas()

got = gs1.corr(gs2)
expected = ps1.corr(ps2)
got = gs1.corr(gs2, method)
expected = ps1.corr(ps2, method)
np.testing.assert_approx_equal(got, expected, significant=8)


def test_df_corr():
@pytest.mark.parametrize("method", ["spearman", "pearson"])
def test_df_corr(method):

gdf = randomdata(100, {str(x): float for x in range(50)})
pdf = gdf.to_pandas()
got = gdf.corr()
expected = pdf.corr()
got = gdf.corr(method)
expected = pdf.corr(method)
assert_eq(got, expected)


Expand Down

0 comments on commit ce5bacb

Please sign in to comment.