Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 'spearman' correlation method for dataframe.corr and series.corr #7141

Merged
merged 37 commits into from
Mar 23, 2022
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
78e17f8
Update dataframe.py
dominicshanshan Jan 14, 2021
e62b07f
Merge pull request #1 from dominicshanshan/dominicshanshan-cudf-dataf…
dominicshanshan Jan 14, 2021
3d5dda5
Update test_stats.py
dominicshanshan Mar 2, 2021
c1a3277
Update series.py
dominicshanshan Mar 2, 2021
36708c2
Update series.py
dominicshanshan Mar 3, 2021
f1dd461
Update dataframe.py
dominicshanshan Mar 3, 2021
9929304
Merge remote-tracking branch 'origin/branch-0.20' into branch-0.18
vyasr Apr 22, 2021
449752b
Add full docstring for DataFrame.corr.
vyasr Apr 22, 2021
bfbdcb1
Add default method to Series.corr.
vyasr Apr 22, 2021
f762fe9
Update series.py
dominicshanshan Mar 11, 2022
6b8b5e9
Update python/cudf/cudf/core/dataframe.py
dominicshanshan Mar 11, 2022
32a7927
Update series.py
dominicshanshan Mar 11, 2022
0c31c85
Update python/cudf/cudf/core/dataframe.py
dominicshanshan Mar 11, 2022
1b7b843
Update dataframe.py
dominicshanshan Mar 18, 2022
289997d
Update dataframe.py
dominicshanshan Mar 18, 2022
8259354
Update dataframe.py
dominicshanshan Mar 18, 2022
aa70df4
Update test_stats.py
dominicshanshan Mar 18, 2022
95cb983
Update test_stats.py
dominicshanshan Mar 18, 2022
63f73a8
Update series.py
dominicshanshan Mar 18, 2022
b58559e
Update dataframe.py
dominicshanshan Mar 18, 2022
50e9fbc
Update series.py
dominicshanshan Mar 18, 2022
b73a35d
Merge branch 'branch-22.04' into branch-0.18
dominicshanshan Mar 18, 2022
ebca141
Update series.py
dominicshanshan Mar 18, 2022
0dbeb47
Update series.py
dominicshanshan Mar 18, 2022
be3080b
Update dataframe.py
dominicshanshan Mar 18, 2022
6a2e6cf
Update series.py
dominicshanshan Mar 18, 2022
6865b34
Update dataframe.py
dominicshanshan Mar 19, 2022
d111a5c
formatting_check
dominicshanshan Mar 19, 2022
ef42ecf
Update dataframe.py
dominicshanshan Mar 21, 2022
e3d96a8
flake8 check
dominicshanshan Mar 21, 2022
549961a
Apply suggestions from code review
isVoid Mar 21, 2022
663ce56
Update python/cudf/cudf/core/dataframe.py
isVoid Mar 21, 2022
6fee0b0
Update python/cudf/cudf/core/series.py
dominicshanshan Mar 22, 2022
d070ae4
Update python/cudf/cudf/core/series.py
dominicshanshan Mar 22, 2022
de06884
Update python/cudf/cudf/core/dataframe.py
dominicshanshan Mar 22, 2022
d4251cb
add "method
dominicshanshan Mar 23, 2022
29e31bf
update comment on correlation
dominicshanshan Mar 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5652,13 +5652,29 @@ def cov(self, **kwargs):
df._set_column_names_like(self)
return df

@_cudf_nvtx_annotate
def corr(self):
"""Compute the correlation matrix of a DataFrame."""
corr = cupy.corrcoef(self.values, rowvar=False)
cols = self._data.to_pandas_index()
df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
df._set_column_names_like(self)
def corr(self, method="pearson"):
"""Compute the correlation matrix of a DataFrame.

Parameters
----------
method : {'pearson', 'spearman'}, default 'pearson'
The correlation method to use, one of 'pearson' or 'spearman'.

Returns
-------
DataFrame
The requested correlation matrix.
"""
if method == "pearson":
corr = cupy.corrcoef(self.values, rowvar=False)
isVoid marked this conversation as resolved.
Show resolved Hide resolved
df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns)
isVoid marked this conversation as resolved.
Show resolved Hide resolved
isVoid marked this conversation as resolved.
Show resolved Hide resolved
df.columns = self.columns
elif method == "spearman":
corr = cupy.corrcoef(self.rank().values, rowvar=False)
df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns)
isVoid marked this conversation as resolved.
Show resolved Hide resolved
df.columns = self.columns
isVoid marked this conversation as resolved.
Show resolved Hide resolved
else:
raise ValueError("method must be either 'pearson', 'spearman'")
return df

@_cudf_nvtx_annotate
Expand Down
21 changes: 16 additions & 5 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2422,11 +2422,13 @@ def corr(self, other, method="pearson", min_periods=None):
>>> import cudf
>>> ser1 = cudf.Series([0.9, 0.13, 0.62])
>>> ser2 = cudf.Series([0.12, 0.26, 0.51])
>>> ser1.corr(ser2)
>>> ser1.corr(ser2, method="pearson")
dominicshanshan marked this conversation as resolved.
Show resolved Hide resolved
-0.20454263717316112
>>> ser1.corr(ser2, method="spearman")
-0.5
"""

if method not in ("pearson",):
if method not in ("pearson", "spearman",):
dominicshanshan marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError(f"Unknown method {method}")

if min_periods not in (None,):
Expand All @@ -2435,9 +2437,18 @@ def corr(self, other, method="pearson", min_periods=None):
if self.empty or other.empty:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

lhs = self.nans_to_nulls().dropna()
rhs = other.nans_to_nulls().dropna()
lhs, rhs = _align_indices([lhs, rhs], how="inner")
if method == "pearson":
lhs = self.nans_to_nulls().dropna()
rhs = other.nans_to_nulls().dropna()
lhs, rhs = _align_indices([lhs, rhs], how="inner")
isVoid marked this conversation as resolved.
Show resolved Hide resolved
elif method == "spearman":
lhs = self.nans_to_nulls().dropna()
rhs = other.nans_to_nulls().dropna()
lhs, rhs = _align_indices([lhs, rhs], how="inner")
isVoid marked this conversation as resolved.
Show resolved Hide resolved
lhs = lhs.rank()
rhs = rhs.rank()
else:
raise ValueError("method must be either 'pearson', 'spearman'")
dominicshanshan marked this conversation as resolved.
Show resolved Hide resolved

try:
return lhs._column.corr(rhs._column)
Expand Down
14 changes: 8 additions & 6 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,24 +408,26 @@ def test_cov1d(data1, data2):
cudf.Series([5]),
],
)
def test_corr1d(data1, data2):
@pytest.mark.parametrize("method", ["spearman", "pearson"])
def test_corr1d(data1, data2, method):
gs1 = cudf.Series(data1)
gs2 = cudf.Series(data2)

ps1 = gs1.to_pandas()
ps2 = gs2.to_pandas()

got = gs1.corr(gs2)
expected = ps1.corr(ps2)
got = gs1.corr(gs2, method)
expected = ps1.corr(ps2, method)
np.testing.assert_approx_equal(got, expected, significant=8)


def test_df_corr():
@pytest.mark.parametrize("method", ["spearman", "pearson"])
def test_df_corr(method):

gdf = randomdata(100, {str(x): float for x in range(50)})
pdf = gdf.to_pandas()
got = gdf.corr()
expected = pdf.corr()
got = gdf.corr(method)
expected = pdf.corr(method)
assert_eq(got, expected)


Expand Down