[REVIEW] Series covariance and Pearson correlation #2719

Merged
19 commits, merged on Oct 2, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -31,6 +31,7 @@
 - PR #2836 Add nvstrings.code_points method
 - PR #2844 Add Series/DataFrame notnull
 - PR #2858 Add GTest type list utilities
+- PR #2719 Series covariance and Pearson correlation
 - PR #2207 Beginning of libcudf overhaul: introduce new column and table types
 - PR #2869 Add `cudf.CategoricalDtype`
 - PR #2838 CSV Reader: Support ARROW_RANDOM_FILE input
47 changes: 45 additions & 2 deletions python/cudf/cudf/core/series.py
@@ -1934,6 +1934,49 @@ def skew(self, axis=None, skipna=None, level=None, numeric_only=None):
         skew = unbiased_coef * m3 / (m2 ** (3 / 2))
         return skew

+    def cov(self, other, min_periods=None):
+        """Calculates the sample covariance between two Series,
+        excluding missing values.
+        """
+        assert min_periods in (None,)
+
+        if self.empty or other.empty:
+            return np.nan
+
+        lhs = self.nans_to_nulls().dropna()
+        rhs = other.nans_to_nulls().dropna()
+        lhs, rhs = _align_indices(lhs, rhs, join="inner")
+
+        if lhs.empty or rhs.empty or (len(lhs) == 1 and len(rhs) == 1):
+            return np.nan
+
+        result = (lhs - lhs.mean()) * (rhs - rhs.mean())
+        cov_sample = result.sum() / (len(lhs) - 1)
+        return cov_sample
+
+    def corr(self, other, method="pearson", min_periods=None):
+        """Calculates the sample correlation between two Series,
+        excluding missing values.
+        """
+        assert method in ("pearson",) and min_periods in (None,)
+
+        if self.empty or other.empty:
+            return np.nan
+
+        lhs = self.nans_to_nulls().dropna()
+        rhs = other.nans_to_nulls().dropna()
+        lhs, rhs = _align_indices(lhs, rhs, join="inner")
+
+        if lhs.empty or rhs.empty:
+            return np.nan
+
+        cov = lhs.cov(rhs)
+        lhs_std, rhs_std = lhs.std(), rhs.std()
+
+        if not cov or lhs_std == 0 or rhs_std == 0:
+            return np.nan
+        return cov / lhs_std / rhs_std
+
     def isin(self, test):

         from cudf import DataFrame
@@ -2556,13 +2599,13 @@ def get_dt_field(self, field):
         return Series(data=out_column, index=self.series._index)


-def _align_indices(lhs, rhs):
+def _align_indices(lhs, rhs, join="outer"):
     """
     Internal util to align the indices of two Series. Returns a tuple of the
     aligned series, or the original arguments if the indices are the same, or
     if rhs isn't a Series.
     """
     if isinstance(rhs, Series) and not lhs.index.equals(rhs.index):
         lhs, rhs = lhs.to_frame(0), rhs.to_frame(1)
-        lhs, rhs = lhs.join(rhs, how="outer", sort=True)._cols.values()
+        lhs, rhs = lhs.join(rhs, how=join, sort=True)._cols.values()
     return lhs, rhs
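For reference, a minimal usage sketch of the new methods (the series values and indices below are made up for illustration). Each input first has its NaNs converted to nulls and dropped, and the surviving rows are then aligned on the intersection of their indices via the `join="inner"` path added above, so only rows present in both series contribute to the statistic:

```python
import cudf

# Hypothetical inputs with a null and partially overlapping indices.
a = cudf.Series([1.0, 2.0, 3.5, None], index=[0, 1, 2, 3])
b = cudf.Series([0.5, 4.0, 2.0], index=[1, 2, 3])

# Sample covariance: sum((x - mean(x)) * (y - mean(y))) / (n - 1)
print(a.cov(b))

# Pearson correlation: cov(x, y) / (std(x) * std(y)); returns NaN when
# either standard deviation is zero or when no rows overlap.
print(a.corr(b))
```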
7 changes: 1 addition & 6 deletions python/cudf/cudf/tests/test_array_function.py
@@ -40,12 +40,7 @@ def test_array_func_cudf_series(np_ar, func):
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
 @pytest.mark.parametrize("np_ar", [np.random.random(100)])
 @pytest.mark.parametrize(
-    "func",
-    [
-        lambda x: np.cov(x, x),
-        lambda x: np.dot(x, x),
-        lambda x: np.linalg.norm(x),
-    ],
+    "func", [lambda x: np.dot(x, x), lambda x: np.linalg.norm(x)]
 )
 def test_array_func_missing_cudf_series(np_ar, func):
     cudf_ser = cudf.Series(np_ar)
76 changes: 76 additions & 0 deletions python/cudf/cudf/tests/test_stats.py
@@ -268,3 +268,79 @@ def test_skew(data, null_flag):
     got = data.skew()
     expected = pdata.skew()
     np.testing.assert_array_almost_equal(got, expected)
+
+
+@pytest.mark.parametrize(
+    "data1",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([5, 10, 53, None, np.nan, None], nan_as_null=False),
+        Series([1.1, 2.32, 43.4], index=[0, 4, 3]),
+        Series([]),
+        Series([-3]),
+    ],
+)
+@pytest.mark.parametrize(
+    "data2",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([1.1, 2.32, 43.4], index=[0, 500, 4000]),
+        Series([5]),
+    ],
+)
+def test_cov1d(data1, data2):
+    gs1 = Series(data1)
+    gs2 = Series(data2)
+
+    ps1 = gs1.to_pandas()
+    ps2 = gs2.to_pandas()
+
+    got = gs1.cov(gs2)
+    expected = ps1.cov(ps2)
+    np.testing.assert_approx_equal(got, expected, significant=8)
+
+
+@pytest.mark.parametrize(
+    "data1",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([5, 10, 53, None, np.nan, None], nan_as_null=False),
+        Series([1.1032, 2.32, 43.4], index=[0, 4, 3]),
+        Series([]),
+        Series([-3]),
+    ],
+)
+@pytest.mark.parametrize(
+    "data2",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([1.1, 2.32, 43.4], index=[0, 500, 4000]),
+        Series([5]),
+    ],
+)
+def test_corr1d(data1, data2):
+    gs1 = Series(data1)
+    gs2 = Series(data2)
+
+    ps1 = gs1.to_pandas()
+    ps2 = gs2.to_pandas()
+
+    got = gs1.corr(gs2)
+    expected = ps1.corr(ps2)
+    np.testing.assert_approx_equal(got, expected, significant=8)
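As a quick cross-check of the formulas these tests exercise, the same sample statistics can be reproduced with plain NumPy on fully valid data (a minimal sketch; the array values are arbitrary):

```python
import numpy as np

x = np.array([1.2, 3.4, 2.2, 5.1])
y = np.array([0.4, 1.9, 2.5, 4.0])

# Sample covariance with the (n - 1) denominator used by Series.cov.
cov = ((x - x.mean()) * (y - y.mean())).sum() / (len(x) - 1)
assert np.isclose(cov, np.cov(x, y)[0, 1])

# Pearson correlation as cov / (std_x * std_y), using the sample std (ddof=1).
corr = cov / (x.std(ddof=1) * y.std(ddof=1))
assert np.isclose(corr, np.corrcoef(x, y)[0, 1])
```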