[REVIEW] Series covariance and Pearson correlation #2719

Merged
19 commits, merged on Oct 2, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -31,6 +31,7 @@
 - PR #2836 Add nvstrings.code_points method
 - PR #2844 Add Series/DataFrame notnull
 - PR #2858 Add GTest type list utilities
+- PR #2719 Series covariance and Pearson correlation
 - PR #2207 Beginning of libcudf overhaul: introduce new column and table types
 - PR #2869 Add `cudf.CategoricalDtype`
 - PR #2838 CSV Reader: Support ARROW_RANDOM_FILE input
47 changes: 45 additions & 2 deletions python/cudf/cudf/core/series.py
@@ -1934,6 +1934,49 @@ def skew(self, axis=None, skipna=None, level=None, numeric_only=None):
         skew = unbiased_coef * m3 / (m2 ** (3 / 2))
         return skew

+    def cov(self, other, min_periods=None):
+        """Calculates the sample covariance between two Series,
+        excluding missing values.
+        """
+        assert min_periods in (None,)
+
+        if self.empty or other.empty:
+            return np.nan
+
+        lhs = self.nans_to_nulls().dropna()
+        rhs = other.nans_to_nulls().dropna()
+        lhs, rhs = _align_indices(lhs, rhs, join="inner")
+
+        if lhs.empty or rhs.empty or (len(lhs) == 1 and len(rhs) == 1):
+            return np.nan
+
+        result = (lhs - lhs.mean()) * (rhs - rhs.mean())
+        cov_sample = result.sum() / (len(lhs) - 1)
+        return cov_sample
+
+    def corr(self, other, method="pearson", min_periods=None):
+        """Calculates the sample correlation between two Series,
+        excluding missing values.
+        """
+        assert method in ("pearson",) and min_periods in (None,)
+
+        if self.empty or other.empty:
+            return np.nan
+
+        lhs = self.nans_to_nulls().dropna()
+        rhs = other.nans_to_nulls().dropna()
+        lhs, rhs = _align_indices(lhs, rhs, join="inner")
+
+        if lhs.empty or rhs.empty:
+            return np.nan
+
+        cov = lhs.cov(rhs)
+        lhs_std, rhs_std = lhs.std(), rhs.std()
+
+        if not cov or lhs_std == 0 or rhs_std == 0:
+            return np.nan
+        return cov / lhs_std / rhs_std
+
     def isin(self, test):

         from cudf import DataFrame
@@ -2556,13 +2599,13 @@ def get_dt_field(self, field):
         return Series(data=out_column, index=self.series._index)


-def _align_indices(lhs, rhs):
+def _align_indices(lhs, rhs, join="outer"):
     """
     Internal util to align the indices of two Series. Returns a tuple of the
     aligned series, or the original arguments if the indices are the same, or
     if rhs isn't a Series.
     """
     if isinstance(rhs, Series) and not lhs.index.equals(rhs.index):
         lhs, rhs = lhs.to_frame(0), rhs.to_frame(1)
-        lhs, rhs = lhs.join(rhs, how="outer", sort=True)._cols.values()
+        lhs, rhs = lhs.join(rhs, how=join, sort=True)._cols.values()
     return lhs, rhs
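For reference, a minimal usage sketch of the new methods (the series values and indices below are made up for illustration). Each input first has its NaNs converted to nulls and dropped, and the surviving rows are then aligned on the intersection of their indices via the `join="inner"` path added above, so only rows present in both series contribute to the statistic:

```python
import cudf

# Hypothetical inputs with a null and partially overlapping indices.
a = cudf.Series([1.0, 2.0, 3.5, None], index=[0, 1, 2, 3])
b = cudf.Series([0.5, 4.0, 2.0], index=[1, 2, 3])

# Sample covariance: sum((x - mean(x)) * (y - mean(y))) / (n - 1)
print(a.cov(b))

# Pearson correlation: cov(x, y) / (std(x) * std(y)); returns NaN when
# either standard deviation is zero or when no rows overlap.
print(a.corr(b))
```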
7 changes: 1 addition & 6 deletions python/cudf/cudf/tests/test_array_function.py
@@ -40,12 +40,7 @@ def test_array_func_cudf_series(np_ar, func):
 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
 @pytest.mark.parametrize("np_ar", [np.random.random(100)])
 @pytest.mark.parametrize(
-    "func",
-    [
-        lambda x: np.cov(x, x),
-        lambda x: np.dot(x, x),
-        lambda x: np.linalg.norm(x),
-    ],
+    "func", [lambda x: np.dot(x, x), lambda x: np.linalg.norm(x)]
 )
 def test_array_func_missing_cudf_series(np_ar, func):
     cudf_ser = cudf.Series(np_ar)
76 changes: 76 additions & 0 deletions python/cudf/cudf/tests/test_stats.py
@@ -268,3 +268,79 @@ def test_skew(data, null_flag):
     got = data.skew()
     expected = pdata.skew()
     np.testing.assert_array_almost_equal(got, expected)
+
+
+@pytest.mark.parametrize(
+    "data1",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([5, 10, 53, None, np.nan, None], nan_as_null=False),
+        Series([1.1, 2.32, 43.4], index=[0, 4, 3]),
+        Series([]),
+        Series([-3]),
+    ],
+)
+@pytest.mark.parametrize(
+    "data2",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([1.1, 2.32, 43.4], index=[0, 500, 4000]),
+        Series([5]),
+    ],
+)
+def test_cov1d(data1, data2):
+    gs1 = Series(data1)
+    gs2 = Series(data2)
+
+    ps1 = gs1.to_pandas()
+    ps2 = gs2.to_pandas()
+
+    got = gs1.cov(gs2)
+    expected = ps1.cov(ps2)
+    np.testing.assert_approx_equal(got, expected, significant=8)
+
+
+@pytest.mark.parametrize(
+    "data1",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([5, 10, 53, None, np.nan, None], nan_as_null=False),
+        Series([1.1032, 2.32, 43.4], index=[0, 4, 3]),
+        Series([]),
+        Series([-3]),
+    ],
+)
+@pytest.mark.parametrize(
+    "data2",
+    [
+        np.random.normal(-100, 100, 1000),
+        np.random.randint(-50, 50, 1000),
+        np.zeros(100),
+        np.repeat(np.nan, 100),
+        np.array([1.123, 2.343, np.nan, 0.0]),
+        Series([1.1, 2.32, 43.4], index=[0, 500, 4000]),
+        Series([5]),
+    ],
+)
+def test_corr1d(data1, data2):
+    gs1 = Series(data1)
+    gs2 = Series(data2)
+
+    ps1 = gs1.to_pandas()
+    ps2 = gs2.to_pandas()
+
+    got = gs1.corr(gs2)
+    expected = ps1.corr(ps2)
+    np.testing.assert_approx_equal(got, expected, significant=8)
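As a quick cross-check of the formulas these tests exercise, the same sample statistics can be reproduced with plain NumPy on fully valid data (a minimal sketch; the array values are arbitrary):

```python
import numpy as np

x = np.array([1.2, 3.4, 2.2, 5.1])
y = np.array([0.4, 1.9, 2.5, 4.0])

# Sample covariance with the (n - 1) denominator used by Series.cov.
cov = ((x - x.mean()) * (y - y.mean())).sum() / (len(x) - 1)
assert np.isclose(cov, np.cov(x, y)[0, 1])

# Pearson correlation as cov / (std_x * std_y), using the sample std (ddof=1).
corr = cov / (x.std(ddof=1) * y.std(ddof=1))
assert np.isclose(corr, np.corrcoef(x, y)[0, 1])
```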