rapidsai · kkraus14 · Mar 11, 2020 · Feb 12, 2020 · Feb 12, 2020 · Feb 13, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 
 ## Improvements
 
+- PR #4140 Add cudf series examples in docs and corr() method for dataframe in dataframe.py
 - PR #3525 build.sh option to disable nvtx
 - PR #3748 Optimize hash_partition using shared memory
 - PR #3698 Add count_(un)set_bits functions taking multiple ranges and updated slice to compute null counts at once.

@@ -1210,9 +1210,9 @@ def iloc(self):
 
         Examples
         --------
-        >>> df = DataFrame([('a', list(range(20))),
-        ...                 ('b', list(range(20))),
-        ...                 ('c', list(range(20)))])
+        >>> df = cudf.DataFrame([('a', range(20)),
+        ...                      ('b', range(20)),
+        ...                      ('c', range(20))])
 
         Select a single row using an integer index.
 
@@ -4525,6 +4525,16 @@ def cov(self, **kwargs):
         df.columns = self.columns
         return df
 
+    def corr(self):
+        """Return the pairwise correlation matrix of this DataFrame's columns.
+
+        Rows and columns of the result are both labelled by ``self.columns``.
+        """
+        coeffs = cupy.asfortranarray(cupy.corrcoef(self.values, rowvar=False))
+        result = DataFrame.from_gpu_matrix(coeffs).set_index(self.columns)
+        result.columns = self.columns
+        return result
+
 
 def from_pandas(obj):
     """

@@ -1870,7 +1870,21 @@ def prod(self, axis=None, skipna=True, dtype=None):
         return self.product(axis=axis, skipna=skipna, dtype=dtype)
 
     def cummin(self, axis=0, skipna=True):
-        """Compute the cumulative minimum of the series"""
+        """
+        Compute the cumulative minimum of the series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([1, 5, 2, 4, 3])
+        >>> ser.cummin()
+        0    1
+        1    1
+        2    1
+        3    1
+        4    1
+        """
+
         assert axis in (None, 0) and skipna is True
         return Series(
             self._column._apply_scan_op("min"),
@@ -1879,7 +1893,20 @@ def cummin(self, axis=0, skipna=True):
         )
 
     def cummax(self, axis=0, skipna=True):
-        """Compute the cumulative maximum of the series"""
+        """
+        Compute the cumulative maximum of the series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([1, 5, 2, 4, 3])
+        >>> ser.cummax()
+        0    1
+        1    5
+        2    5
+        3    5
+        4    5
+        """
         assert axis in (None, 0) and skipna is True
         return Series(
             self._column._apply_scan_op("max"),
@@ -1888,7 +1915,21 @@ def cummax(self, axis=0, skipna=True):
         )
 
     def cumsum(self, axis=0, skipna=True):
-        """Compute the cumulative sum of the series"""
+        """
+        Compute the cumulative sum of the series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([1, 5, 2, 4, 3])
+        >>> ser.cumsum()
+        0     1
+        1     6
+        2     8
+        3    12
+        4    15
+        """
+
         assert axis in (None, 0) and skipna is True
 
         # pandas always returns int64 dtype if original dtype is int or `bool`
@@ -1908,7 +1949,20 @@ def cumsum(self, axis=0, skipna=True):
             )
 
     def cumprod(self, axis=0, skipna=True):
-        """Compute the cumulative product of the series"""
+        """
+        Compute the cumulative product of the series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([1, 5, 2, 4, 3])
+        >>> ser.cumprod()
+        0      1
+        1      5
+        2     10
+        3     40
+        4    120
+        """
         assert axis in (None, 0) and skipna is True
 
         # pandas always returns int64 dtype if original dtype is int or `bool`
@@ -1928,7 +1982,16 @@ def cumprod(self, axis=0, skipna=True):
             )
 
     def mean(self, axis=None, skipna=True):
-        """Compute the mean of the series
+        """Compute the mean of the series.
+
+        Only ``axis`` in (None, 0) and ``skipna=True`` are accepted.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser = cudf.Series([10, 25, 3, 25, 24, 6])
+        >>> ser.mean()
+        15.5
         """
         assert axis in (None, 0) and skipna is True
         return self._column.mean()
@@ -2031,6 +2094,14 @@ def skew(self, axis=None, skipna=None, level=None, numeric_only=None):
     def cov(self, other, min_periods=None):
         """Calculates the sample covariance between two Series,
         excluding missing values.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser1 = cudf.Series([0.9, 0.13, 0.62])
+        >>> ser2 = cudf.Series([0.12, 0.26, 0.51])
+        >>> ser1.cov(ser2)
+        -0.015750000000000004
         """
         assert min_periods in (None,)
 
@@ -2052,7 +2123,16 @@ def cov(self, other, min_periods=None):
     def corr(self, other, method="pearson", min_periods=None):
         """Calculates the sample correlation between two Series,
         excluding missing values.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> ser1 = cudf.Series([0.9, 0.13, 0.62])
+        >>> ser2 = cudf.Series([0.12, 0.26, 0.51])
+        >>> ser1.corr(ser2)
+        -0.20454263717316112
         """
+
         assert method in ("pearson",) and min_periods in (None,)
 
         if self.empty or other.empty:
@@ -2124,7 +2204,7 @@ def unique_k(self, k):
         return self.unique()
 
     def unique(self, method="sort", sort=True):
-        """Returns unique values of this Series.
+        """Returns unique values of a Series.
         default='sort' will be changed to 'hash' when implemented.
         """
         if method != "sort":
@@ -2548,9 +2628,9 @@ def to_dlpack(self):
 
     def rename(self, index=None, copy=True):
         """
-        Alter Series name.
+        Alter the Series name.
 
-        Change Series.name with a scalar value.
+        Change Series.name to a scalar value.
 
         Parameters
         ----------

@@ -372,3 +372,13 @@ def test_corr1d(data1, data2):
     got = gs1.corr(gs2)
     expected = ps1.corr(ps2)
     np.testing.assert_approx_equal(got, expected, significant=8)
+
+
+def test_df_corr():
+    """DataFrame.corr() must agree with pandas' DataFrame.corr()."""
+    from cudf.tests import utils
+
+    # 50 float columns named "0".."49"; 100 is presumably the row count.
+    gpu_frame = randomdata(100, {str(i): float for i in range(50)})
+    host_frame = gpu_frame.to_pandas()
+    utils.assert_eq(gpu_frame.corr(), host_frame.corr())
@@ -772,6 +772,28 @@
 1  456 2018-11-14T12:35:01.000 5784
 2  789 2018-11-15T18:02:59.000 6117
 
+Read hexadecimal values from a csv file as an integer column with cudf.
+
+Create a test hex csv file
+
+>>> import cudf
+>>> fname = 'test.csv'
+>>> cdf = cudf.DataFrame()
+>>> cdf['hex_col'] = ['9512c20b'] * 5
+>>> cdf.to_csv(fname, index=False)
+
+Read the file with ``cudf.read_csv`` and use ``hex64`` as the dtype.
+Valid hex dtypes: ``hex64``, ``hex32`` or ``hex`` (alias for ``hex64``).
+
+>>> gdf = cudf.read_csv(fname, dtype={"hex_col": "hex64"})
+>>> gdf
+      hex_col
+0  2501034507
+1  2501034507
+2  2501034507
+3  2501034507
+4  2501034507
+
 See Also
 --------
 cudf.io.csv.to_csv
@@ -781,6 +803,7 @@
 doc_read_csv = docfmt_partial(docstring=_docstring_read_csv)
 
 _docstring_to_csv = """
+
 Write a dataframe to csv file format.
 
 Parameters