From fc0fa934124d924f16b6728ce368ef97b5a2557d Mon Sep 17 00:00:00 2001
From: Michael Wang <michaelwang0905@icloud.com>
Date: Wed, 5 Jan 2022 16:41:48 -0800
Subject: [PATCH] Remove deprecated one_hot_encoding methods and tests, add
 generic-index test coverage for get_dummies

---
 python/cudf/cudf/core/dataframe.py    |  72 ------------------
 python/cudf/cudf/core/reshape.py      |  12 +--
 python/cudf/cudf/core/series.py       |  77 -------------------
 python/cudf/cudf/tests/test_onehot.py | 105 +++-----------------------
 4 files changed, 16 insertions(+), 250 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d97ea456f72..3366a0af4ba 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3051,78 +3051,6 @@ def as_matrix(self, columns=None):
         )
         return self.as_gpu_matrix(columns=columns).copy_to_host()
 
-    def one_hot_encoding(
-        self, column, prefix, cats, prefix_sep="_", dtype="float64"
-    ):
-        """
-        Expand a column with one-hot-encoding.
-
-        Parameters
-        ----------
-
-        column : str
-            the source column with binary encoding for the data.
-        prefix : str
-            the new column name prefix.
-        cats : sequence of ints
-            the sequence of categories as integers.
-        prefix_sep : str
-            the separator between the prefix and the category.
-        dtype :
-            the dtype for the outputs; defaults to float64.
-
-        Returns
-        -------
-
-        a new dataframe with new columns append for each category.
-
-        Examples
-        --------
-        >>> import pandas as pd
-        >>> import cudf
-        >>> pet_owner = [1, 2, 3, 4, 5]
-        >>> pet_type = ['fish', 'dog', 'fish', 'bird', 'fish']
-        >>> df = pd.DataFrame({'pet_owner': pet_owner, 'pet_type': pet_type})
-        >>> df.pet_type = df.pet_type.astype('category')
-
-        Create a column with numerically encoded category values
-
-        >>> df['pet_codes'] = df.pet_type.cat.codes
-        >>> gdf = cudf.from_pandas(df)
-
-        Create the list of category codes to use in the encoding
-
-        >>> codes = gdf.pet_codes.unique()
-        >>> gdf.one_hot_encoding('pet_codes', 'pet_dummy', codes).head()
-          pet_owner  pet_type  pet_codes  pet_dummy_0  pet_dummy_1  pet_dummy_2
-        0         1      fish          2          0.0          0.0          1.0
-        1         2       dog          1          0.0          1.0          0.0
-        2         3      fish          2          0.0          0.0          1.0
-        3         4      bird          0          1.0          0.0          0.0
-        4         5      fish          2          0.0          0.0          1.0
-        """
-
-        warnings.warn(
-            "DataFrame.one_hot_encoding is deprecated and will be removed in "
-            "future, use `get_dummies` instead.",
-            FutureWarning,
-        )
-
-        if hasattr(cats, "to_arrow"):
-            cats = cats.to_arrow().to_pylist()
-        else:
-            cats = pd.Series(cats, dtype="object")
-
-        newnames = [
-            prefix_sep.join([prefix, "null" if cat is None else str(cat)])
-            for cat in cats
-        ]
-        newcols = self[column].one_hot_encoding(cats=cats, dtype=dtype)
-        outdf = self.copy()
-        for name, col in zip(newnames, newcols):
-            outdf.insert(len(outdf._data), name, col)
-        return outdf
-
     def label_encoding(
         self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1
     ):
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index b2fac7a6140..1733a6c0b9a 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -600,18 +600,18 @@ def get_dummies(
     df : array-like, Series, or DataFrame
         Data of which to get dummy indicators.
     prefix : str, dict, or sequence, optional
-        prefix to append. Either a str (to apply a constant prefix), dict
+        Prefix to append. Either a str (to apply a constant prefix), dict
         mapping column names to prefixes, or sequence of prefixes to apply with
         the same length as the number of columns. If not supplied, defaults
         to the empty string
     prefix_sep : str, dict, or sequence, optional, default '_'
-        separator to use when appending prefixes
+        Separator to use when appending prefixes
     dummy_na : boolean, optional
         Add a column to indicate Nones, if False Nones are ignored.
     cats : dict, optional
-        dictionary mapping column names to sequences of integers representing
-        that column's category. See `cudf.DataFrame.one_hot_encoding` for more
-        information. if not supplied, it will be computed
+        Dictionary mapping column names to sequences of values representing
+        that column's categories. If not supplied, it is computed as the
+        unique values of the column.
     sparse : boolean, optional
         Right now this is NON-FUNCTIONAL argument in rapids.
     drop_first : boolean, optional
@@ -621,7 +621,7 @@ def get_dummies(
         columns. Note this is different from pandas default behavior, which
         encodes all columns with dtype object or categorical
     dtype : str, optional
-        output dtype, default 'uint8'
+        Output dtype, default 'uint8'
 
     Examples
     --------
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index fb86cf85c4c..178c40b3cd8 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2264,83 +2264,6 @@ def reverse(self):
             {self.name: self._column[rinds]}, self.index._values[rinds]
         )
 
-    def one_hot_encoding(self, cats, dtype="float64"):
-        """Perform one-hot-encoding
-
-        Parameters
-        ----------
-        cats : sequence of values
-                values representing each category.
-        dtype : numpy.dtype
-                specifies the output dtype.
-
-        Returns
-        -------
-        Sequence
-            A sequence of new series for each category. Its length is
-            determined by the length of ``cats``.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> s = cudf.Series(['a', 'b', 'c', 'a'])
-        >>> s
-        0    a
-        1    b
-        2    c
-        3    a
-        dtype: object
-        >>> s.one_hot_encoding(['a', 'c', 'b'])
-        [0    1.0
-        1    0.0
-        2    0.0
-        3    1.0
-        dtype: float64, 0    0.0
-        1    0.0
-        2    1.0
-        3    0.0
-        dtype: float64, 0    0.0
-        1    1.0
-        2    0.0
-        3    0.0
-        dtype: float64]
-        """
-
-        warnings.warn(
-            "Series.one_hot_encoding is deprecated and will be removed in "
-            "future, use `get_dummies` instead.",
-            FutureWarning,
-        )
-
-        if hasattr(cats, "to_arrow"):
-            cats = cats.to_pandas()
-        else:
-            cats = pd.Series(cats, dtype="object")
-        dtype = cudf.dtype(dtype)
-
-        try:
-            cats_col = as_column(cats, nan_as_null=False, dtype=self.dtype)
-        except TypeError:
-            raise ValueError("Cannot convert `cats` as cudf column.")
-
-        if self._column.size * cats_col.size >= np.iinfo("int32").max:
-            raise ValueError(
-                "Size limitation exceeded: series.size * category.size < "
-                "np.iinfo('int32').max. Consider reducing size of category"
-            )
-
-        res = libcudf.transform.one_hot_encode(self._column, cats_col)
-        if dtype.type == np.bool_:
-            return [
-                Series._from_data({None: x}, index=self._index)
-                for x in list(res.values())
-            ]
-        else:
-            return [
-                Series._from_data({None: x.astype(dtype)}, index=self._index)
-                for x in list(res.values())
-            ]
-
     def label_encoding(self, cats, dtype=None, na_sentinel=-1):
         """Perform label encoding.
 
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index f2a20a73b63..2b0422ffecb 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -7,108 +7,23 @@
 import pytest
 
 import cudf
-from cudf import DataFrame, Index, Series
+from cudf import DataFrame
 from cudf.testing import _utils as utils
 
 
-def test_onehot_simple():
-    np.random.seed(0)
-    df = DataFrame()
-    # Populate with data [0, 10)
-    df["vals"] = np.arange(10, dtype=np.int32)
-    # One Hot (Series)
-    for i, col in enumerate(df["vals"].one_hot_encoding(list(range(10)))):
-        arr = col.to_numpy()
-        # Verify 1 in the right position
-        np.testing.assert_equal(arr[i], 1)
-        # Every other slots are 0s
-        np.testing.assert_equal(arr[:i], 0)
-        np.testing.assert_equal(arr[i + 1 :], 0)
-    # One Hot (DataFrame)
-    df2 = df.one_hot_encoding(
-        column="vals", prefix="vals", cats=list(range(10))
-    )
-    assert df2.columns[0] == "vals"
-    for i in range(1, len(df2.columns)):
-        assert df2.columns[i] == "vals_%s" % (i - 1)
-    got = df2[df2.columns[1:]].values_host
-    expect = np.identity(got.shape[0])
-    np.testing.assert_equal(got, expect)
-
-
-def test_onehot_random():
-    df = DataFrame()
-    low = 10
-    high = 17
-    size = 10
-    df["src"] = src = np.random.randint(low=low, high=high, size=size)
-    df2 = df.one_hot_encoding(
-        column="src", prefix="out_", cats=tuple(range(10, 17))
-    )
-    mat = df2[df2.columns[1:]].values_host
-
-    for val in range(low, high):
-        colidx = val - low
-        arr = mat[:, colidx]
-        mask = src == val
-        np.testing.assert_equal(arr, mask)
-
-
-def test_onehot_masked():
-    np.random.seed(0)
-    high = 5
-    size = 100
-    arr = np.random.randint(low=0, high=high, size=size)
-    bitmask = utils.random_bitmask(size)
-    bytemask = np.asarray(
-        utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_
-    )
-    arr[~bytemask] = -1
-
-    df = DataFrame()
-    df["a"] = Series(arr).set_mask(bitmask)
-
-    out = df.one_hot_encoding(
-        "a", cats=list(range(high)), prefix="a", dtype=np.int32
-    )
-
-    assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
-    np.testing.assert_array_equal((out["a_0"] == 1).to_numpy(), arr == 0)
-    np.testing.assert_array_equal((out["a_1"] == 1).to_numpy(), arr == 1)
-    np.testing.assert_array_equal((out["a_2"] == 1).to_numpy(), arr == 2)
-    np.testing.assert_array_equal((out["a_3"] == 1).to_numpy(), arr == 3)
-    np.testing.assert_array_equal((out["a_4"] == 1).to_numpy(), arr == 4)
-
-
-def test_onehot_generic_index():
-    np.random.seed(0)
-    size = 33
-    indices = np.random.randint(low=0, high=100, size=size)
-    df = DataFrame()
-    values = np.random.randint(low=0, high=4, size=size)
-    df["fo"] = Series(values, index=Index(indices))
-    out = df.one_hot_encoding(
-        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
-    )
-    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
-    np.testing.assert_array_equal(values == 0, out.fo_0.to_numpy())
-    np.testing.assert_array_equal(values == 1, out.fo_1.to_numpy())
-    np.testing.assert_array_equal(values == 2, out.fo_2.to_numpy())
-    np.testing.assert_array_equal(values == 3, out.fo_3.to_numpy())
-
-
 @pytest.mark.parametrize(
-    "data",
+    "data, index",
     [
-        np.arange(10),
-        ["abc", "zyx", "pppp"],
-        [],
-        pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"),
+        (np.arange(10), None),
+        (["abc", "zyx", "pppp"], None),
+        ([], None),
+        (pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"), None),
+        (range(10), [1, 2, 3, 4, 5] * 2),
     ],
 )
-def test_get_dummies(data):
-    gdf = DataFrame({"x": data})
-    pdf = pd.DataFrame({"x": data})
+def test_get_dummies(data, index):
+    gdf = DataFrame({"x": data}, index=index)
+    pdf = pd.DataFrame({"x": data}, index=index)
 
     encoded_expected = pd.get_dummies(pdf, prefix="test")
     encoded_actual = cudf.get_dummies(gdf, prefix="test")