Remove deprecated one_hot_encoding methods and their tests; add generic-index coverage to the get_dummies test

rapidsai · Jan 6, 2022 · fc0fa93 · fc0fa93
1 parent eba4f03
commit fc0fa93
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 250 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -3051,78 +3051,6 @@ def as_matrix(self, columns=None):
         )
         return self.as_gpu_matrix(columns=columns).copy_to_host()
 
-    def one_hot_encoding(
-        self, column, prefix, cats, prefix_sep="_", dtype="float64"
-    ):
-        """
-        Expand a column with one-hot-encoding.
-
-        Parameters
-        ----------
-
-        column : str
-            the source column with binary encoding for the data.
-        prefix : str
-            the new column name prefix.
-        cats : sequence of ints
-            the sequence of categories as integers.
-        prefix_sep : str
-            the separator between the prefix and the category.
-        dtype :
-            the dtype for the outputs; defaults to float64.
-
-        Returns
-        -------
-
-        a new dataframe with new columns append for each category.
-
-        Examples
-        --------
-        >>> import pandas as pd
-        >>> import cudf
-        >>> pet_owner = [1, 2, 3, 4, 5]
-        >>> pet_type = ['fish', 'dog', 'fish', 'bird', 'fish']
-        >>> df = pd.DataFrame({'pet_owner': pet_owner, 'pet_type': pet_type})
-        >>> df.pet_type = df.pet_type.astype('category')
-
-        Create a column with numerically encoded category values
-
-        >>> df['pet_codes'] = df.pet_type.cat.codes
-        >>> gdf = cudf.from_pandas(df)
-
-        Create the list of category codes to use in the encoding
-
-        >>> codes = gdf.pet_codes.unique()
-        >>> gdf.one_hot_encoding('pet_codes', 'pet_dummy', codes).head()
-          pet_owner  pet_type  pet_codes  pet_dummy_0  pet_dummy_1  pet_dummy_2
-        0         1      fish          2          0.0          0.0          1.0
-        1         2       dog          1          0.0          1.0          0.0
-        2         3      fish          2          0.0          0.0          1.0
-        3         4      bird          0          1.0          0.0          0.0
-        4         5      fish          2          0.0          0.0          1.0
-        """
-
-        warnings.warn(
-            "DataFrame.one_hot_encoding is deprecated and will be removed in "
-            "future, use `get_dummies` instead.",
-            FutureWarning,
-        )
-
-        if hasattr(cats, "to_arrow"):
-            cats = cats.to_arrow().to_pylist()
-        else:
-            cats = pd.Series(cats, dtype="object")
-
-        newnames = [
-            prefix_sep.join([prefix, "null" if cat is None else str(cat)])
-            for cat in cats
-        ]
-        newcols = self[column].one_hot_encoding(cats=cats, dtype=dtype)
-        outdf = self.copy()
-        for name, col in zip(newnames, newcols):
-            outdf.insert(len(outdf._data), name, col)
-        return outdf
-
     def label_encoding(
         self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1
     ):

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
@@ -600,18 +600,18 @@ def get_dummies(
     df : array-like, Series, or DataFrame
         Data of which to get dummy indicators.
     prefix : str, dict, or sequence, optional
-        prefix to append. Either a str (to apply a constant prefix), dict
+        Prefix to append. Either a str (to apply a constant prefix), dict
         mapping column names to prefixes, or sequence of prefixes to apply with
         the same length as the number of columns. If not supplied, defaults
         to the empty string
     prefix_sep : str, dict, or sequence, optional, default '_'
-        separator to use when appending prefixes
+        Separator to use when appending prefixes.
     dummy_na : boolean, optional
         Add a column to indicate Nones, if False Nones are ignored.
     cats : dict, optional
-        dictionary mapping column names to sequences of integers representing
-        that column's category. See `cudf.DataFrame.one_hot_encoding` for more
-        information. if not supplied, it will be computed
+        Dictionary mapping column names to sequences of values representing
+        that column's category. If not supplied, it is computed as the unique
+        values of the column.
     sparse : boolean, optional
         Right now this is NON-FUNCTIONAL argument in rapids.
     drop_first : boolean, optional
@@ -621,7 +621,7 @@ def get_dummies(
         columns. Note this is different from pandas default behavior, which
         encodes all columns with dtype object or categorical
     dtype : str, optional
-        output dtype, default 'uint8'
+        Output dtype, default 'uint8'
 
     Examples
     --------

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -2264,83 +2264,6 @@ def reverse(self):
             {self.name: self._column[rinds]}, self.index._values[rinds]
         )
 
-    def one_hot_encoding(self, cats, dtype="float64"):
-        """Perform one-hot-encoding
-
-        Parameters
-        ----------
-        cats : sequence of values
-                values representing each category.
-        dtype : numpy.dtype
-                specifies the output dtype.
-
-        Returns
-        -------
-        Sequence
-            A sequence of new series for each category. Its length is
-            determined by the length of ``cats``.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> s = cudf.Series(['a', 'b', 'c', 'a'])
-        >>> s
-        0    a
-        1    b
-        2    c
-        3    a
-        dtype: object
-        >>> s.one_hot_encoding(['a', 'c', 'b'])
-        [0    1.0
-        1    0.0
-        2    0.0
-        3    1.0
-        dtype: float64, 0    0.0
-        1    0.0
-        2    1.0
-        3    0.0
-        dtype: float64, 0    0.0
-        1    1.0
-        2    0.0
-        3    0.0
-        dtype: float64]
-        """
-
-        warnings.warn(
-            "Series.one_hot_encoding is deprecated and will be removed in "
-            "future, use `get_dummies` instead.",
-            FutureWarning,
-        )
-
-        if hasattr(cats, "to_arrow"):
-            cats = cats.to_pandas()
-        else:
-            cats = pd.Series(cats, dtype="object")
-        dtype = cudf.dtype(dtype)
-
-        try:
-            cats_col = as_column(cats, nan_as_null=False, dtype=self.dtype)
-        except TypeError:
-            raise ValueError("Cannot convert `cats` as cudf column.")
-
-        if self._column.size * cats_col.size >= np.iinfo("int32").max:
-            raise ValueError(
-                "Size limitation exceeded: series.size * category.size < "
-                "np.iinfo('int32').max. Consider reducing size of category"
-            )
-
-        res = libcudf.transform.one_hot_encode(self._column, cats_col)
-        if dtype.type == np.bool_:
-            return [
-                Series._from_data({None: x}, index=self._index)
-                for x in list(res.values())
-            ]
-        else:
-            return [
-                Series._from_data({None: x.astype(dtype)}, index=self._index)
-                for x in list(res.values())
-            ]
-
     def label_encoding(self, cats, dtype=None, na_sentinel=-1):
         """Perform label encoding.
 

diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
@@ -7,108 +7,23 @@
 import pytest
 
 import cudf
-from cudf import DataFrame, Index, Series
+from cudf import DataFrame
 from cudf.testing import _utils as utils
 
 
-def test_onehot_simple():
-    np.random.seed(0)
-    df = DataFrame()
-    # Populate with data [0, 10)
-    df["vals"] = np.arange(10, dtype=np.int32)
-    # One Hot (Series)
-    for i, col in enumerate(df["vals"].one_hot_encoding(list(range(10)))):
-        arr = col.to_numpy()
-        # Verify 1 in the right position
-        np.testing.assert_equal(arr[i], 1)
-        # Every other slots are 0s
-        np.testing.assert_equal(arr[:i], 0)
-        np.testing.assert_equal(arr[i + 1 :], 0)
-    # One Hot (DataFrame)
-    df2 = df.one_hot_encoding(
-        column="vals", prefix="vals", cats=list(range(10))
-    )
-    assert df2.columns[0] == "vals"
-    for i in range(1, len(df2.columns)):
-        assert df2.columns[i] == "vals_%s" % (i - 1)
-    got = df2[df2.columns[1:]].values_host
-    expect = np.identity(got.shape[0])
-    np.testing.assert_equal(got, expect)
-
-
-def test_onehot_random():
-    df = DataFrame()
-    low = 10
-    high = 17
-    size = 10
-    df["src"] = src = np.random.randint(low=low, high=high, size=size)
-    df2 = df.one_hot_encoding(
-        column="src", prefix="out_", cats=tuple(range(10, 17))
-    )
-    mat = df2[df2.columns[1:]].values_host
-
-    for val in range(low, high):
-        colidx = val - low
-        arr = mat[:, colidx]
-        mask = src == val
-        np.testing.assert_equal(arr, mask)
-
-
-def test_onehot_masked():
-    np.random.seed(0)
-    high = 5
-    size = 100
-    arr = np.random.randint(low=0, high=high, size=size)
-    bitmask = utils.random_bitmask(size)
-    bytemask = np.asarray(
-        utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_
-    )
-    arr[~bytemask] = -1
-
-    df = DataFrame()
-    df["a"] = Series(arr).set_mask(bitmask)
-
-    out = df.one_hot_encoding(
-        "a", cats=list(range(high)), prefix="a", dtype=np.int32
-    )
-
-    assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4")
-    np.testing.assert_array_equal((out["a_0"] == 1).to_numpy(), arr == 0)
-    np.testing.assert_array_equal((out["a_1"] == 1).to_numpy(), arr == 1)
-    np.testing.assert_array_equal((out["a_2"] == 1).to_numpy(), arr == 2)
-    np.testing.assert_array_equal((out["a_3"] == 1).to_numpy(), arr == 3)
-    np.testing.assert_array_equal((out["a_4"] == 1).to_numpy(), arr == 4)
-
-
-def test_onehot_generic_index():
-    np.random.seed(0)
-    size = 33
-    indices = np.random.randint(low=0, high=100, size=size)
-    df = DataFrame()
-    values = np.random.randint(low=0, high=4, size=size)
-    df["fo"] = Series(values, index=Index(indices))
-    out = df.one_hot_encoding(
-        "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32
-    )
-    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}
-    np.testing.assert_array_equal(values == 0, out.fo_0.to_numpy())
-    np.testing.assert_array_equal(values == 1, out.fo_1.to_numpy())
-    np.testing.assert_array_equal(values == 2, out.fo_2.to_numpy())
-    np.testing.assert_array_equal(values == 3, out.fo_3.to_numpy())
-
-
 @pytest.mark.parametrize(
-    "data",
+    "data, index",
     [
-        np.arange(10),
-        ["abc", "zyx", "pppp"],
-        [],
-        pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"),
+        (np.arange(10), None),
+        (["abc", "zyx", "pppp"], None),
+        ([], None),
+        (pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"), None),
+        (range(10), [1, 2, 3, 4, 5] * 2),
     ],
 )
-def test_get_dummies(data):
-    gdf = DataFrame({"x": data})
-    pdf = pd.DataFrame({"x": data})
+def test_get_dummies(data, index):
+    gdf = DataFrame({"x": data}, index=index)
+    pdf = pd.DataFrame({"x": data}, index=index)
 
     encoded_expected = pd.get_dummies(pdf, prefix="test")
     encoded_actual = cudf.get_dummies(gdf, prefix="test")