From fc0fa934124d924f16b6728ce368ef97b5a2557d Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 5 Jan 2022 16:41:48 -0800 Subject: [PATCH] Remove methods and tests, add coverage for generic index in get_dummies --- python/cudf/cudf/core/dataframe.py | 72 ------------------ python/cudf/cudf/core/reshape.py | 12 +-- python/cudf/cudf/core/series.py | 77 ------------------- python/cudf/cudf/tests/test_onehot.py | 105 +++----------------------- 4 files changed, 16 insertions(+), 250 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d97ea456f72..3366a0af4ba 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3051,78 +3051,6 @@ def as_matrix(self, columns=None): ) return self.as_gpu_matrix(columns=columns).copy_to_host() - def one_hot_encoding( - self, column, prefix, cats, prefix_sep="_", dtype="float64" - ): - """ - Expand a column with one-hot-encoding. - - Parameters - ---------- - - column : str - the source column with binary encoding for the data. - prefix : str - the new column name prefix. - cats : sequence of ints - the sequence of categories as integers. - prefix_sep : str - the separator between the prefix and the category. - dtype : - the dtype for the outputs; defaults to float64. - - Returns - ------- - - a new dataframe with new columns append for each category. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> pet_owner = [1, 2, 3, 4, 5] - >>> pet_type = ['fish', 'dog', 'fish', 'bird', 'fish'] - >>> df = pd.DataFrame({'pet_owner': pet_owner, 'pet_type': pet_type}) - >>> df.pet_type = df.pet_type.astype('category') - - Create a column with numerically encoded category values - - >>> df['pet_codes'] = df.pet_type.cat.codes - >>> gdf = cudf.from_pandas(df) - - Create the list of category codes to use in the encoding - - >>> codes = gdf.pet_codes.unique() - >>> gdf.one_hot_encoding('pet_codes', 'pet_dummy', codes).head() - pet_owner pet_type pet_codes pet_dummy_0 pet_dummy_1 pet_dummy_2 - 0 1 fish 2 0.0 0.0 1.0 - 1 2 dog 1 0.0 1.0 0.0 - 2 3 fish 2 0.0 0.0 1.0 - 3 4 bird 0 1.0 0.0 0.0 - 4 5 fish 2 0.0 0.0 1.0 - """ - - warnings.warn( - "DataFrame.one_hot_encoding is deprecated and will be removed in " - "future, use `get_dummies` instead.", - FutureWarning, - ) - - if hasattr(cats, "to_arrow"): - cats = cats.to_arrow().to_pylist() - else: - cats = pd.Series(cats, dtype="object") - - newnames = [ - prefix_sep.join([prefix, "null" if cat is None else str(cat)]) - for cat in cats - ] - newcols = self[column].one_hot_encoding(cats=cats, dtype=dtype) - outdf = self.copy() - for name, col in zip(newnames, newcols): - outdf.insert(len(outdf._data), name, col) - return outdf - def label_encoding( self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 ): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b2fac7a6140..1733a6c0b9a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -600,18 +600,18 @@ def get_dummies( df : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, dict, or sequence, optional - prefix to append. Either a str (to apply a constant prefix), dict + Prefix to append. Either a str (to apply a constant prefix), dict mapping column names to prefixes, or sequence of prefixes to apply with the same length as the number of columns. If not supplied, defaults to the empty string prefix_sep : str, dict, or sequence, optional, default '_' - separator to use when appending prefixes + Separator to use when appending prefixes dummy_na : boolean, optional Add a column to indicate Nones, if False Nones are ignored. cats : dict, optional - dictionary mapping column names to sequences of integers representing - that column's category. See `cudf.DataFrame.one_hot_encoding` for more - information. if not supplied, it will be computed + Dictionary mapping column names to sequences of values representing + that column's category. If not supplied, it is computed as the unique + values of the column. sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional @@ -621,7 +621,7 @@ def get_dummies( columns. Note this is different from pandas default behavior, which encodes all columns with dtype object or categorical dtype : str, optional - output dtype, default 'uint8' + Output dtype, default 'uint8' Examples -------- diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fb86cf85c4c..178c40b3cd8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2264,83 +2264,6 @@ def reverse(self): {self.name: self._column[rinds]}, self.index._values[rinds] ) - def one_hot_encoding(self, cats, dtype="float64"): - """Perform one-hot-encoding - - Parameters - ---------- - cats : sequence of values - values representing each category. - dtype : numpy.dtype - specifies the output dtype. - - Returns - ------- - Sequence - A sequence of new series for each category. Its length is - determined by the length of ``cats``. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a', 'b', 'c', 'a']) - >>> s - 0 a - 1 b - 2 c - 3 a - dtype: object - >>> s.one_hot_encoding(['a', 'c', 'b']) - [0 1.0 - 1 0.0 - 2 0.0 - 3 1.0 - dtype: float64, 0 0.0 - 1 0.0 - 2 1.0 - 3 0.0 - dtype: float64, 0 0.0 - 1 1.0 - 2 0.0 - 3 0.0 - dtype: float64] - """ - - warnings.warn( - "Series.one_hot_encoding is deprecated and will be removed in " - "future, use `get_dummies` instead.", - FutureWarning, - ) - - if hasattr(cats, "to_arrow"): - cats = cats.to_pandas() - else: - cats = pd.Series(cats, dtype="object") - dtype = cudf.dtype(dtype) - - try: - cats_col = as_column(cats, nan_as_null=False, dtype=self.dtype) - except TypeError: - raise ValueError("Cannot convert `cats` as cudf column.") - - if self._column.size * cats_col.size >= np.iinfo("int32").max: - raise ValueError( - "Size limitation exceeded: series.size * category.size < " - "np.iinfo('int32').max. Consider reducing size of category" - ) - - res = libcudf.transform.one_hot_encode(self._column, cats_col) - if dtype.type == np.bool_: - return [ - Series._from_data({None: x}, index=self._index) - for x in list(res.values()) - ] - else: - return [ - Series._from_data({None: x.astype(dtype)}, index=self._index) - for x in list(res.values()) - ] - def label_encoding(self, cats, dtype=None, na_sentinel=-1): """Perform label encoding. diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index f2a20a73b63..2b0422ffecb 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -7,108 +7,23 @@ import pytest import cudf -from cudf import DataFrame, Index, Series +from cudf import DataFrame from cudf.testing import _utils as utils -def test_onehot_simple(): - np.random.seed(0) - df = DataFrame() - # Populate with data [0, 10) - df["vals"] = np.arange(10, dtype=np.int32) - # One Hot (Series) - for i, col in enumerate(df["vals"].one_hot_encoding(list(range(10)))): - arr = col.to_numpy() - # Verify 1 in the right position - np.testing.assert_equal(arr[i], 1) - # Every other slots are 0s - np.testing.assert_equal(arr[:i], 0) - np.testing.assert_equal(arr[i + 1 :], 0) - # One Hot (DataFrame) - df2 = df.one_hot_encoding( - column="vals", prefix="vals", cats=list(range(10)) - ) - assert df2.columns[0] == "vals" - for i in range(1, len(df2.columns)): - assert df2.columns[i] == "vals_%s" % (i - 1) - got = df2[df2.columns[1:]].values_host - expect = np.identity(got.shape[0]) - np.testing.assert_equal(got, expect) - - -def test_onehot_random(): - df = DataFrame() - low = 10 - high = 17 - size = 10 - df["src"] = src = np.random.randint(low=low, high=high, size=size) - df2 = df.one_hot_encoding( - column="src", prefix="out_", cats=tuple(range(10, 17)) - ) - mat = df2[df2.columns[1:]].values_host - - for val in range(low, high): - colidx = val - low - arr = mat[:, colidx] - mask = src == val - np.testing.assert_equal(arr, mask) - - -def test_onehot_masked(): - np.random.seed(0) - high = 5 - size = 100 - arr = np.random.randint(low=0, high=high, size=size) - bitmask = utils.random_bitmask(size) - bytemask = np.asarray( - utils.expand_bits_to_bytes(bitmask)[:size], dtype=np.bool_ - ) - arr[~bytemask] = -1 - - df = DataFrame() - df["a"] = Series(arr).set_mask(bitmask) - - out = df.one_hot_encoding( - "a", cats=list(range(high)), prefix="a", dtype=np.int32 - ) - - assert tuple(out.columns) == ("a", "a_0", "a_1", "a_2", "a_3", "a_4") - np.testing.assert_array_equal((out["a_0"] == 1).to_numpy(), arr == 0) - np.testing.assert_array_equal((out["a_1"] == 1).to_numpy(), arr == 1) - np.testing.assert_array_equal((out["a_2"] == 1).to_numpy(), arr == 2) - np.testing.assert_array_equal((out["a_3"] == 1).to_numpy(), arr == 3) - np.testing.assert_array_equal((out["a_4"] == 1).to_numpy(), arr == 4) - - -def test_onehot_generic_index(): - np.random.seed(0) - size = 33 - indices = np.random.randint(low=0, high=100, size=size) - df = DataFrame() - values = np.random.randint(low=0, high=4, size=size) - df["fo"] = Series(values, index=Index(indices)) - out = df.one_hot_encoding( - "fo", cats=df.fo.unique(), prefix="fo", dtype=np.int32 - ) - assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"} - np.testing.assert_array_equal(values == 0, out.fo_0.to_numpy()) - np.testing.assert_array_equal(values == 1, out.fo_1.to_numpy()) - np.testing.assert_array_equal(values == 2, out.fo_2.to_numpy()) - np.testing.assert_array_equal(values == 3, out.fo_3.to_numpy()) - - @pytest.mark.parametrize( - "data", + "data, index", [ - np.arange(10), - ["abc", "zyx", "pppp"], - [], - pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"), + (np.arange(10), None), + (["abc", "zyx", "pppp"], None), + ([], None), + (pd.Series(["cudf", "hello", "pandas"] * 10, dtype="category"), None), + (range(10), [1, 2, 3, 4, 5] * 2), ], ) -def test_get_dummies(data): - gdf = DataFrame({"x": data}) - pdf = pd.DataFrame({"x": data}) +def test_get_dummies(data, index): + gdf = DataFrame({"x": data}, index=index) + pdf = pd.DataFrame({"x": data}, index=index) encoded_expected = pd.get_dummies(pdf, prefix="test") encoded_actual = cudf.get_dummies(gdf, prefix="test")