Skip to content

Commit

Permalink
ENH: let get_dummies take a DataFrame
Browse files Browse the repository at this point in the history
implement via 1d

fixup docstring, tests

add documentation

test for dicts
  • Loading branch information
TomAugspurger committed Aug 31, 2014
1 parent b82a4e6 commit 78ccfac
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 6 deletions.
43 changes: 43 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,49 @@ This function is often used along with discretization functions like ``cut``:
See also :func:`Series.str.get_dummies <pandas.core.strings.StringMethods.get_dummies>`.

.. versionadded:: 0.15.0

:func:`get_dummies` also accepts a DataFrame. By default all categorical
variables (categorical in the statistical sense: those with ``object`` or
``category`` dtype) are encoded as dummy variables.


.. ipython:: python
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
'C': [1, 2, 3]})
pd.get_dummies(df)
All columns that are not encoded (here the integer column ``C``) are included untouched in the output.

You can control the columns that are encoded with the ``columns`` keyword.

.. ipython:: python
pd.get_dummies(df, columns=['A'])
Notice that the ``B`` column is still included in the output; it just hasn't
been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't
want to include it in the output.

As with the Series version, you can pass values for the ``prefix`` and
``prefix_sep``. By default the column name is used as the prefix, and '_' as
the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:

- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column
to be encoded
- list: Must be the same length as the number of columns being encoded.
- dict: Mapping column name to prefix

.. ipython:: python
simple = pd.get_dummies(df, prefix='new_prefix')
simple
from_list = pd.get_dummies(df, prefix=['from_A', 'from_B'])
from_list
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict
Factorizing values
------------------

Expand Down
8 changes: 8 additions & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,15 @@ Enhancements



- The ``get_dummies`` method can now be used on DataFrames. By default only
categorical columns are encoded as 0's and 1's, while other columns are
left untouched.

.. ipython:: python

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
'C': [1, 2, 3]})
pd.get_dummies(df)



Expand Down
83 changes: 77 additions & 6 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -981,25 +981,34 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
"""
result = data.drop(cat_variables, axis=1)
for variable in cat_variables:
dummies = get_dummies(data[variable], prefix=variable,
prefix_sep=prefix_sep)
dummies = _get_dummies_1d(data[variable], prefix=variable,
prefix_sep=prefix_sep)
result = result.join(dummies)
return result


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
columns=None):
"""
Convert categorical variable into dummy/indicator variables
Parameters
----------
data : array-like or Series
prefix : string, default None
data : array-like, Series, or DataFrame
prefix : string, list of strings, or dict of strings, default None
String to append DataFrame column names
Pass a list with length equal to the number of columns
when calling get_dummies on a DataFrame. Alternatively, `prefix`
can be a dictionary mapping column names to prefixes.
prefix_sep : string, default '_'
If appending prefix, separator/delimiter to use
If appending prefix, separator/delimiter to use. Or pass a
list or dictionary as with `prefix`.
dummy_na : bool, default False
Add a column to indicate NaNs, if False NaNs are ignored.
columns : list-like, default None
Column names in the DataFrame to be encoded.
If `columns` is None then all the columns with
`object` or `category` dtype will be converted.
Returns
-------
Expand Down Expand Up @@ -1031,9 +1040,71 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
1 0 1 0
2 0 0 1
>>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
'C': [1, 2, 3]})
>>> get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
0 1 1 0 0 1 0
1 2 0 1 1 0 0
2 3 1 0 0 0 1
See also ``Series.str.get_dummies``.
"""
from pandas.tools.merge import concat
from itertools import cycle

if isinstance(data, DataFrame):
# determine columns being encoded

if columns is None:
columns_to_encode = data.select_dtypes(include=['object',
'category']).columns
else:
columns_to_encode = columns

# validate prefixes and separator to avoid silently dropping cols
def check_len(item, name):
length_msg = ("Length of '{0}' ({1}) did "
"not match the length of the columns "
"being encoded ({2}).")

if com.is_list_like(item):
if not len(item) == len(columns_to_encode):
raise ValueError(length_msg.format(name, len(item),
len(columns_to_encode)))

check_len(prefix, 'prefix')
check_len(prefix_sep, 'prefix_sep')
if isinstance(prefix, compat.string_types):
prefix = cycle([prefix])
if isinstance(prefix, dict):
prefix = [prefix[col] for col in columns_to_encode]

if prefix is None:
prefix = columns_to_encode

# validate separators
if isinstance(prefix_sep, compat.string_types):
prefix_sep = cycle([prefix_sep])
elif isinstance(prefix_sep, dict):
prefix_sep = [prefix_sep[col] for col in columns_to_encode]

result = data.drop(columns_to_encode, axis=1)
with_dummies = [result]
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

dummy = _get_dummies_1d(data[col], prefix=pre,
prefix_sep=sep, dummy_na=dummy_na)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
return result


def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data))
levels = cat.levels
Expand Down
113 changes: 113 additions & 0 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,11 @@ def test_multiindex(self):


class TestGetDummies(tm.TestCase):

def setUp(self):
    """Build the shared fixture: two string columns and one int column."""
    fixture_columns = {
        'A': ['a', 'b', 'a'],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    }
    self.df = DataFrame(fixture_columns)

def test_basic(self):
s_list = list('abc')
s_series = Series(s_list)
Expand Down Expand Up @@ -209,6 +214,114 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
assert_frame_equal(res, exp)

def test_dataframe_dummies_all_obj(self):
    """A frame holding only the string columns is fully dummy-encoded."""
    strings_only = self.df[['A', 'B']]
    expected = DataFrame({
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
    })
    result = get_dummies(strings_only)
    assert_frame_equal(result, expected)

def test_dataframe_dummies_mix_default(self):
    """Default call: ``C`` is kept first, ``A`` and ``B`` are encoded."""
    column_order = ['C', 'A_a', 'A_b', 'B_b', 'B_c']
    expected = DataFrame({
        'C': [1, 2, 3],
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
    })
    expected = expected[column_order]
    result = get_dummies(self.df)
    assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_list(self):
    """A list ``prefix`` supplies one prefix per encoded column, in order."""
    frame = DataFrame({
        'A': ['a', 'b', 'a'],
        'B': ['b', 'b', 'c'],
        'C': [1, 2, 3],
    })
    column_order = ['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
    expected = DataFrame({
        'C': [1, 2, 3],
        'from_A_a': [1., 0, 1],
        'from_A_b': [0., 1, 0],
        'from_B_b': [1., 1, 0],
        'from_B_c': [0., 0, 1],
    })
    expected = expected[column_order]
    result = get_dummies(frame, prefix=['from_A', 'from_B'])
    assert_frame_equal(result, expected)

def test_datafrmae_dummies_prefix_str(self):
    """A single string ``prefix`` is reused for every encoded column.

    Sharing one prefix produces the label ``bad_b`` twice, so this is
    permitted but not something callers should rely on.
    """
    # NOTE(review): the method name misspells "dataframe"; it is left
    # unchanged here so the test id stays stable.
    labels = ['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']
    rows = [
        [1, 1., 0., 1., 0.],
        [2, 0., 1., 1., 0.],
        [3, 1., 0., 0., 1.],
    ]
    expected = DataFrame(rows, columns=labels)
    result = get_dummies(self.df, prefix='bad')
    assert_frame_equal(result, expected)

def test_dataframe_dummies_subset(self):
    """Only the columns named in ``columns`` are encoded.

    ``B`` is a string column but is not listed, so it must pass through
    unchanged next to ``C``; the dummies for ``A`` follow them.
    """
    df = self.df
    result = get_dummies(df, prefix=['from_A'], columns=['A'])
    expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
                          'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
    # Pin the column order explicitly (as the sibling tests do) instead of
    # relying on how the DataFrame constructor orders dict keys: the
    # untouched columns come first, then the dummy columns.
    expected = expected[['B', 'C', 'from_A_a', 'from_A_b']]
    assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self):
    """``prefix_sep`` may be one string, a list, or a dict keyed by column."""
    frame = self.df

    # One separator shared by every encoded column.
    result = get_dummies(frame, prefix_sep='..')
    shared = DataFrame({
        'C': [1, 2, 3],
        'A..a': [1., 0, 1],
        'A..b': [0., 1, 0],
        'B..b': [1., 1, 0],
        'B..c': [0., 0, 1],
    })
    shared = shared[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
    assert_frame_equal(result, shared)

    # A separate separator per column; list and dict forms must agree.
    per_column = shared.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
    from_list = get_dummies(frame, prefix_sep=['..', '__'])
    assert_frame_equal(from_list, per_column)

    from_dict = get_dummies(frame, prefix_sep={'A': '..', 'B': '__'})
    assert_frame_equal(from_dict, per_column)

def test_dataframe_dummies_prefix_bad_length(self):
    """One prefix for the fixture's two string columns raises ValueError."""
    too_short = ['too few']
    with tm.assertRaises(ValueError):
        get_dummies(self.df, prefix=too_short)

def test_dataframe_dummies_prefix_sep_bad_length(self):
    """One separator for the fixture's two string columns raises ValueError."""
    too_short = ['bad']
    with tm.assertRaises(ValueError):
        get_dummies(self.df, prefix_sep=too_short)

def test_dataframe_dummies_prefix_dict(self):
    """A dict ``prefix`` maps each encoded column name to its prefix."""
    prefixes = {'A': 'from_A', 'B': 'from_B'}
    df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
                    'C': [1, 2, 3]})
    result = get_dummies(df, prefix=prefixes)
    expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
                          'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
                          'C': [1, 2, 3]})
    # Pin the column order explicitly (as the prefix-list test does) instead
    # of relying on how the DataFrame constructor orders dict keys: the
    # untouched column comes first, then the dummies for A and for B.
    expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
                         'from_B_c']]
    assert_frame_equal(result, expected)

def test_dataframe_dummies_with_na(self):
    """``dummy_na`` toggles the extra ``*_nan`` indicator columns."""
    frame = self.df
    # Append an all-NaN row (label 3) to the fixture.
    frame.loc[3, :] = [np.nan] * 3

    with_nan_cols = ['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
    result = get_dummies(frame, dummy_na=True)
    expected = DataFrame({
        'C': [1, 2, 3, np.nan],
        'A_a': [1., 0, 1, 0],
        'A_b': [0., 1, 0, 0],
        'A_nan': [0., 0, 0, 1],
        'B_b': [1., 1, 0, 0],
        'B_c': [0., 0, 1, 0],
        'B_nan': [0., 0, 0, 1],
    })
    expected = expected[with_nan_cols]
    assert_frame_equal(result, expected)

    # Without dummy_na the same frame minus the *_nan columns is expected.
    without_nan_cols = ['C', 'A_a', 'A_b', 'B_b', 'B_c']
    result = get_dummies(frame, dummy_na=False)
    expected = expected[without_nan_cols]
    assert_frame_equal(result, expected)

def test_dataframe_dummies_with_categorical(self):
    """A Categorical column is encoded by default alongside the strings."""
    frame = self.df
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    column_order = ['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
    expected = DataFrame({
        'C': [1, 2, 3],
        'A_a': [1., 0, 1],
        'A_b': [0., 1, 0],
        'B_b': [1., 1, 0],
        'B_c': [0., 0, 1],
        'cat_x': [1., 0, 0],
        'cat_y': [0., 1, 1],
    })
    expected = expected[column_order]
    result = get_dummies(frame)
    assert_frame_equal(result, expected)


class TestConvertDummies(tm.TestCase):
def test_convert_dummies(self):
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
Expand Down

0 comments on commit 78ccfac

Please sign in to comment.