diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 59737325291cc..611771a47c233 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -539,6 +539,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'}) from_dict +.. versionadded:: 0.18.0 + +Sometimes it will be useful to only keep k-1 levels of a categorical +variable to avoid collinearity when feeding the result to statistical models. +You can switch to this mode by turning on ``drop_first``. + +.. ipython:: python + + s = pd.Series(list('abcaa')) + + pd.get_dummies(s) + + pd.get_dummies(s, drop_first=True) + +When a column contains only one level, it will be omitted in the result. + +.. ipython:: python + + df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')}) + + pd.get_dummies(df) + + pd.get_dummies(df, drop_first=True) + + + Factorizing values ------------------ diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index a31efc63269b6..0643250484839 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -949,7 +949,7 @@ def melt_stub(df, stub, i, j): def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False): + columns=None, sparse=False, drop_first=False): """ Convert categorical variable into dummy/indicator variables @@ -976,7 +976,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Otherwise returns a DataFrame with some SparseBlocks. .. versionadded:: 0.16.1 + drop_first : bool, default False + Whether to get k-1 dummies out of k categorical levels by removing the + first level. + .. 
versionadded:: 0.18.0 Returns ------- dummies : DataFrame or SparseDataFrame @@ -1016,6 +1020,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 1 2 0 1 1 0 0 2 3 1 0 0 0 1 + >>> pd.get_dummies(pd.Series(list('abcaa'))) + a b c + 0 1 0 0 + 1 0 1 0 + 2 0 0 1 + 3 1 0 0 + 4 1 0 0 + + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) + b c + 0 0 0 + 1 1 0 + 2 0 1 + 3 0 0 + 4 0 0 See also ``Series.str.get_dummies``. """ @@ -1065,23 +1084,23 @@ def check_len(item, name): for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, - dummy_na=dummy_na, sparse=sparse) + dummy_na=dummy_na, sparse=sparse, + drop_first=drop_first) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse) + sparse=sparse, drop_first=drop_first) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False): + sparse=False, drop_first=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories - # if all NaN - if not dummy_na and len(levels) == 0: + def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: @@ -1091,11 +1110,19 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, else: return SparseDataFrame(index=index) + # if all NaN + if not dummy_na and len(levels) == 0: + return get_empty_Frame(data, sparse) + codes = cat.codes.copy() if dummy_na: codes[codes == -1] = len(cat.categories) levels = np.append(cat.categories, np.nan) + # if dummy_na, we just fake a nan level. 
drop_first will drop it again + if drop_first and len(levels) == 1: + return get_empty_Frame(data, sparse) + number_of_cols = len(levels) if prefix is not None: @@ -1118,6 +1145,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, continue sp_indices[code].append(ndx) + if drop_first: + # remove first categorical level to avoid perfect collinearity + # GH12042 + sp_indices = sp_indices[1:] + dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), fill_value=0) @@ -1132,6 +1164,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, # reset NaN GH4446 dummy_mat[codes == -1] = 0 + if drop_first: + # remove first GH12042 + dummy_mat = dummy_mat[:, 1:] + dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 6de589f87cfd8..671c345898ec2 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -411,6 +411,111 @@ def test_dataframe_dummies_with_categorical(self): ]] assert_frame_equal(result, expected) + # GH12042 Add a new parameter `drop_first` to avoid collinearity + def test_basic_drop_first(self): + # Basic case + s_list = list('abc') + s_series = Series(s_list) + s_series_index = Series(s_list, list('ABC')) + + expected = DataFrame({'b': {0: 0.0, + 1: 1.0, + 2: 0.0}, + 'c': {0: 0.0, + 1: 0.0, + 2: 1.0}}) + + result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + expected.index = list('ABC') + result = get_dummies(s_series_index, sparse=self.sparse, + drop_first=True) + assert_frame_equal(result, expected) + + def test_basic_drop_first_one_level(self): + # Test the case where the categorical variable has only one level. 
+ s_list = list('aaa') + s_series = Series(s_list) + s_series_index = Series(s_list, list('ABC')) + + expected = DataFrame(index=np.arange(3)) + + result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + assert_frame_equal(result, expected) + + expected = DataFrame(index=list('ABC')) + result = get_dummies(s_series_index, sparse=self.sparse, + drop_first=True) + assert_frame_equal(result, expected) + + def test_basic_drop_first_NA(self): + # Test NA handling together with drop_first + s_NA = ['a', 'b', np.nan] + res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) + exp = DataFrame({'b': {0: 0.0, + 1: 1.0, + 2: 0.0}}) + assert_frame_equal(res, exp) + + res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, + drop_first=True) + exp_na = DataFrame({'b': {0: 0.0, + 1: 1.0, + 2: 0.0}, + nan: {0: 0.0, + 1: 0.0, + 2: 1.0}}).reindex_axis( + ['b', nan], 1) + assert_frame_equal(res_na, exp_na) + + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, + drop_first=True) + exp_just_na = DataFrame(index=np.arange(1)) + assert_frame_equal(res_just_na, exp_just_na) + + def test_dataframe_dummies_drop_first(self): + df = self.df[['A', 'B']] + result = get_dummies(df, sparse=self.sparse, drop_first=True) + expected = DataFrame({'A_b': [0., 1, 0], + 'B_c': [0., 0, 1]}) + assert_frame_equal(result, expected) + + def test_dataframe_dummies_drop_first_with_categorical(self): + df = self.df + df['cat'] = pd.Categorical(['x', 'y', 'y']) + result = get_dummies(df, sparse=self.sparse, drop_first=True) + expected = DataFrame({'C': [1, 2, 3], + 'A_b': [0., 1, 0], + 'B_c': [0., 0, 1], + 'cat_y': [0., 1, 1]}) + expected = expected[['C', 'A_b', 'B_c', 'cat_y']] + assert_frame_equal(result, expected) + + def test_dataframe_dummies_drop_first_with_na(self): + df = self.df + df.loc[3, :] = [np.nan, np.nan, np.nan] + result = get_dummies(df, 
dummy_na=True, sparse=self.sparse, + drop_first=True) + expected = DataFrame({'C': [1, 2, 3, np.nan], + 'A_b': [0., 1, 0, 0], + 'A_nan': [0., 0, 0, 1], + 'B_c': [0., 0, 1, 0], + 'B_nan': [0., 0, 0, 1]}) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] + assert_frame_equal(result, expected) + + result = get_dummies(df, dummy_na=False, sparse=self.sparse, + drop_first=True) + expected = expected[['C', 'A_b', 'B_c']] + assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True