diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 08a2946279a98..848fa5a2d7629 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -221,6 +221,28 @@ enhancements are performed to make string operation easier. idx.str.startswith('a') s[s.index.str.startswith('a')] + +- ``split`` now takes ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`) + + .. ipython:: python + + s = Series(['a,b', 'a,c', 'b,c']) + + # return Series + s.str.split(',') + + # return DataFrame + s.str.split(',', expand=True) + + idx = Index(['a,b', 'a,c', 'b,c']) + + # return Index + idx.str.split(',') + + # return MultiIndex + idx.str.split(',', expand=True) + + - Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) .. _whatsnew_0161.api: @@ -249,6 +271,13 @@ API changes - By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`) +.. _whatsnew_0161.deprecations: + +Deprecations +^^^^^^^^^^^^ + +- ``Series.str.split``'s ``return_type`` keyword is deprecated in favor of ``expand`` (:issue:`9847`) + .. 
_whatsnew_0161.performance: Performance Improvements diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a25879e61b580..21ae99d27eaeb 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -3,7 +3,7 @@ from pandas.compat import zip from pandas.core.common import isnull, _values_from_object, is_bool_dtype import pandas.compat as compat -from pandas.util.decorators import Appender +from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib import warnings @@ -696,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '): return _na_map(f, arr) -def str_split(arr, pat=None, n=None, return_type='series'): +def str_split(arr, pat=None, n=None): """ Split each string (a la re.split) in the Series/Index by given pattern, propagating NA values. Equivalent to :meth:`str.split`. @@ -706,28 +706,16 @@ def str_split(arr, pat=None, n=None, return_type='series'): pat : string, default None String or regular expression to split on. If None, splits on whitespace n : int, default None (all) - return_type : {'series', 'index', 'frame'}, default 'series' - If frame, returns a DataFrame (elements are strings) - If series or index, returns the same type as the original object - (elements are lists of strings). - - Notes - ----- - Both 0 and -1 will be interpreted as return all splits + Both 0 and -1 will be interpreted as return all splits + expand : bool, default False + * If True, return DataFrame/MultiIndex expanding dimensionality. + * If False, return Series/Index. 
+ return_type : deprecated, use `expand` Returns ------- - split : Series/Index of objects or DataFrame + split : Series/Index or DataFrame/MultiIndex of objects """ - from pandas.core.series import Series - from pandas.core.frame import DataFrame - from pandas.core.index import Index - - if return_type not in ('series', 'index', 'frame'): - raise ValueError("return_type must be {'series', 'index', 'frame'}") - if return_type == 'frame' and isinstance(arr, Index): - raise ValueError("return_type='frame' is not supported for string " - "methods on Index") if pat is None: if n is None or n == 0: n = -1 @@ -742,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'): n = 0 regex = re.compile(pat) f = lambda x: regex.split(x, maxsplit=n) - if return_type == 'frame': - res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index) - else: - res = _na_map(f, arr) + res = _na_map(f, arr) return res @@ -1083,7 +1068,10 @@ def _wrap_result(self, result, **kwargs): return DataFrame(result, index=self.series.index) def _wrap_result_expand(self, result, expand=False): - from pandas.core.index import Index + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + from pandas.core.index import Index, MultiIndex if not hasattr(result, 'ndim'): return result @@ -1096,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False): if expand: result = list(result) - return Index(result, name=name) + return MultiIndex.from_tuples(result, names=name) + else: + return Index(result, name=name) else: index = self.series.index if expand: @@ -1114,10 +1104,12 @@ def cat(self, others=None, sep=None, na_rep=None): result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep) return self._wrap_result(result) + @deprecate_kwarg('return_type', 'expand', + mapping={'series': False, 'frame': True}) @copy(str_split) - def split(self, pat=None, n=-1, return_type='series'): - result = str_split(self.series, pat, n=n, 
return_type=return_type) - return self._wrap_result(result) + def split(self, pat=None, n=-1, expand=False): + result = str_split(self.series, pat, n=n) + return self._wrap_result_expand(result, expand=expand) _shared_docs['str_partition'] = (""" Split the string at the %(side)s occurrence of `sep`, and return 3 elements @@ -1131,7 +1123,7 @@ def split(self, pat=None, n=-1, return_type='series'): String to split on. expand : bool, default True * If True, return DataFrame/MultiIndex expanding dimensionality. - * If False, return Series/Index + * If False, return Series/Index. Returns ------- diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 2ee9d405d9601..0c8c8be5217c3 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1280,11 +1280,12 @@ def test_str_attribute(self): idx = Index(['a b c', 'd e', 'f']) expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']]) tm.assert_index_equal(idx.str.split(), expected) - tm.assert_index_equal(idx.str.split(return_type='series'), expected) - # return_type 'index' is an alias for 'series' - tm.assert_index_equal(idx.str.split(return_type='index'), expected) - with self.assertRaisesRegexp(ValueError, 'not supported'): - idx.str.split(return_type='frame') + tm.assert_index_equal(idx.str.split(expand=False), expected) + + expected = MultiIndex.from_tuples([('a', 'b', 'c'), + ('d', 'e', np.nan), + ('f', np.nan, np.nan)]) + tm.assert_index_equal(idx.str.split(expand=True), expected) # test boolean case, should return np.array instead of boolean Index idx = Index(['a1', 'a2', 'b1', 'b2']) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 9011e6c64b097..b0d8d89d65cf2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1206,14 +1206,19 @@ def test_split(self): result = values.str.split('__') tm.assert_series_equal(result, exp) + result = values.str.split('__', expand=False) + tm.assert_series_equal(result, exp) + # mixed mixed = 
Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, 2.]) - - rs = Series(mixed).str.split('_') + rs = mixed.str.split('_') xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA]) + tm.assert_isinstance(rs, Series) + tm.assert_almost_equal(rs, xp) + rs = mixed.str.split('_', expand=False) tm.assert_isinstance(rs, Series) tm.assert_almost_equal(rs, xp) @@ -1226,6 +1231,9 @@ def test_split(self): [u('f'), u('g'), u('h')]]) tm.assert_series_equal(result, exp) + result = values.str.split('_', expand=False) + tm.assert_series_equal(result, exp) + def test_split_noargs(self): # #1859 s = Series(['Wes McKinney', 'Travis Oliphant']) @@ -1259,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self): def test_split_to_dataframe(self): s = Series(['nosplit', 'alsonosplit']) - result = s.str.split('_', return_type='frame') + + with tm.assert_produces_warning(): + result = s.str.split('_', return_type='frame') + exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) tm.assert_frame_equal(result, exp) @@ -1282,9 +1293,61 @@ def test_split_to_dataframe(self): index=['preserve', 'me']) tm.assert_frame_equal(result, exp) - with tm.assertRaisesRegexp(ValueError, "return_type must be"): + with tm.assertRaisesRegexp(ValueError, "expand must be"): + s.str.split('_', return_type="some_invalid_type") + + def test_split_to_dataframe_expand(self): + s = Series(['nosplit', 'alsonosplit']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_equal_splits', 'with_no_nans']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], + 2: ['splits', 'nans']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'], + 2: ['splits', 'these'], 3: [NA, 'things'], + 4: [NA, 
'is'], 5: [NA, 'not']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) + result = s.str.split('_', expand=True) + exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, + index=['preserve', 'me']) + tm.assert_frame_equal(result, exp) + + with tm.assertRaisesRegexp(ValueError, "expand must be"): s.str.split('_', return_type="some_invalid_type") + def test_split_to_multiindex_expand(self): + idx = Index(['nosplit', 'alsonosplit']) + result = idx.str.split('_', expand=True) + exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + idx = Index(['some_equal_splits', 'with_no_nans']) + result = idx.str.split('_', expand=True) + exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), + ('with', 'no', 'nans')]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 3) + + idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) + result = idx.str.split('_', expand=True) + exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA), + ('one', 'of', 'these', 'things', 'is', 'not')]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 6) + + with tm.assertRaisesRegexp(ValueError, "expand must be"): + idx.str.split('_', return_type="some_invalid_type") + def test_partition_series(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])