Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Deprecate str.split return_type #10085

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,28 @@ enhancements are performed to make string operation easier.
idx.str.startswith('a')
s[s.index.str.startswith('a')]


- ``split`` now takes an ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`)

.. ipython:: python

s = Series(['a,b', 'a,c', 'b,c'])

# return Series
s.str.split(',')

# return DataFrame
s.str.split(',', expand=True)

idx = Index(['a,b', 'a,c', 'b,c'])

# return Index
idx.str.split(',')

# return MultiIndex
idx.str.split(',', expand=True)


- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)

.. _whatsnew_0161.api:
Expand Down Expand Up @@ -249,6 +271,13 @@ API changes

- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`)

.. _whatsnew_0161.deprecations:

Deprecations
^^^^^^^^^^^^

- ``Series.str.split``'s ``return_type`` keyword is deprecated in favor of ``expand`` (:issue:`9847`)

.. _whatsnew_0161.performance:

Performance Improvements
Expand Down
54 changes: 23 additions & 31 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pandas.compat import zip
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
import pandas.compat as compat
from pandas.util.decorators import Appender
from pandas.util.decorators import Appender, deprecate_kwarg
import re
import pandas.lib as lib
import warnings
Expand Down Expand Up @@ -696,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
return _na_map(f, arr)


def str_split(arr, pat=None, n=None, return_type='series'):
def str_split(arr, pat=None, n=None):
"""
Split each string (a la re.split) in the Series/Index by given
pattern, propagating NA values. Equivalent to :meth:`str.split`.
Expand All @@ -705,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
----------
pat : string, default None
String or regular expression to split on. If None, splits on whitespace
n : int, default None (all)
return_type : {'series', 'index', 'frame'}, default 'series'
If frame, returns a DataFrame (elements are strings)
If series or index, returns the same type as the original object
(elements are lists of strings).

Notes
-----
Both 0 and -1 will be interpreted as return all splits
n : int, default -1 (all)
None, 0 and -1 will be interpreted as return all splits
expand : bool, default False
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index.
return_type : deprecated, use `expand`

Returns
-------
split : Series/Index of objects or DataFrame
split : Series/Index or DataFrame/MultiIndex of objects
"""
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if return_type not in ('series', 'index', 'frame'):
raise ValueError("return_type must be {'series', 'index', 'frame'}")
if return_type == 'frame' and isinstance(arr, Index):
raise ValueError("return_type='frame' is not supported for string "
"methods on Index")
if pat is None:
if n is None or n == 0:
n = -1
Expand All @@ -742,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
n = 0
regex = re.compile(pat)
f = lambda x: regex.split(x, maxsplit=n)
if return_type == 'frame':
res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
else:
res = _na_map(f, arr)
res = _na_map(f, arr)
return res


Expand Down Expand Up @@ -1083,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
return DataFrame(result, index=self.series.index)

def _wrap_result_expand(self, result, expand=False):
from pandas.core.index import Index
if not isinstance(expand, bool):
raise ValueError("expand must be True or False")

from pandas.core.index import Index, MultiIndex
if not hasattr(result, 'ndim'):
return result

Expand All @@ -1096,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):

if expand:
result = list(result)
return Index(result, name=name)
return MultiIndex.from_tuples(result, names=name)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this partially closes #10008 yes?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I updated #10008 to reflect the current status. The current implementation can already return a MultiIndex for partition (#9773), but it doesn't work for split, hence this change.

else:
return Index(result, name=name)
else:
index = self.series.index
if expand:
Expand All @@ -1114,10 +1104,12 @@ def cat(self, others=None, sep=None, na_rep=None):
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
return self._wrap_result(result)

@deprecate_kwarg('return_type', 'expand',
mapping={'series': False, 'frame': True})
@copy(str_split)
def split(self, pat=None, n=-1, return_type='series'):
result = str_split(self.series, pat, n=n, return_type=return_type)
return self._wrap_result(result)
def split(self, pat=None, n=-1, expand=False):
result = str_split(self.series, pat, n=n)
return self._wrap_result_expand(result, expand=expand)

_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
Expand All @@ -1131,7 +1123,7 @@ def split(self, pat=None, n=-1, return_type='series'):
String to split on.
expand : bool, default True
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index
* If False, return Series/Index.

Returns
-------
Expand Down
11 changes: 6 additions & 5 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1280,11 +1280,12 @@ def test_str_attribute(self):
idx = Index(['a b c', 'd e', 'f'])
expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
tm.assert_index_equal(idx.str.split(), expected)
tm.assert_index_equal(idx.str.split(return_type='series'), expected)
# return_type 'index' is an alias for 'series'
tm.assert_index_equal(idx.str.split(return_type='index'), expected)
with self.assertRaisesRegexp(ValueError, 'not supported'):
idx.str.split(return_type='frame')
tm.assert_index_equal(idx.str.split(expand=False), expected)

expected = MultiIndex.from_tuples([('a', 'b', 'c'),
('d', 'e', np.nan),
('f', np.nan, np.nan)])
tm.assert_index_equal(idx.str.split(expand=True), expected)

# test boolean case, should return np.array instead of boolean Index
idx = Index(['a1', 'a2', 'b1', 'b2'])
Expand Down
71 changes: 67 additions & 4 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1206,14 +1206,19 @@ def test_split(self):
result = values.str.split('__')
tm.assert_series_equal(result, exp)

result = values.str.split('__', expand=False)
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
None, 1, 2.])

rs = Series(mixed).str.split('_')
rs = mixed.str.split('_')
xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
NA, NA, NA])
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

rs = mixed.str.split('_', expand=False)
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

Expand All @@ -1226,6 +1231,9 @@ def test_split(self):
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)

result = values.str.split('_', expand=False)
tm.assert_series_equal(result, exp)

def test_split_noargs(self):
# #1859
s = Series(['Wes McKinney', 'Travis Oliphant'])
Expand Down Expand Up @@ -1259,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self):

def test_split_to_dataframe(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.split('_', return_type='frame')

with tm.assert_produces_warning():
result = s.str.split('_', return_type='frame')

exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)

Expand All @@ -1282,9 +1293,61 @@ def test_split_to_dataframe(self):
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)

with tm.assertRaisesRegexp(ValueError, "return_type must be"):
with tm.assertRaisesRegexp(ValueError, "expand must be"):
s.str.split('_', return_type="some_invalid_type")

def test_split_to_dataframe_expand(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)

s = Series(['some_equal_splits', 'with_no_nans'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
2: ['splits', 'nans']})
tm.assert_frame_equal(result, exp)

s = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'one'], 1: ['unequal', 'of'],
2: ['splits', 'these'], 3: [NA, 'things'],
4: [NA, 'is'], 5: [NA, 'not']})
tm.assert_frame_equal(result, exp)

s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
result = s.str.split('_', expand=True)
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)

with tm.assertRaisesRegexp(ValueError, "expand must be"):
s.str.split('_', return_type="some_invalid_type")

def test_split_to_multiindex_expand(self):
idx = Index(['nosplit', 'alsonosplit'])
result = idx.str.split('_', expand=True)
exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
tm.assert_index_equal(result, exp)
self.assertEqual(result.nlevels, 1)

idx = Index(['some_equal_splits', 'with_no_nans'])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
('with', 'no', 'nans')])
tm.assert_index_equal(result, exp)
self.assertEqual(result.nlevels, 3)

idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA),
('one', 'of', 'these', 'things', 'is', 'not')])
tm.assert_index_equal(result, exp)
self.assertEqual(result.nlevels, 6)

with tm.assertRaisesRegexp(ValueError, "expand must be"):
idx.str.split('_', return_type="some_invalid_type")

def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

Expand Down