Skip to content

Commit

Permalink
BUG: Index.str.partition not nan-safe (#23558) (#23618)
Browse files Browse the repository at this point in the history
  • Loading branch information
meiermark authored and jreback committed Nov 18, 2018
1 parent e2c4f04 commit 91d1c50
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 46 deletions.
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1270,8 +1270,8 @@ Numeric
Strings
^^^^^^^

-
-
- Bug where :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
- Bug where :meth:`Index.str.split` was not nan-safe (:issue:`23677`).
-

Interval
Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2273,7 +2273,7 @@ def to_object_array_tuples(rows: list):

k = 0
for i in range(n):
tmp = len(rows[i])
tmp = 1 if checknull(rows[i]) else len(rows[i])
if tmp > k:
k = tmp

Expand All @@ -2287,7 +2287,7 @@ def to_object_array_tuples(rows: list):
except Exception:
# upcast any subclasses to tuple
for i in range(n):
row = tuple(rows[i])
row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
for j in range(len(row)):
result[i, j] = row[j]

Expand Down
107 changes: 65 additions & 42 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2330,24 +2330,35 @@ def test_split_to_dataframe(self):
s.str.split('_', expand="not_a_boolean")

def test_split_to_multiindex_expand(self):
idx = Index(['nosplit', 'alsonosplit'])
# https://github.com/pandas-dev/pandas/issues/23677

idx = Index(['nosplit', 'alsonosplit', np.nan])
result = idx.str.split('_', expand=True)
exp = idx
tm.assert_index_equal(result, exp)
assert result.nlevels == 1

idx = Index(['some_equal_splits', 'with_no_nans'])
idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
'with', 'no', 'nans')])
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
('with', 'no', 'nans'),
[np.nan, np.nan, np.nan],
[None, None, None]])
tm.assert_index_equal(result, exp)
assert result.nlevels == 3

idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
idx = Index(['some_unequal_splits',
'one_of_these_things_is_not',
np.nan, None])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA
), ('one', 'of', 'these', 'things',
'is', 'not')])
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits',
NA, NA, NA),
('one', 'of', 'these',
'things', 'is', 'not'),
(np.nan, np.nan, np.nan,
np.nan, np.nan, np.nan),
(None, None, None,
None, None, None)])
tm.assert_index_equal(result, exp)
assert result.nlevels == 6

Expand Down Expand Up @@ -2441,50 +2452,54 @@ def test_split_with_name(self):
tm.assert_index_equal(res, exp)

def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
# https://github.com/pandas-dev/pandas/issues/23558

values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])

result = values.str.partition('_', expand=False)
exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
('f', '_', 'g_h')])
('f', '_', 'g_h'), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition('_', expand=False)
exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
('f_g', '_', 'h')])
('f_g', '_', 'h'), None])
tm.assert_series_equal(result, exp)

# more than one char
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None])
result = values.str.partition('__', expand=False)
exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
('f', '__', 'g__h')])
('f', '__', 'g__h'), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition('__', expand=False)
exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
('f__g', '__', 'h')])
('f__g', '__', 'h'), None])
tm.assert_series_equal(result, exp)

# None
values = Series(['a b c', 'c d e', NA, 'f g h'])
values = Series(['a b c', 'c d e', NA, 'f g h', None])
result = values.str.partition(expand=False)
exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
('f', ' ', 'g h')])
('f', ' ', 'g h'), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition(expand=False)
exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
('f g', ' ', 'h')])
('f g', ' ', 'h'), None])
tm.assert_series_equal(result, exp)

# Not splited
values = Series(['abc', 'cde', NA, 'fgh'])
# Not split
values = Series(['abc', 'cde', NA, 'fgh', None])
result = values.str.partition('_', expand=False)
exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')])
exp = Series([('abc', '', ''), ('cde', '', ''), NA,
('fgh', '', ''), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition('_', expand=False)
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')])
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA,
('', '', 'fgh'), None])
tm.assert_series_equal(result, exp)

# unicode
Expand All @@ -2508,57 +2523,65 @@ def test_partition_series(self):
assert result == [v.rpartition('_') for v in values]

def test_partition_index(self):
values = Index(['a_b_c', 'c_d_e', 'f_g_h'])
# https://github.com/pandas-dev/pandas/issues/23558

values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None])

result = values.str.partition('_', expand=False)
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_',
'g_h')]))
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
('f', '_', 'g_h'), np.nan, None]))
tm.assert_index_equal(result, exp)
assert result.nlevels == 1

result = values.str.rpartition('_', expand=False)
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), (
'f_g', '_', 'h')]))
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
('f_g', '_', 'h'), np.nan, None]))
tm.assert_index_equal(result, exp)
assert result.nlevels == 1

result = values.str.partition('_')
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
('f', '_', 'g_h'), (np.nan, np.nan, np.nan),
(None, None, None)])
tm.assert_index_equal(result, exp)
assert isinstance(result, MultiIndex)
assert result.nlevels == 3

result = values.str.rpartition('_')
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
('f_g', '_', 'h'), (np.nan, np.nan, np.nan),
(None, None, None)])
tm.assert_index_equal(result, exp)
assert isinstance(result, MultiIndex)
assert result.nlevels == 3

def test_partition_to_dataframe(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
# https://github.com/pandas-dev/pandas/issues/23558

values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
result = values.str.partition('_')
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
1: ['_', '_', np.nan, '_'],
2: ['b_c', 'd_e', np.nan, 'g_h']})
exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
1: ['_', '_', np.nan, '_', None],
2: ['b_c', 'd_e', np.nan, 'g_h', None]})
tm.assert_frame_equal(result, exp)

result = values.str.rpartition('_')
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
1: ['_', '_', np.nan, '_'],
2: ['c', 'e', np.nan, 'h']})
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
1: ['_', '_', np.nan, '_', None],
2: ['c', 'e', np.nan, 'h', None]})
tm.assert_frame_equal(result, exp)

values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
result = values.str.partition('_', expand=True)
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
1: ['_', '_', np.nan, '_'],
2: ['b_c', 'd_e', np.nan, 'g_h']})
exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
1: ['_', '_', np.nan, '_', None],
2: ['b_c', 'd_e', np.nan, 'g_h', None]})
tm.assert_frame_equal(result, exp)

result = values.str.rpartition('_', expand=True)
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
1: ['_', '_', np.nan, '_'],
2: ['c', 'e', np.nan, 'h']})
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
1: ['_', '_', np.nan, '_', None],
2: ['c', 'e', np.nan, 'h', None]})
tm.assert_frame_equal(result, exp)

def test_partition_with_name(self):
Expand Down

0 comments on commit 91d1c50

Please sign in to comment.