From 24cd841837cca186f81059273c6e778549d7a4c2 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Sun, 18 Nov 2018 19:32:50 +0100 Subject: [PATCH] BUG: Index.str.partition not nan-safe (#23558) (#23618) --- doc/source/whatsnew/v0.24.0.rst | 4 +- pandas/_libs/lib.pyx | 4 +- pandas/tests/test_strings.py | 107 +++++++++++++++++++------------- 3 files changed, 69 insertions(+), 46 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index af07a65aeb3a90..69232fa8361021 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1270,8 +1270,8 @@ Numeric Strings ^^^^^^^ -- -- +- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). +- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). - Interval diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0088a698f49e07..e89c8fa5796879 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2273,7 +2273,7 @@ def to_object_array_tuples(rows: list): k = 0 for i in range(n): - tmp = len(rows[i]) + tmp = 1 if checknull(rows[i]) else len(rows[i]) if tmp > k: k = tmp @@ -2287,7 +2287,7 @@ def to_object_array_tuples(rows: list): except Exception: # upcast any subclasses to tuple for i in range(n): - row = tuple(rows[i]) + row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) for j in range(len(row)): result[i, j] = row[j] diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 7cd9182b4dff46..42f0cebea83a09 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2330,24 +2330,35 @@ def test_split_to_dataframe(self): s.str.split('_', expand="not_a_boolean") def test_split_to_multiindex_expand(self): - idx = Index(['nosplit', 'alsonosplit']) + # https://github.com/pandas-dev/pandas/issues/23677 + + idx = Index(['nosplit', 'alsonosplit', np.nan]) result = idx.str.split('_', expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 - idx = Index(['some_equal_splits', 'with_no_nans']) + idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None]) result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), ( - 'with', 'no', 'nans')]) + exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), + ('with', 'no', 'nans'), + [np.nan, np.nan, np.nan], + [None, None, None]]) tm.assert_index_equal(result, exp) assert result.nlevels == 3 - idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) + idx = Index(['some_unequal_splits', + 'one_of_these_things_is_not', + np.nan, None]) result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA - ), ('one', 'of', 'these', 'things', - 'is', 'not')]) + exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', + NA, NA, NA), + ('one', 'of', 'these', + 'things', 'is', 'not'), + (np.nan, np.nan, np.nan, + np.nan, np.nan, np.nan), + (None, None, None, + None, None, None)]) tm.assert_index_equal(result, exp) assert result.nlevels == 6 @@ -2441,50 +2452,54 @@ def test_split_with_name(self): tm.assert_index_equal(res, exp) def test_partition_series(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + # https://github.com/pandas-dev/pandas/issues/23558 + + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_', expand=False) exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, - ('f', '_', 'g_h')]) + ('f', '_', 'g_h'), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, - ('f_g', '_', 'h')]) + ('f_g', '_', 'h'), None]) tm.assert_series_equal(result, exp) # more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) + values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None]) result = values.str.partition('__', expand=False) exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, - ('f', '__', 'g__h')]) + ('f', '__', 'g__h'), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('__', expand=False) exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, - ('f__g', '__', 'h')]) + ('f__g', '__', 'h'), None]) tm.assert_series_equal(result, exp) # None - values = Series(['a b c', 'c d e', NA, 'f g h']) + values = Series(['a b c', 'c d e', NA, 'f g h', None]) result = values.str.partition(expand=False) exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, - ('f', ' ', 'g h')]) + ('f', ' ', 'g h'), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, - ('f g', ' ', 'h')]) + ('f g', ' ', 'h'), None]) tm.assert_series_equal(result, exp) - # Not splited - values = Series(['abc', 'cde', NA, 'fgh']) + # Not split + values = Series(['abc', 'cde', NA, 'fgh', None]) result = values.str.partition('_', expand=False) - exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')]) + exp = Series([('abc', '', ''), ('cde', '', ''), NA, + ('fgh', '', ''), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')]) + exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, + ('', '', 'fgh'), None]) tm.assert_series_equal(result, exp) # unicode @@ -2508,57 +2523,65 @@ def test_partition_series(self): assert result == [v.rpartition('_') for v in values] def test_partition_index(self): - values = Index(['a_b_c', 'c_d_e', 'f_g_h']) + # https://github.com/pandas-dev/pandas/issues/23558 + + values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None]) result = values.str.partition('_', expand=False) - exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', - 'g_h')])) + exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), + ('f', '_', 'g_h'), np.nan, None])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.rpartition('_', expand=False) - exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), ( - 'f_g', '_', 'h')])) + exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), + ('f_g', '_', 'h'), np.nan, None])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.partition('_') - exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]) + exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), + ('f', '_', 'g_h'), (np.nan, np.nan, np.nan), + (None, None, None)]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 result = values.str.rpartition('_') - exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]) + exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), + ('f_g', '_', 'h'), (np.nan, np.nan, np.nan), + (None, None, None)]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 def test_partition_to_dataframe(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + # https://github.com/pandas-dev/pandas/issues/23558 + + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_') - exp = DataFrame({0: ['a', 'c', np.nan, 'f'], - 1: ['_', '_', np.nan, '_'], - 2: ['b_c', 'd_e', np.nan, 'g_h']}) + exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) tm.assert_frame_equal(result, exp) result = values.str.rpartition('_') - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], - 1: ['_', '_', np.nan, '_'], - 2: ['c', 'e', np.nan, 'h']}) + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['c', 'e', np.nan, 'h', None]}) tm.assert_frame_equal(result, exp) - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_', expand=True) - exp = DataFrame({0: ['a', 'c', np.nan, 'f'], - 1: ['_', '_', np.nan, '_'], - 2: ['b_c', 'd_e', np.nan, 'g_h']}) + exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) tm.assert_frame_equal(result, exp) result = values.str.rpartition('_', expand=True) - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], - 1: ['_', '_', np.nan, '_'], - 2: ['c', 'e', np.nan, 'h']}) + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['c', 'e', np.nan, 'h', None]}) tm.assert_frame_equal(result, exp) def test_partition_with_name(self):