From 20f65126e0de65876bf412fa4280d8725afe2260 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Sat, 25 Nov 2017 16:51:13 -0500
Subject: [PATCH] Propagating NaN values when using str.split (#18450) (#18462)

---
 doc/source/whatsnew/v0.21.1.txt |  6 +++++-
 pandas/core/strings.py          |  4 ++++
 pandas/tests/test_strings.py    | 12 ++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 976f3524e3c71..f8274bda546f7 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -138,9 +138,13 @@ Categorical
 - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`)
 - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`)
 
+String
+^^^^^^
+
+- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`)
+
 Other
 ^^^^^
 
 -
 -
--
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index abef6f6086dbd..9614641aa1abf 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1423,6 +1423,10 @@ def cons_row(x):
                     return [x]
 
             result = [cons_row(x) for x in result]
+            if result:
+                # propagate nan values to match longest sequence (GH 18450)
+                max_len = max(len(x) for x in result)
+                result = [x * max_len if x[0] is np.nan else x for x in result]
 
         if not isinstance(expand, bool):
             raise ValueError("expand must be True or False")
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f1b97081b6d93..8aa69bcbfdf7f 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -2086,6 +2086,18 @@ def test_rsplit_to_multiindex_expand(self):
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 2
 
+    def test_split_nan_expand(self):
+        # gh-18450
+        s = Series(["foo,bar,baz", NA])
+        result = s.str.split(",", expand=True)
+        exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # check that these are actually np.nan and not None
+        # TODO see GH 18463
+        # tm.assert_frame_equal does not differentiate
+        assert all(np.isnan(x) for x in result.iloc[1])
+
     def test_split_with_name(self):
         # GH 12617
 