pandas-dev · changhiskhan · Dec 20, 2018 · Jun 27, 2019 · jreback · Dec 21, 2018
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
@@ -228,4 +228,22 @@ def time_qcut_datetime(self, bins):
         pd.qcut(self.datetime_series, bins)
 
 
+class Explode(object):
+    """Time ``DataFrame.explode`` with ``sep`` on comma-joined letter keys."""
+    params = [[100, 1000, 10000], [3, 5, 10]]
+    param_names = ['n_rows', 'max_list_length']
+
+    def setup(self, n_rows, max_list_length):
+        import string
+        letters = list(string.ascii_letters)
+        counts = np.random.randint(0, max_list_length, n_rows)
+        keys = [','.join([np.random.choice(letters) for _ in range(count)])
+                for count in counts]
+        self.frame = pd.DataFrame({'key': keys,
+                                   'value': np.random.randn(n_rows)})
+
+    def time_explode(self, n_rows, max_list_length):
+        self.frame.explode('key', sep=',')
+
+
 from .pandas_vb_common import setup  # noqa: F401
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
@@ -801,3 +801,34 @@ Note to subdivide over multiple columns we can pass in a list to the
 
    df.pivot_table(
        values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])
+
+.. _reshaping.explode:
+
+Exploding a List-like Column
+----------------------------
+
+Sometimes the value column is list-like:
+
+.. ipython:: python
+
+   keys = ['panda1', 'panda2', 'panda3']
+   values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
+   df = pd.DataFrame({'keys': keys, 'values': values})
+   df
+
+But we actually want to put each value onto its own row.
+For this purpose we can use ``DataFrame.explode``:
+
+.. ipython:: python
+
+   df.explode('values')
+
+For convenience, we can use the optional keyword ``sep`` to automatically
+split a string column before exploding:
+
+.. ipython:: python
+
+   values = ['eats,shoots', 'shoots,leaves', 'eats,shoots,leaves']
+   df2 = pd.DataFrame({'keys': keys, 'values': values})
+   df2
+   df2.explode('values', sep=',')
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -15,7 +15,51 @@ This is a major release from 0.23.4 and includes a number of API changes, new
 features, enhancements, and performance improvements along with a large number
 of bug fixes.
 
+<<<<<<< HEAD
+These are the changes in pandas 0.24.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+.. _whatsnew_0240.enhancements:
+
+New features
+~~~~~~~~~~~~
+- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`)
+- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
+- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. See :ref:`Splitting an object into groups <groupby.split>` for more information (:issue:`15475`, :issue:`15506`).
+- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing
+  the user to override the engine's default behavior to include or omit the
+  dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
+- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`)
+- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`)
+- :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
+- :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
+  See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
+- Added :func:`DataFrame.explode` to split list-like values onto individual rows. See the :ref:`section on exploding a list-like column <reshaping.explode>` in the docs for more information (:issue:`16538`)
+
+.. _whatsnew_0240.values_api:
+
+Accessing the values in a Series or Index
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a
+``Series`` or ``Index``. (:issue:`19954`, :issue:`23623`)
+
+.. ipython:: python
+
+   idx = pd.period_range('2000', periods=4)
+   idx.array
+   pd.Series(idx).array
+
+Historically, this would have been done with ``series.values``, but with
+``.values`` it was unclear whether the returned value would be the actual array,
+some transformation of it, or one of pandas custom arrays (like
+``Categorical``). For example, with :class:`PeriodIndex`, ``.values`` generates
+a new ndarray of period objects each time.
+
+.. ipython:: python
+=======
 Highlights include:
+>>>>>>> master
 
 * :ref:`Optional Integer NA Support <whatsnew_0240.enhancements.intna>`
 * :ref:`New APIs for accessing the array backing a Series or Index <whatsnew_0240.values_api>`

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6165,6 +6165,57 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
                     var_name=var_name, value_name=value_name,
                     col_level=col_level)
 
+    def explode(self, col_name, sep=None, dtype=None):
+        """
+        Create new DataFrame expanding a list-like column.
+
+        Missing values and empty list-likes produce no rows in the result.
+
+        .. versionadded:: 0.24.0
+
+        Parameters
+        ----------
+        col_name : str
+            Name of the column to be exploded.
+        sep : str, default None
+            Convenience to split a string `col_name` before exploding.
+        dtype : str or dtype, default None
+            Optionally coerce the dtype of exploded column.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        Series.str.split: Split string values on specified separator.
+        Series.str.extract: Extract groups from the first regex match.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'k': ['a,b', 'c,d'], 'v': [0, 1]})
+        >>> df.explode('k', sep=',')
+           k  v
+        0  a  0
+        0  b  0
+        1  c  1
+        1  d  1
+        """
+        col = self[col_name]
+        if len(self) == 0:
+            return self.copy()
+        # Join by row position: a label join cross-joins duplicated labels.
+        col = col.reset_index(drop=True)
+        col_expanded = (col.str.split(sep, expand=True) if sep
+                        else col.apply(Series))
+        stacked = col_expanded.stack().reset_index(level=-1, drop=True)
+        if dtype:
+            stacked = stacked.astype(dtype)
+        others = self.drop(col_name, axis=1).reset_index(drop=True)
+        result = stacked.to_frame(col_name).join(others)
+        result.index = self.index.take(result.index)
+        return result.reindex(self.columns, axis=1)
+
     # ----------------------------------------------------------------------
     # Time series-related
 

diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
@@ -900,6 +900,101 @@ def test_unstack_swaplevel_sortlevel(self, level):
         tm.assert_frame_equal(result, expected)
 
 
+class TestDataFrameExplode(object):
+    """Tests for ``DataFrame.explode`` (GH 16538)."""
+    columns = ['a', 'b', 'c']
+
+    def test_sep(self):
+        """``sep`` splits the string column before exploding."""
+        frame = pd.DataFrame([['foo,bar', 'x', 42],
+                              ['fizz,buzz', 'y', 43]],
+                             columns=self.columns)
+        result = frame.explode('a', sep=',')
+        expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
+                                 'b': ['x', 'x', 'y', 'y'],
+                                 'c': [42, 42, 43, 43]},
+                                index=[0, 0, 1, 1])
+        tm.assert_frame_equal(result, expected)
+
+    def test_dtype(self):
+        """``dtype`` coerces the exploded column."""
+        frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
+                              [[2, 3], 'y', 43]],
+                             columns=self.columns)
+        result = frame.explode('a', dtype='int')
+        expected = pd.DataFrame({'a': np.array([0, 1, 4, 2, 3], dtype='int'),
+                                 'b': ['x', 'x', 'x', 'y', 'y'],
+                                 'c': [42, 42, 42, 43, 43]},
+                                index=[0, 0, 0, 1, 1])
+        tm.assert_frame_equal(result, expected)
+
+    def test_na(self):
+        """NaN's and empty lists are omitted from the result."""
+        # TODO: option to preserve explicit NAs instead
+        frame = pd.DataFrame([[[], 'x', 42],
+                              [[2.0, np.nan], 'y', 43]],
+                             columns=self.columns)
+        result = frame.explode('a')
+        expected = pd.DataFrame({'a': [2.0],
+                                 'b': ['y'],
+                                 'c': [43]},
+                                index=[1])
+        tm.assert_frame_equal(result, expected)
+
+    def test_nonuniform_type(self):
+        """A column mixing lists and scalars still explodes."""
+        frame = pd.DataFrame([[[0, 1, 4], 'x', 42],
+                              [3, 'y', 43]],
+                             columns=self.columns)
+        result = frame.explode('a', dtype='int')
+        expected = pd.DataFrame({'a': np.array([0, 1, 4, 3], dtype='int'),
+                                 'b': ['x', 'x', 'x', 'y'],
+                                 'c': [42, 42, 42, 43]},
+                                index=[0, 0, 0, 1])
+        tm.assert_frame_equal(result, expected)
+
+    def test_all_scalars(self):
+        """A column holding only scalars is returned row for row."""
+        frame = pd.DataFrame([[0, 'x', 42],
+                              [3, 'y', 43]],
+                             columns=self.columns)
+        result = frame.explode('a')
+        expected = pd.DataFrame({'a': [0, 3],
+                                 'b': ['x', 'y'],
+                                 'c': [42, 43]},
+                                index=[0, 1])
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty(self):
+        """An empty frame comes back unchanged."""
+        expected = pd.DataFrame(columns=['a', 'b'])
+        result = pd.DataFrame(columns=['a', 'b']).explode('a')
+        tm.assert_frame_equal(result, expected)
+
+    def test_missing_column(self):
+        """An unknown column name raises ``KeyError``."""
+        frame = pd.DataFrame([[0, 'x', 42], [3, 'y', 43]],
+                             columns=self.columns)
+        with pytest.raises(KeyError):
+            frame.explode('badcolumnname')
+
+    def test_multi_index(self):
+        """MultiIndex labels are repeated for every exploded row."""
+        index = pd.MultiIndex.from_tuples([(0, 'a'), (1, 'b')])
+        frame = pd.DataFrame([['foo,bar', 'x', 42],
+                              ['fizz,buzz', 'y', 43]],
+                             columns=self.columns,
+                             index=index)
+        result = frame.explode('a', sep=',')
+        expected_index = pd.MultiIndex.from_tuples(
+            [(0, 'a'), (0, 'a'), (1, 'b'), (1, 'b')])
+        expected = pd.DataFrame({'a': ['foo', 'bar', 'fizz', 'buzz'],
+                                 'b': ['x', 'x', 'y', 'y'],
+                                 'c': [42, 42, 43, 43]},
+                                index=expected_index)
+        tm.assert_frame_equal(result, expected)
+
+
 def test_unstack_fill_frame_object():
     # GH12815 Test unstacking with object.
     data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')