From bd111e171db019e60c3ebcd8471710916a329103 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 15 Mar 2021 23:52:09 -0700 Subject: [PATCH 01/15] Passing tests --- python/cudf/cudf/_lib/cpp/lists/explode.pxd | 13 ++++++ python/cudf/cudf/_lib/lists.pyx | 25 ++++++++++- python/cudf/cudf/core/dataframe.py | 50 +++++++++++++++++++++ python/cudf/cudf/core/frame.py | 23 +++++++++- python/cudf/cudf/core/series.py | 42 +++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 28 ++++++++++++ python/cudf/cudf/tests/test_series.py | 21 +++++++++ 7 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/lists/explode.pxd diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd new file mode 100644 index 00000000000..cd2d44d2e42 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/explode.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type + +cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil: + cdef unique_ptr[table] explode_outer( + const table_view, + size_type explode_column_idx, + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index aba13580912..56b89e9244c 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -6,12 +6,19 @@ from libcpp.utility cimport move from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements ) +from cudf._lib.cpp.lists.explode cimport ( + explode_outer as cpp_explode_outer +) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column -from cudf._lib.column cimport Column +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type +from cudf._lib.column cimport Column +from cudf._lib.table cimport Table from cudf.core.dtypes import ListDtype @@ -32,3 +39,19 @@ def count_elements(Column col): result = Column.from_unique_ptr(move(c_result)) return result + + +def explode_outer(Table tbl, int explode_column_idx): + cdef table_view c_table_view = tbl.view() + cdef size_type c_explode_column_idx = explode_column_idx + + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) + + return Table.from_unique_ptr( + move(c_result), + column_names=tbl._column_names, + index_names=tbl._index_names + ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25f57748765..0a1a8d7ba0c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7709,6 +7709,56 @@ def equals(self, other): return False return super().equals(other) + def explode(self, column, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index + values. + + Parameters + ---------- + column : str or tuple + Column to explode. Now only supports one column + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + + Notes + ------- + In cudf, empty lists `[]` are mapped to nulls, as opposed to `nan` in + Pandas. + + Examples + ------- + >>> import cudf + >>> cudf.DataFrame( + {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]} + ) + a b + 0 [1, 2, 3] 11 + 1 [] 22 + 2 None 33 + 3 [4, 5] 44 + >>> df.explode('a') + a b + 0 1 11 + 0 2 11 + 0 3 11 + 1 22 + 2 33 + 3 4 44 + 3 5 44 + """ + if column not in self._column_names: + raise KeyError(column) + + explode_num = self._column_names.index(column) + return super()._explode( + explode_num, None if ignore_index else self.index + ) + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fab5936f94d..22c47213138 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,16 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, + Tuple, + TypeVar, + Union, + overload, +) import cupy import numpy as np @@ -573,6 +582,18 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + def _explode(self, explode_column_num: int, index: Optional[cudf.Index]): + if index is not None: + explode_column_num += index.nlevels + res_tbl = libcudf.lists.explode_outer( + cudf._lib.table.Table(self._data, index=index), explode_column_num + ) + + res = self.__class__._from_table(res_tbl) + if index is not None: + res.index.names = index.names + return res + def _get_columns_by_label(self, labels, downcast): """ Returns columns of the Frame specified by `labels` diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e7121c0488..4aaf2c0f94d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6364,6 +6364,48 @@ def keys(self): """ return self.index + def explode(self, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index + values. + + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + + Notes + ------- + In cudf, empty lists `[]` are mapped to nulls, as opposed to `nan` in + Pandas. + + Examples + ------- + >>> import cudf + >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]]) + >>> s + 0 [1, 2, 3] + 1 [] + 2 None + 3 [4, 5] + dtype: list + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 + 2 + 3 4 + 3 5 + dtype: int64 + """ + + return super()._explode(0, None if ignore_index else self.index) + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 77548b95277..e2e6c469949 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8442,3 +8442,31 @@ def test_rename_for_level_is_None_MC(): got = gdf.rename(columns={"a": "f"}, level=None) assert_eq(expect, got) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(ignore_index, p_index): + gdf = cudf.DataFrame( + { + "a": [[1, 2, 3], None, [4], [], [5, 6]], + "b": [11, 22, 33, 44, 55], + "c": ["a", "e", "i", "o", "u"], + }, + index=p_index, + ) + pdf = gdf.to_pandas(nullable=True) + + expect = pdf.explode("a", ignore_index).fillna(pd.NA) + got = gdf.explode("a", ignore_index) + + assert_eq(got, expect, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a1b4236719d..d8531657177 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1118,3 +1118,24 @@ def test_series_drop_raises(): actual = gs.drop("p", errors="ignore") assert_eq(actual, expect) + + +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(ignore_index, p_index): + gdf = cudf.Series([[1, 2, 3], None, [4], [], [5, 6]], index=p_index) + pdf = gdf.to_pandas(nullable=True) + + expect = pdf.explode(ignore_index) + got = gdf.explode(ignore_index) + + assert_eq(expect, got, check_dtype=False) From 2c543c6a4f9b9241efa25f7c7a57c7009dc6a95e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Mar 2021 00:45:24 -0700 Subject: [PATCH 02/15] Cleaning stale docstrings --- python/cudf/cudf/core/dataframe.py | 5 ----- python/cudf/cudf/core/series.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0a1a8d7ba0c..23030bcb95e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7725,11 +7725,6 @@ def explode(self, column, ignore_index=False): ------- DataFrame - Notes - ------- - In cudf, empty lists `[]` are mapped to nulls, as opposed to `nan` in - Pandas. - Examples ------- >>> import cudf diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4aaf2c0f94d..f93acab0818 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6378,11 +6378,6 @@ def explode(self, ignore_index=False): ------- DataFrame - Notes - ------- - In cudf, empty lists `[]` are mapped to nulls, as opposed to `nan` in - Pandas. - Examples ------- >>> import cudf From 6cc0feedc4e111f1bdd7ec7a854e6dfdcdaa8dae Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Mar 2021 23:00:00 -0700 Subject: [PATCH 03/15] remove fillna in test code --- python/cudf/cudf/tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e2e6c469949..d172dbaaaaa 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8466,7 +8466,7 @@ def test_explode(ignore_index, p_index): ) pdf = gdf.to_pandas(nullable=True) - expect = pdf.explode("a", ignore_index).fillna(pd.NA) + expect = pdf.explode("a", ignore_index) got = gdf.explode("a", ignore_index) assert_eq(got, expect, check_dtype=False) From 87683896a934f4f5c2110b01135fa2b6b93e7286 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Mar 2021 23:01:55 -0700 Subject: [PATCH 04/15] small doc fix --- python/cudf/cudf/core/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 23030bcb95e..24c138561da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7717,7 +7717,7 @@ def explode(self, column, ignore_index=False): Parameters ---------- column : str or tuple - Column to explode. Now only supports one column + Column to explode. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. @@ -7729,8 +7729,7 @@ def explode(self, column, ignore_index=False): ------- >>> import cudf >>> cudf.DataFrame( - {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]} - ) + {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) a b 0 [1, 2, 3] 11 1 [] 22 From 7aa72e44e6c6ed188a34e10ddcfbcfe05c5e4557 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 11:01:12 -0700 Subject: [PATCH 05/15] direct passing ignore_index --- python/cudf/cudf/core/dataframe.py | 4 +--- python/cudf/cudf/core/frame.py | 30 ++++++++++++------------------ python/cudf/cudf/core/series.py | 2 +- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 24c138561da..778bb45437f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7749,9 +7749,7 @@ def explode(self, column, ignore_index=False): raise KeyError(column) explode_num = self._column_names.index(column) - return super()._explode( - explode_num, None if ignore_index else self.index - ) + return super()._explode(explode_num, ignore_index) _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 22c47213138..80532e9f0bd 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,16 +6,7 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Optional, - Tuple, - TypeVar, - Union, - overload, -) +from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload import cupy import numpy as np @@ -582,16 +573,19 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) - def _explode(self, explode_column_num: int, index: Optional[cudf.Index]): - if index is not None: - explode_column_num += index.nlevels - res_tbl = libcudf.lists.explode_outer( - cudf._lib.table.Table(self._data, index=index), explode_column_num - ) + def _explode(self, explode_column_num: int, ignore_index: bool): + if ignore_index: + tmp_index, self._index = self._index, None + elif self._index is not None: + explode_column_num += self._index.nlevels + res_tbl = libcudf.lists.explode_outer(self, explode_column_num) res = self.__class__._from_table(res_tbl) - if index is not None: - res.index.names = index.names + + if ignore_index: + self._index = tmp_index + elif self._index is not None: + res.index.names = self._index.names return res def _get_columns_by_label(self, labels, downcast): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f93acab0818..0abffb50f85 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6399,7 +6399,7 @@ def explode(self, ignore_index=False): dtype: int64 """ - return super()._explode(0, None if ignore_index else self.index) + return super()._explode(0, ignore_index) _accessors = set() # type: Set[Any] From 859f0f68ec97465f4a3b649b92fb5c42666f9444 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 14:29:28 -0700 Subject: [PATCH 06/15] handling no-op case --- python/cudf/cudf/core/dataframe.py | 3 +-- python/cudf/cudf/core/frame.py | 10 ++++++++- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 26 ++++++++++++++++-------- python/cudf/cudf/tests/test_series.py | 15 +++++++++++--- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 778bb45437f..e0c3c69bbea 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7748,8 +7748,7 @@ def explode(self, column, ignore_index=False): if column not in self._column_names: raise KeyError(column) - explode_num = self._column_names.index(column) - return super()._explode(explode_num, ignore_index) + return super()._explode(column, ignore_index) _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 80532e9f0bd..1795ceca2bc 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -23,6 +23,7 @@ from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, + is_list_dtype, is_numerical_dtype, is_scalar, min_scalar_type, @@ -573,7 +574,14 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) - def _explode(self, explode_column_num: int, ignore_index: bool): + def _explode(self, explode_column: Any, ignore_index: bool): + if not is_list_dtype(self._data[explode_column].dtype): + copy = self.copy() + if ignore_index: + copy._index = cudf.RangeIndex(copy._num_rows) + return copy + + explode_column_num = self._column_names.index(explode_column) if ignore_index: tmp_index, self._index = self._index, None elif self._index is not None: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0abffb50f85..ffba9267430 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6399,7 +6399,7 @@ def explode(self, ignore_index=False): dtype: int64 """ - return super()._explode(0, ignore_index) + return super()._explode(self._column_names[0], ignore_index) _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d172dbaaaaa..d965f997645 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8444,6 +8444,21 @@ def test_rename_for_level_is_None_MC(): assert_eq(expect, got) +@pytest.mark.parametrize( + "data", + [ + { + "a": [[1, 2, 3], None, [4], [], [5, 6]], + "b": [11, 22, 33, 44, 55], + "c": ["a", "e", "i", "o", "u"], + }, # nested + { + "a": [1, 2, 3, 4, 5], + "b": [11, 22, 33, 44, 55], + "c": ["a", "e", "i", "o", "u"], + }, # non-nested + ], +) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize( "p_index", @@ -8455,15 +8470,8 @@ def test_rename_for_level_is_None_MC(): ), ], ) -def test_explode(ignore_index, p_index): - gdf = cudf.DataFrame( - { - "a": [[1, 2, 3], None, [4], [], [5, 6]], - "b": [11, 22, 33, 44, 55], - "c": ["a", "e", "i", "o", "u"], - }, - index=p_index, - ) +def test_explode(data, ignore_index, p_index): + gdf = cudf.DataFrame(data, index=p_index,) pdf = gdf.to_pandas(nullable=True) expect = pdf.explode("a", ignore_index) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d8531657177..bbdeb55b9ee 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1120,6 +1120,10 @@ def test_series_drop_raises(): assert_eq(actual, expect) +@pytest.mark.parametrize( + "data", + [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], # non-nested +) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize( "p_index", @@ -1131,11 +1135,16 @@ def test_series_drop_raises(): ), ], ) -def test_explode(ignore_index, p_index): - gdf = cudf.Series([[1, 2, 3], None, [4], [], [5, 6]], index=p_index) +def test_explode(data, ignore_index, p_index): + gdf = cudf.Series(data, index=p_index) pdf = gdf.to_pandas(nullable=True) expect = pdf.explode(ignore_index) got = gdf.explode(ignore_index) - assert_eq(expect, got, check_dtype=False) + if data == [1, 2, 3, 4, 5] and ignore_index and p_index is not None: + # https://github.com/pandas-dev/pandas/issues/40487 + with pytest.raises(AssertionError, match="different"): + assert_eq(expect, got, check_dtype=False) + else: + assert_eq(expect, got, check_dtype=False) From fe085f6a0b0fa2b651ff2b96bc58523e9a91b431 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 16:06:01 -0700 Subject: [PATCH 07/15] Account for multi-level column names --- python/cudf/cudf/core/frame.py | 3 ++ python/cudf/cudf/tests/test_dataframe.py | 49 ++++++++++++++++-------- python/cudf/cudf/tests/test_series.py | 4 +- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1795ceca2bc..f34e375103b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -590,6 +590,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): res_tbl = libcudf.lists.explode_outer(self, explode_column_num) res = self.__class__._from_table(res_tbl) + res._data.multiindex = self._data.multiindex + res._data._level_names = self._data._level_names + if ignore_index: self._index = tmp_index elif self._index is not None: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d965f997645..9f426c1bc82 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8447,16 +8447,33 @@ def test_rename_for_level_is_None_MC(): @pytest.mark.parametrize( "data", [ - { - "a": [[1, 2, 3], None, [4], [], [5, 6]], - "b": [11, 22, 33, 44, 55], - "c": ["a", "e", "i", "o", "u"], - }, # nested - { - "a": [1, 2, 3, 4, 5], - "b": [11, 22, 33, 44, 55], - "c": ["a", "e", "i", "o", "u"], - }, # non-nested + [ + [[1, 2, 3], 11, "a"], + [None, 22, "e"], + [[4], 33, "i"], + [[], 44, "o"], + [[5, 6], 55, "u"], + ], # nested + [ + [1, 11, "a"], + [2, 22, "e"], + [3, 33, "i"], + [4, 44, "o"], + [5, 55, "u"], + ], # non-nested + ], +) +@pytest.mark.parametrize( + ("labels", "label_to_explode"), + [ + (None, 0), + (pd.Index(["a", "b", "c"]), "a"), + ( + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), + (0, "a"), + ), ], ) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -8470,11 +8487,11 @@ def test_rename_for_level_is_None_MC(): ), ], ) -def test_explode(data, ignore_index, p_index): - gdf = cudf.DataFrame(data, index=p_index,) - pdf = gdf.to_pandas(nullable=True) +def test_explode(data, labels, ignore_index, p_index, label_to_explode): + pdf = pd.DataFrame(data, index=p_index, columns=labels) + gdf = cudf.from_pandas(pdf) - expect = pdf.explode("a", ignore_index) - got = gdf.explode("a", ignore_index) + expect = pdf.explode(label_to_explode, ignore_index) + got = gdf.explode(label_to_explode, ignore_index) - assert_eq(got, expect, check_dtype=False) + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index bbdeb55b9ee..5c583fc58fc 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1136,8 +1136,8 @@ def test_series_drop_raises(): ], ) def test_explode(data, ignore_index, p_index): - gdf = cudf.Series(data, index=p_index) - pdf = gdf.to_pandas(nullable=True) + pdf = pd.Series(data, index=p_index, name="someseries") + gdf = cudf.from_pandas(pdf) expect = pdf.explode(ignore_index) got = gdf.explode(ignore_index) From ca419cc19d5bb86957587c959a9b09197e2520c9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 16:10:36 -0700 Subject: [PATCH 08/15] Doc for _explode --- python/cudf/cudf/core/frame.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f34e375103b..d3cf798af82 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -575,6 +575,12 @@ def equals(self, other, **kwargs): return self._index.equals(other._index) def _explode(self, explode_column: Any, ignore_index: bool): + """Helper function for `explode` in Series and Dataframe. + if the designated column to explode is non-nested, a copy + of the frame is returned. Otherwise, if ignore_index is + set, the original index is not exploded and will use + a `RangeIndex` instead. + """ if not is_list_dtype(self._data[explode_column].dtype): copy = self.copy() if ignore_index: From 9d7e4a12602c24bb5d050f33dffaa17f37b60412 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 21:14:23 -0700 Subject: [PATCH 09/15] Better handling of index column --- python/cudf/cudf/_lib/lists.pyx | 9 ++++++--- python/cudf/cudf/core/frame.py | 14 ++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 56b89e9244c..0f0ee35556a 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr, shared_ptr, make_shared from libcpp.utility cimport move @@ -41,8 +42,10 @@ def count_elements(Column col): return result -def explode_outer(Table tbl, int explode_column_idx): - cdef table_view c_table_view = tbl.view() +def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): + cdef table_view c_table_view = ( + tbl.data_view() if ignore_index else tbl.view() + ) cdef size_type c_explode_column_idx = explode_column_idx cdef unique_ptr[table] c_result @@ -53,5 +56,5 @@ def explode_outer(Table tbl, int explode_column_idx): return Table.from_unique_ptr( move(c_result), column_names=tbl._column_names, - index_names=tbl._index_names + index_names=None if ignore_index else tbl._index_names ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d3cf798af82..eaa27c2fd75 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -579,7 +579,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): if the designated column to explode is non-nested, a copy of the frame is returned. Otherwise, if ignore_index is set, the original index is not exploded and will use - a `RangeIndex` instead. + a `RangeIndex`. """ if not is_list_dtype(self._data[explode_column].dtype): copy = self.copy() @@ -588,20 +588,18 @@ def _explode(self, explode_column: Any, ignore_index: bool): return copy explode_column_num = self._column_names.index(explode_column) - if ignore_index: - tmp_index, self._index = self._index, None - elif self._index is not None: + if not ignore_index and self._index is not None: explode_column_num += self._index.nlevels - res_tbl = libcudf.lists.explode_outer(self, explode_column_num) + res_tbl = libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) res = self.__class__._from_table(res_tbl) res._data.multiindex = self._data.multiindex res._data._level_names = self._data._level_names - if ignore_index: - self._index = tmp_index - elif self._index is not None: + if not ignore_index and self._index is not None: res.index.names = self._index.names return res From c09f8151e440e80217b4be6440dda31a570a6005 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 21:39:39 -0700 Subject: [PATCH 10/15] Rev: avoid extra index copy when ignore_index=True --- python/cudf/cudf/core/frame.py | 39 +++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index eaa27c2fd75..fa10c406b56 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -581,27 +581,32 @@ def _explode(self, explode_column: Any, ignore_index: bool): set, the original index is not exploded and will use a `RangeIndex`. """ - if not is_list_dtype(self._data[explode_column].dtype): - copy = self.copy() - if ignore_index: - copy._index = cudf.RangeIndex(copy._num_rows) - return copy + if ( + isinstance(self, (cudf.Series, cudf.DataFrame)) + and self._index is not None + ): + if not is_list_dtype(self._data[explode_column].dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) - explode_column_num = self._column_names.index(explode_column) - if not ignore_index and self._index is not None: - explode_column_num += self._index.nlevels + explode_column_num = self._column_names.index(explode_column) + if not ignore_index: + explode_column_num += self._index.nlevels - res_tbl = libcudf.lists.explode_outer( - self, explode_column_num, ignore_index - ) - res = self.__class__._from_table(res_tbl) + res_tbl = libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) + res = self.__class__._from_table(res_tbl) - res._data.multiindex = self._data.multiindex - res._data._level_names = self._data._level_names + res._data.multiindex = self._data.multiindex + res._data._level_names = self._data._level_names - if not ignore_index and self._index is not None: - res.index.names = self._index.names - return res + if not ignore_index: + res.index.names = self._index.names + return res + else: + raise NotImplementedError("_explode is not implemented for Index.") def _get_columns_by_label(self, labels, downcast): """ From b33a6a04b78dc82a347d4acae38daa9590e24641 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 21:58:40 -0700 Subject: [PATCH 11/15] Remove stale comments --- python/cudf/cudf/tests/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 5c583fc58fc..23d348b1e68 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1122,7 +1122,7 @@ def test_series_drop_raises(): @pytest.mark.parametrize( "data", - [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], # non-nested + [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], ) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize( From 25409c1498453bf147f5e3f53cf8f674498df0e7 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 22:03:07 -0700 Subject: [PATCH 12/15] style --- python/cudf/cudf/tests/test_series.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 23d348b1e68..beda14934ca 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1121,8 +1121,7 @@ def test_series_drop_raises(): @pytest.mark.parametrize( - "data", - [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], + "data", [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], ) @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize( From 767d3e2d975e0ed7be92d2f50e8cf2039900fe4c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 23:02:01 -0700 Subject: [PATCH 13/15] Move copy case 1-level up --- python/cudf/cudf/core/dataframe.py | 5 +++++ python/cudf/cudf/core/frame.py | 36 ++++++++++-------------------- python/cudf/cudf/core/series.py | 4 ++++ 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e0c3c69bbea..86b4b3259a5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7748,6 +7748,11 @@ def explode(self, column, ignore_index=False): if column not in self._column_names: raise KeyError(column) + if not is_list_dtype(self._data[column].dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) + return super()._explode(column, ignore_index) _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fa10c406b56..e589eab3775 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -23,7 +23,6 @@ from cudf.utils.dtypes import ( is_categorical_dtype, is_column_like, - is_list_dtype, is_numerical_dtype, is_scalar, min_scalar_type, @@ -581,32 +580,21 @@ def _explode(self, explode_column: Any, ignore_index: bool): set, the original index is not exploded and will use a `RangeIndex`. """ - if ( - isinstance(self, (cudf.Series, cudf.DataFrame)) - and self._index is not None - ): - if not is_list_dtype(self._data[explode_column].dtype): - data = self._data.copy(deep=True) - idx = None if ignore_index else self._index.copy(deep=True) - return self.__class__._from_data(data, index=idx) + explode_column_num = self._column_names.index(explode_column) + if not ignore_index and self._index is not None: + explode_column_num += self._index.nlevels - explode_column_num = self._column_names.index(explode_column) - if not ignore_index: - explode_column_num += self._index.nlevels - - res_tbl = libcudf.lists.explode_outer( - self, explode_column_num, ignore_index - ) - res = self.__class__._from_table(res_tbl) + res_tbl = libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) + res = self.__class__._from_table(res_tbl) - res._data.multiindex = self._data.multiindex - res._data._level_names = self._data._level_names + res._data.multiindex = self._data.multiindex + res._data._level_names = self._data._level_names - if not ignore_index: - res.index.names = self._index.names - return res - else: - raise NotImplementedError("_explode is not implemented for Index.") + if not ignore_index and self._index is not None: + res.index.names = self._index.names + return res def _get_columns_by_label(self, labels, downcast): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ffba9267430..93a2260f338 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6398,6 +6398,10 @@ def explode(self, ignore_index=False): 3 5 dtype: int64 """ + if not is_list_dtype(self._column.dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) return super()._explode(self._column_names[0], ignore_index) From 7cb25e7957bce8dc7ef09323732b7b0a7466c5e3 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 18 Mar 2021 10:48:27 -0700 Subject: [PATCH 14/15] Rev: _explode doc fix --- python/cudf/cudf/core/frame.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e589eab3775..bfcc2d125db 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -574,11 +574,10 @@ def equals(self, other, **kwargs): return self._index.equals(other._index) def _explode(self, explode_column: Any, ignore_index: bool): - """Helper function for `explode` in Series and Dataframe. - if the designated column to explode is non-nested, a copy - of the frame is returned. Otherwise, if ignore_index is - set, the original index is not exploded and will use - a `RangeIndex`. + """Helper function for `explode` in `Series` and `Dataframe`, explodes + a specified nested column. Other columns' corresponding rows are + duplicated. If ignore_index is set, the original index is not exploded + and will be replaced with a `RangeIndex`. """ explode_column_num = self._column_names.index(explode_column) if not ignore_index and self._index is not None: From 6ce751aaf7691736de1c4dd397ad06d34fe4fcc4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 18 Mar 2021 10:49:25 -0700 Subject: [PATCH 15/15] Rev: small docstrings bug fix Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 86b4b3259a5..4414b9324d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7726,7 +7726,7 @@ def explode(self, column, ignore_index=False): DataFrame Examples - ------- + -------- >>> import cudf >>> cudf.DataFrame( {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 93a2260f338..0c356d33606 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6379,7 +6379,7 @@ def explode(self, ignore_index=False): DataFrame Examples - ------- + -------- >>> import cudf >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]]) >>> s