From d70d4f046d8ac01fe02346f75f4e0f54aec790ff Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 4 Aug 2023 18:03:50 -0700 Subject: [PATCH 1/4] DataFrame with namedtuples uses ._fields as column names --- python/cudf/cudf/core/dataframe.py | 24 ++++++++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 14 +++++++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d421258b06b..cc04d080e48 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -708,7 +708,10 @@ def __init__( ) else: self._init_from_list_like( - data, index=index, columns=columns + data, + index=index, + columns=columns, + nan_as_null=nan_as_null, ) self._check_data_index_length_match() else: @@ -827,7 +830,9 @@ def _init_from_series_list(self, data, columns, index): self._data = self._data.select_by_label(columns) @_cudf_nvtx_annotate - def _init_from_list_like(self, data, index=None, columns=None): + def _init_from_list_like( + self, data, index=None, columns=None, nan_as_null=None + ): if index is None: index = RangeIndex(start=0, stop=len(data)) else: @@ -842,6 +847,21 @@ def _init_from_list_like(self, data, index=None, columns=None): elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data + # namedtuple in a list + elif ( + len(data) > 0 + and columns is None + and isinstance(data[0], tuple) + and hasattr(data[0], "_fields") + ): + # pandas behavior is to use the fields from the first tuple + # as the column names + columns = data[0]._fields + values = itertools.zip_longest(*data) + data = dict(zip(columns, values)) + self._init_from_dict_like( + data, index=index, nan_as_null=nan_as_null + ) else: if any( not isinstance(col, (abc.Iterable, abc.Sequence)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 
0898cb2ef3d..3b87aa2812c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10,7 +10,7 @@ import string import textwrap import warnings -from collections import OrderedDict, defaultdict +from collections import OrderedDict, defaultdict, namedtuple from copy import copy import cupy @@ -10261,3 +10261,15 @@ def __getitem__(self, key): with pytest.raises(TypeError): cudf.DataFrame({"a": A()}) + + +def test_dataframe_constructor_from_namedtuple(): + Point1 = namedtuple("Point1", ["a", "b", "c"]) + Point2 = namedtuple("Point1", ["x", "y"]) + + data = [Point1(1, 2, 3), Point2(4, 5)] + idx = ["a", "b"] + gdf = cudf.DataFrame(data, index=idx) + pdf = pd.DataFrame(data, index=idx) + + assert_eq(gdf, pdf) From 4476ab5e023ecbf9c33ba6157eb49cfcca04d71d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 8 Aug 2023 18:07:35 -0700 Subject: [PATCH 2/4] Simplify code path, add test for raising --- python/cudf/cudf/core/dataframe.py | 24 +++++++++--------------- python/cudf/cudf/tests/test_dataframe.py | 6 ++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a778793e3f3..97539e240e5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -847,27 +847,21 @@ def _init_from_list_like( elif len(data) > 0 and isinstance(data[0], pd._libs.interval.Interval): data = DataFrame.from_pandas(pd.DataFrame(data)) self._data = data._data - # namedtuple in a list - elif ( - len(data) > 0 - and columns is None - and isinstance(data[0], tuple) - and hasattr(data[0], "_fields") - ): - # pandas behavior is to use the fields from the first tuple - # as the column names - columns = data[0]._fields - values = itertools.zip_longest(*data) - data = dict(zip(columns, values)) - self._init_from_dict_like( - data, index=index, nan_as_null=nan_as_null - ) else: if any( not isinstance(col, (abc.Iterable, abc.Sequence)) 
for col in data ): raise TypeError("Inputs should be an iterable or sequence.") + if ( + len(data) > 0 + and columns is None + and isinstance(data[0], tuple) + and hasattr(data[0], "_fields") + ): + # pandas behavior is to use the fields from the first + # namedtuple as the column names + columns = data[0]._fields data = list(itertools.zip_longest(*data)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3b87aa2812c..97e399a9cd5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10273,3 +10273,9 @@ def test_dataframe_constructor_from_namedtuple(): pdf = pd.DataFrame(data, index=idx) assert_eq(gdf, pdf) + + data = [Point2(4, 5), Point1(1, 2, 3)] + with pytest.raises(ValueError): + cudf.DataFrame(data, index=idx) + with pytest.raises(ValueError): + pd.DataFrame(data, index=idx) From d4feb46392805501bca5783bcc25a2ee9676e239 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 8 Aug 2023 18:14:23 -0700 Subject: [PATCH 3/4] Remove unused keyword --- python/cudf/cudf/core/dataframe.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 97539e240e5..fa12c733d89 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -711,7 +711,6 @@ def __init__( data, index=index, columns=columns, - nan_as_null=nan_as_null, ) self._check_data_index_length_match() else: @@ -830,9 +829,7 @@ def _init_from_series_list(self, data, columns, index): self._data = self._data.select_by_label(columns) @_cudf_nvtx_annotate - def _init_from_list_like( - self, data, index=None, columns=None, nan_as_null=None - ): + def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) else: From 414d7a1cff5d0cb7a5d887f2d32fd722c3a5d7ad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 8 Aug 2023 
18:15:07 -0700 Subject: [PATCH 4/4] Reduce diff further --- python/cudf/cudf/core/dataframe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fa12c733d89..281c96a8c54 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -708,9 +708,7 @@ def __init__( ) else: self._init_from_list_like( - data, - index=index, - columns=columns, + data, index=index, columns=columns ) self._check_data_index_length_match() else: