From 5e58e71836fd69ead04fbed5fdccb5e2e2c4d95c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Nov 2023 16:25:19 -1000 Subject: [PATCH] REF: Make DataFrame.from_pandas process by column (#14483) Also fixes a bug where `cudf.Index.from_pandas` would return a `cudf.Index[int64]` from a `pandas.RangeIndex` instead of a `cudf.RangeIndex` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14483 --- python/cudf/cudf/core/_base_index.py | 16 ++++++-- python/cudf/cudf/core/dataframe.py | 55 +++++++++++----------------- python/cudf/cudf/tests/test_index.py | 7 ++++ 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8387ef96dfa..fcfe8a21f05 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1873,10 +1873,18 @@ def from_pandas(cls, index, nan_as_null=no_default): if not isinstance(index, pd.Index): raise TypeError("not a pandas.Index") - - ind = cudf.Index(column.as_column(index, nan_as_null=nan_as_null)) - ind.name = index.name - return ind + if isinstance(index, pd.RangeIndex): + return cudf.RangeIndex( + start=index.start, + stop=index.stop, + step=index.step, + name=index.name, + ) + else: + return cudf.Index( + column.as_column(index, nan_as_null=nan_as_null), + name=index.name, + ) @property def _constructor_expanddim(self): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 785f3d98712..4a31866a940 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5245,30 +5245,20 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): if not dataframe.columns.is_unique: raise ValueError("Duplicate column names are not allowed") - # Set columns - data = {} - for col_name, col_value in dataframe.items(): - # necessary 
because multi-index can return multiple - # columns for a single key - if len(col_value.shape) == 1: - data[col_name] = column.as_column( - col_value.array, nan_as_null=nan_as_null - ) - else: - vals = col_value.values.T - if vals.shape[0] == 1: - data[col_name] = column.as_column( - vals.flatten(), nan_as_null=nan_as_null - ) - else: - if isinstance(col_name, tuple): - col_name = str(col_name) - for idx in range(len(vals.shape)): - data[col_name] = column.as_column( - vals[idx], nan_as_null=nan_as_null - ) - - index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) + data = { + col_name: column.as_column( + col_value.array, nan_as_null=nan_as_null + ) + for col_name, col_value in dataframe.items() + } + if isinstance(dataframe.index, pd.MultiIndex): + index = cudf.MultiIndex.from_pandas( + dataframe.index, nan_as_null=nan_as_null + ) + else: + index = cudf.Index.from_pandas( + dataframe.index, nan_as_null=nan_as_null + ) df = cls._from_data(data, index) df._data._level_names = tuple(dataframe.columns.names) @@ -5279,13 +5269,14 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): df.columns = dataframe.columns return df + elif hasattr(dataframe, "__dataframe__"): + # TODO: Probably should be handled in the constructor as + # this isn't pandas specific + return from_dataframe(dataframe, allow_copy=True) else: - try: - return from_dataframe(dataframe, allow_copy=True) - except Exception: - raise TypeError( - f"Could not construct DataFrame from {type(dataframe)}" - ) + raise TypeError( + f"Could not construct DataFrame from {type(dataframe)}" + ) @classmethod @_cudf_nvtx_annotate @@ -7915,10 +7906,6 @@ def from_pandas(obj, nan_as_null=no_default): return ret elif isinstance(obj, pd.MultiIndex): return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) - elif isinstance(obj, pd.RangeIndex): - return cudf.core.index.RangeIndex( - start=obj.start, stop=obj.stop, step=obj.step, name=obj.name - ) elif isinstance(obj, pd.Index): return 
cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.CategoricalDtype): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c393522c28b..7b859fefe9f 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2954,6 +2954,13 @@ def test_index_getitem_from_nonint_raises(idx): cudf.Index([1, 2])[idx] +def test_from_pandas_rangeindex_return_rangeindex(): + pidx = pd.RangeIndex(start=3, stop=9, step=3, name="a") + result = cudf.Index.from_pandas(pidx) + expected = cudf.RangeIndex(start=3, stop=9, step=3, name="a") + assert_eq(result, expected, exact=True) + + @pytest.mark.parametrize( "idx", [