Skip to content

Commit

Permalink
REF: Make DataFrame.from_pandas process by column (#14483)
Browse files Browse the repository at this point in the history
Also encountered a bug where `cudf.Index.from_pandas` would return a `cudf.Index[int64]` from a `pandas.RangeIndex`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14483
  • Loading branch information
mroeschke authored Nov 28, 2023
1 parent 854f4e4 commit 5e58e71
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 38 deletions.
16 changes: 12 additions & 4 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1873,10 +1873,18 @@ def from_pandas(cls, index, nan_as_null=no_default):

if not isinstance(index, pd.Index):
raise TypeError("not a pandas.Index")

ind = cudf.Index(column.as_column(index, nan_as_null=nan_as_null))
ind.name = index.name
return ind
if isinstance(index, pd.RangeIndex):
return cudf.RangeIndex(
start=index.start,
stop=index.stop,
step=index.step,
name=index.name,
)
else:
return cudf.Index(
column.as_column(index, nan_as_null=nan_as_null),
name=index.name,
)

@property
def _constructor_expanddim(self):
Expand Down
55 changes: 21 additions & 34 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5245,30 +5245,20 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
if not dataframe.columns.is_unique:
raise ValueError("Duplicate column names are not allowed")

# Set columns
data = {}
for col_name, col_value in dataframe.items():
# necessary because multi-index can return multiple
# columns for a single key
if len(col_value.shape) == 1:
data[col_name] = column.as_column(
col_value.array, nan_as_null=nan_as_null
)
else:
vals = col_value.values.T
if vals.shape[0] == 1:
data[col_name] = column.as_column(
vals.flatten(), nan_as_null=nan_as_null
)
else:
if isinstance(col_name, tuple):
col_name = str(col_name)
for idx in range(len(vals.shape)):
data[col_name] = column.as_column(
vals[idx], nan_as_null=nan_as_null
)

index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null)
data = {
col_name: column.as_column(
col_value.array, nan_as_null=nan_as_null
)
for col_name, col_value in dataframe.items()
}
if isinstance(dataframe.index, pd.MultiIndex):
index = cudf.MultiIndex.from_pandas(
dataframe.index, nan_as_null=nan_as_null
)
else:
index = cudf.Index.from_pandas(
dataframe.index, nan_as_null=nan_as_null
)
df = cls._from_data(data, index)
df._data._level_names = tuple(dataframe.columns.names)

Expand All @@ -5279,13 +5269,14 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
df.columns = dataframe.columns

return df
elif hasattr(dataframe, "__dataframe__"):
# TODO: Probably should be handled in the constructor as
# this isn't pandas specific
return from_dataframe(dataframe, allow_copy=True)
else:
try:
return from_dataframe(dataframe, allow_copy=True)
except Exception:
raise TypeError(
f"Could not construct DataFrame from {type(dataframe)}"
)
raise TypeError(
f"Could not construct DataFrame from {type(dataframe)}"
)

@classmethod
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -7915,10 +7906,6 @@ def from_pandas(obj, nan_as_null=no_default):
return ret
elif isinstance(obj, pd.MultiIndex):
return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null)
elif isinstance(obj, pd.RangeIndex):
return cudf.core.index.RangeIndex(
start=obj.start, stop=obj.stop, step=obj.step, name=obj.name
)
elif isinstance(obj, pd.Index):
return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null)
elif isinstance(obj, pd.CategoricalDtype):
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2954,6 +2954,13 @@ def test_index_getitem_from_nonint_raises(idx):
cudf.Index([1, 2])[idx]


def test_from_pandas_rangeindex_return_rangeindex():
    """Check ``cudf.Index.from_pandas`` on a ``pandas.RangeIndex``.

    The converted index is compared against a ``cudf.RangeIndex`` built
    from the same start/stop/step/name, using ``assert_eq`` with
    ``exact=True``.
    """
    # Same constructor arguments feed both the pandas input and the
    # expected cudf result.
    range_kwargs = {"start": 3, "stop": 9, "step": 3, "name": "a"}
    pandas_range = pd.RangeIndex(**range_kwargs)
    converted = cudf.Index.from_pandas(pandas_range)
    wanted = cudf.RangeIndex(**range_kwargs)
    assert_eq(converted, wanted, exact=True)


@pytest.mark.parametrize(
"idx",
[
Expand Down

0 comments on commit 5e58e71

Please sign in to comment.