Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix importing list & struct types in from_arrow #7162

Merged
merged 9 commits into from
Jan 20, 2021
45 changes: 35 additions & 10 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import copy
import functools
import operator
Expand Down Expand Up @@ -2056,20 +2057,44 @@ def from_arrow(cls, data):
else:
result = cudf_category_frame

# In a scenario where column is of type list/other non
# pandas types, there will be no pandas metadata associated with
# given arrow table as those types can only originate from
# arrow.
# There are some special cases that need to be handled
# based on metadata.
if pandas_dtypes:
for name in result._data.names:
if pandas_dtypes[name] == "categorical":
dtype = None
if (
len(result._data[name]) == 0
and pandas_dtypes[name] == "categorical"
):
# When pandas_dtype is a categorical column and the size
# of column is 0 (i.e., empty) then we will have an
# int8 column in result._data[name] returned by libcudf,
# which needs to be type-casted to 'category' dtype.
dtype = "category"
elif pandas_dtypes[name] == "bool":
dtype = pandas_dtypes[name]
else:
elif (
pandas_dtypes[name] == "empty"
and np_dtypes[name] == "object"
):
# When a string column has all null values, pandas_dtype is
# specified as 'empty' and np_dtypes as 'object',
# hence handling this special case to type-cast the empty
# float column to str column.
dtype = np_dtypes[name]
elif pandas_dtypes[
name
] == "object" and cudf.utils.dtypes.is_struct_dtype(
np_dtypes[name]
):
# In case of a struct column, libcudf is not aware of the names of
# the struct fields, hence renaming the struct fields is
# necessary by extracting the field names from arrow
# struct types.
result._data[name] = result._data[name]._rename_fields(
[field.name for field in data[name].type]
)

result._data[name] = result._data[name].astype(dtype)
if dtype is not None:
result._data[name] = result._data[name].astype(dtype)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

result = libcudf.table.Table(
result._data.select_by_label(column_names)
Expand Down
61 changes: 60 additions & 1 deletion python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

import array as arr
import io
import operator
Expand Down Expand Up @@ -8191,3 +8192,61 @@ def test_agg_for_dataframe_with_string_columns(aggs):
),
):
gdf.agg(aggs)


@pytest.mark.parametrize(
    "gdf",
    [
        gd.DataFrame({"a": [[1], [2], [3]]}),
        gd.DataFrame(
            {
                "left-a": [0, 1, 2],
                "a": [[1], None, [3]],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
        gd.DataFrame(
            {
                "left-a": [[], None, None],
                "a": [[1], None, [3]],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
    ],
)
def test_dataframe_roundtrip_arrow_list_dtype(gdf):
    """Check that frames holding list values survive an Arrow round trip.

    Each parametrized frame is exported with ``to_arrow`` and rebuilt with
    ``DataFrame.from_arrow``; the rebuilt frame must compare equal to the
    original input.
    """
    # The original frame is the reference; the rebuilt one is under test.
    round_tripped = gd.DataFrame.from_arrow(gdf.to_arrow())

    assert_eq(gdf, round_tripped)


@pytest.mark.parametrize(
    "gdf",
    [
        gd.DataFrame({"a": [{"one": 3, "two": 4, "three": 10}]}),
        gd.DataFrame(
            {
                "left-a": [0, 1, 2],
                "a": [{"x": 0.23, "y": 43}, None, {"x": 23.9, "y": 4.3}],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
        gd.DataFrame(
            {
                "left-a": [{"a": 1}, None, None],
                "a": [
                    {"one": 324, "two": 23432, "three": 324},
                    None,
                    {"one": 3.24, "two": 1, "three": 324},
                ],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
    ],
)
def test_dataframe_roundtrip_arrow_struct_dtype(gdf):
    """Check that frames holding dict (struct) values survive an Arrow round trip.

    Each parametrized frame is exported with ``to_arrow`` and rebuilt with
    ``DataFrame.from_arrow``; the rebuilt frame must compare equal to the
    original input.
    """
    # The original frame is the reference; the rebuilt one is under test.
    round_tripped = gd.DataFrame.from_arrow(gdf.to_arrow())

    assert_eq(gdf, round_tripped)