Skip to content

Commit

Permalink
Fix importing list & struct types in from_arrow (#7162)
Browse files Browse the repository at this point in the history
Fixes: #7137, #7148

This PR fixes converting a pyarrow table that has list and struct types via `from_arrow`. In the case of a `list` dtype we shouldn't have to perform any typecast, and in the case of a `struct` dtype we should rename the fields appropriately.

Authors:
  - galipremsagar <[email protected]>

Approvers:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)
  - Keith Kraus (@kkraus14)

URL: #7162
  • Loading branch information
galipremsagar authored Jan 20, 2021
1 parent 5855bfa commit 0515a42
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 11 deletions.
45 changes: 35 additions & 10 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import copy
import functools
import operator
Expand Down Expand Up @@ -2056,20 +2057,44 @@ def from_arrow(cls, data):
else:
result = cudf_category_frame

# In a scenario where column is of type list/other non
# pandas types, there will be no pandas metadata associated with
# given arrow table as those types can only originate from
# arrow.
# There are some special cases that need to be handled
# based on metadata.
if pandas_dtypes:
for name in result._data.names:
if pandas_dtypes[name] == "categorical":
dtype = None
if (
len(result._data[name]) == 0
and pandas_dtypes[name] == "categorical"
):
# When pandas_dtype is a categorical column and the size
# of column is 0(i.e., empty) then we will have an
# int8 column in result._data[name] returned by libcudf,
# which needs to be type-casted to 'category' dtype.
dtype = "category"
elif pandas_dtypes[name] == "bool":
dtype = pandas_dtypes[name]
else:
elif (
pandas_dtypes[name] == "empty"
and np_dtypes[name] == "object"
):
# When a string column has all null values, pandas_dtype is
# specified as 'empty' and np_dtypes as 'object',
# hence handling this special case to type-cast the empty
# float column to str column.
dtype = np_dtypes[name]
elif pandas_dtypes[
name
] == "object" and cudf.utils.dtypes.is_struct_dtype(
np_dtypes[name]
):
# In case of a struct column, libcudf is not aware of the names
# of the struct fields, hence it is necessary to rename the
# struct fields using the field names extracted from the arrow
# struct type.
result._data[name] = result._data[name]._rename_fields(
[field.name for field in data[name].type]
)

result._data[name] = result._data[name].astype(dtype)
if dtype is not None:
result._data[name] = result._data[name].astype(dtype)

result = libcudf.table.Table(
result._data.select_by_label(column_names)
Expand Down
61 changes: 60 additions & 1 deletion python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

import array as arr
import io
import operator
Expand Down Expand Up @@ -8191,3 +8192,61 @@ def test_agg_for_dataframe_with_string_columns(aggs):
),
):
gdf.agg(aggs)


@pytest.mark.parametrize(
    "gdf",
    [
        gd.DataFrame({"a": [[1], [2], [3]]}),
        gd.DataFrame(
            {
                "left-a": [0, 1, 2],
                "a": [[1], None, [3]],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
        gd.DataFrame(
            {
                "left-a": [[], None, None],
                "a": [[1], None, [3]],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
    ],
)
def test_dataframe_roundtrip_arrow_list_dtype(gdf):
    """Round-trip a frame holding list columns through an arrow table.

    The frame is exported with ``to_arrow`` and rebuilt with
    ``DataFrame.from_arrow``; the rebuilt frame must compare equal to
    the frame we started from.
    """
    # ``gdf`` is the reference; the rebuilt frame is the value under test.
    arrow_table = gdf.to_arrow()
    roundtripped = gd.DataFrame.from_arrow(arrow_table)

    assert_eq(gdf, roundtripped)


@pytest.mark.parametrize(
    "gdf",
    [
        gd.DataFrame({"a": [{"one": 3, "two": 4, "three": 10}]}),
        gd.DataFrame(
            {
                "left-a": [0, 1, 2],
                "a": [{"x": 0.23, "y": 43}, None, {"x": 23.9, "y": 4.3}],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
        gd.DataFrame(
            {
                "left-a": [{"a": 1}, None, None],
                "a": [
                    {"one": 324, "two": 23432, "three": 324},
                    None,
                    {"one": 3.24, "two": 1, "three": 324},
                ],
                "right-a": ["abc", "def", "ghi"],
            }
        ),
    ],
)
def test_dataframe_roundtrip_arrow_struct_dtype(gdf):
    """Round-trip a frame holding struct columns through an arrow table.

    The frame is exported with ``to_arrow`` and rebuilt with
    ``DataFrame.from_arrow``; the rebuilt frame must compare equal to
    the frame we started from.
    """
    # ``gdf`` is the reference; the rebuilt frame is the value under test.
    arrow_table = gdf.to_arrow()
    roundtripped = gd.DataFrame.from_arrow(arrow_table)

    assert_eq(gdf, roundtripped)

0 comments on commit 0515a42

Please sign in to comment.