Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Python bindings for lists::concatenate_list_elements and expose them as .list.concat() #8006

Merged
merged 13 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
Optional,
Sequence,
Tuple,
Union,
cast,
)

Expand All @@ -26,7 +25,7 @@
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.core.column.methods import ColumnMethodsMixin, ParentType
from cudf.core.dtypes import CategoricalDtype
from cudf.utils.dtypes import (
is_categorical_dtype,
Expand All @@ -45,9 +44,6 @@
)


ParentType = Union["cudf.Series", "cudf.Index"]


class CategoricalAccessor(ColumnMethodsMixin):
_column: CategoricalColumn

Expand Down
107 changes: 83 additions & 24 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
extract_element,
sort_lists,
)
from cudf._typing import ColumnLike, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.core.column.methods import ColumnMethodsMixin, ParentType
from cudf.core.dtypes import ListDtype
from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype


class ListColumn(ColumnBase):
dtype: ListDtype

def __init__(
self, size, dtype, mask=None, offset=0, null_count=None, children=(),
):
Expand Down Expand Up @@ -177,14 +181,16 @@ class ListMethods(ColumnMethodsMixin):
List methods for Series
"""

def __init__(self, column, parent=None):
_column: ListColumn

def __init__(self, column: ListColumn, parent: ParentType = None):
if not is_list_dtype(column.dtype):
raise AttributeError(
"Can only use .list accessor with a 'list' dtype"
)
super().__init__(column=column, parent=parent)

def get(self, index):
def get(self, index: int) -> ParentType:
"""
Extract element at the given index from each component

Expand Down Expand Up @@ -216,10 +222,10 @@ def get(self, index):
else:
raise IndexError("list index out of range")

def contains(self, search_key):
def contains(self, search_key: ScalarLike) -> ParentType:
"""
Creates a column of bool values indicating whether the specified scalar
is an element of each row of a list column.
Returns boolean values indicating whether the specified scalar
is an element of each row.

Parameters
----------
Expand All @@ -228,7 +234,7 @@ def contains(self, search_key):

Returns
-------
Column
Series or Index

Examples
--------
Expand Down Expand Up @@ -256,14 +262,14 @@ def contains(self, search_key):
return res

@property
def leaves(self):
def leaves(self) -> ParentType:
"""
From a Series of (possibly nested) lists, obtain the elements from
the innermost lists as a flat Series (one value per row).

Returns
-------
Series
Series or Index

Examples
--------
Expand All @@ -284,7 +290,7 @@ def leaves(self):
self._column.elements, retain_index=False
)

def len(self):
def len(self) -> ParentType:
"""
Computes the length of each element in the Series/Index.

Expand All @@ -308,18 +314,18 @@ def len(self):
"""
return self._return_or_inplace(count_elements(self._column))

def take(self, lists_indices):
def take(self, lists_indices: ColumnLike) -> ParentType:
"""
Collect list elements based on given indices.

Parameters
----------
lists_indices: List type arrays
lists_indices: Series-like of lists
Specifies what to collect from each row

Returns
-------
ListColumn
Series or Index

Examples
--------
Expand Down Expand Up @@ -363,14 +369,14 @@ def take(self, lists_indices):
else:
return res

def unique(self):
def unique(self) -> ParentType:
"""
Returns unique element for each list in the column, order for each
unique element is not guaranteed.
Returns the unique elements in each list.
The ordering of elements is not guaranteed.

Returns
-------
ListColumn
Series or Index

Examples
--------
Expand Down Expand Up @@ -400,12 +406,12 @@ def unique(self):

def sort_values(
self,
ascending=True,
inplace=False,
kind="quicksort",
na_position="last",
ignore_index=False,
):
ascending: bool = True,
inplace: bool = False,
kind: str = "quicksort",
na_position: str = "last",
ignore_index: bool = False,
) -> ParentType:
"""
Sort each list by the values.

Expand All @@ -422,7 +428,7 @@ def sort_values(

Returns
-------
ListColumn with each list sorted
Series or Index with each list sorted

Notes
-----
Expand Down Expand Up @@ -451,3 +457,56 @@ def sort_values(
sort_lists(self._column, ascending, na_position),
retain_index=not ignore_index,
)

def ravel(self) -> ParentType:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could use some help here with naming/docstring here.

Unlike np.ravel, this function only removes one level of nesting from each row. What's a better name for that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's a better name for that?

unpack, unbox ? (along with a parameter n for how many levels of nesting to be removed? - but maybe this is something we can do in a future PR)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like unbox, and I also like the suggestion for an n parameter

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about flatten? That's what Spark uses: https://spark.apache.org/docs/latest/api/sql/index.html#flatten

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still want the n parameter with flatten?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like Spark flatten only flattens one level, whereas numpy flatten does similar to ravel where it flattens down to 1d: https://numpy.org/doc/stable/reference/generated/numpy.ndarray.flatten.html#numpy.ndarray.flatten

Any thoughts on what makes the most sense for us?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went with lists.concat() as that's really what we're doing (concatenating the inner lists). If we only have a single inner list, nesting is removed:

In [15]: s
Out[15]:
0    [[1, 2], [3, 4]]
dtype: list

In [16]: s.list.concat()
Out[16]:
0    [1, 2, 3, 4]
dtype: list
In [18]: s
Out[18]:
0    [[1, 2]]
dtype: list

In [19]: s.list.concat()
Out[19]:
0    [1, 2]
dtype: list

"""
Removes one level of nesting from each row of the list Series.

Returns
-------
Series or Index with one level of nesting removed from each row.

Examples
--------
>>> s1
0 [[1.0, 2.0], [3.0, 4.0, 5.0]]
1 [[6.0, nan], [7.0], [8.0, 9.0]]
dtype: list
>>> s1.list.ravel()
0 [1.0, 2.0, 3.0, 4.0, 5.0]
1 [6.0, nan, 7.0, 8.0, 9.0]
dtype: list

Null values at the top-level in each row are dropped:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this desirable? If not, what should our behaviour be?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems ok to me... This may introduce ambiguity of

flatten([[1, 2, 3], None, [4, 5]]
flatten([[1, 2, 3], [], [4, 5]]

Even though, when unwrapping nested lists it sounds reasonable to assume both empty list and null item do not contribute to concrete elements.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I exposed a dropna param which will drop null values by default. If set to False, the corresponding row in the result is null.

(these are the options available at the libcudf level)


>>> s2
0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]]
1 [[6.0, nan], [7.0], [8.0, 9.0]]
dtype: list
>>> s2.list.ravel()
0 [1.0, 2.0, 3.0, 4.0, 5.0]
1 [6.0, nan, 7.0, 8.0, 9.0]
dtype: list
"""
result_dtype = self._column.dtype.element_type
if not isinstance(result_dtype, ListDtype):
return self._return_or_inplace(self._column)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May want to mention in the docstring that this API is designed to work on Lists of Lists and that if you only give it one level it doesn't return an integral type.

Or we could decide to allow it to return an integral type.


self_offsets = self._column.children[0]
child_offsets = self._column.children[1].children[0]
kkraus14 marked this conversation as resolved.
Show resolved Hide resolved
result_offsets = child_offsets[self_offsets]
result_children = (
result_offsets,
self._column.children[1].children[1],
)

return self._return_or_inplace(
ListColumn(
self._column.size,
self._column.dtype.element_type,
self._column.mask,
self._column.offset,
self._column.null_count,
result_children,
)
)
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/column/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
if TYPE_CHECKING:
from cudf.core.column import ColumnBase

ParentType = Union["cudf.Series", "cudf.Index"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this become SingleColumnFrame from #8115 ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm changing this in #8306.



class ColumnMethodsMixin:
_column: ColumnBase
Expand Down
5 changes: 1 addition & 4 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@
from cudf._typing import ColumnLike, Dtype, ScalarLike
from cudf.core.buffer import Buffer
from cudf.core.column import column, datetime
from cudf.core.column.methods import ColumnMethodsMixin
from cudf.core.column.methods import ColumnMethodsMixin, ParentType
from cudf.utils import utils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -209,9 +209,6 @@
}


ParentType = Union["cudf.Series", "cudf.Index"]


class StringMethods(ColumnMethodsMixin):
def __init__(self, column, parent=None):
"""
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
import functools
import itertools

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -315,3 +316,31 @@ def test_contains_null_search_key(data, expect):
expect = cudf.Series(expect, dtype="bool")
got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type))
assert_eq(expect, got)


@pytest.mark.parametrize(
"row",
[
[[]],
[[1]],
[[1, 2]],
[[1, 2], [3, 4, 5]],
[[1, 2, None], [3, 4, 5]],
[[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]],
[[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try a few empty list items in here as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

],
)
def test_ravel(row):
def ravel(row):
return list(
itertools.chain.from_iterable([x for x in row if x is not None])
)

expect = pd.Series([ravel(row)])
got = cudf.Series([row]).list.ravel()
assert_eq(expect, got)


def test_ravel_no_nesting():
s = cudf.Series([[1, 2], [3, 4, 5]])
assert_eq(s, s.list.ravel())