-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Python bindings for lists::concatenate_list_elements
and expose them as .list.concat()
#8006
Changes from 3 commits
1f188e1
d9edac7
ddf2907
dfd1719
f3ddaba
400e204
4c985e8
65cfa91
48f6949
978807a
cdb5f4b
c945166
6ee35dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,13 +14,17 @@ | |
extract_element, | ||
sort_lists, | ||
) | ||
from cudf._typing import ColumnLike, ScalarLike | ||
from cudf.core.buffer import Buffer | ||
from cudf.core.column import ColumnBase, as_column, column | ||
from cudf.core.column.methods import ColumnMethodsMixin | ||
from cudf.core.column.methods import ColumnMethodsMixin, ParentType | ||
from cudf.core.dtypes import ListDtype | ||
from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype | ||
|
||
|
||
class ListColumn(ColumnBase): | ||
dtype: ListDtype | ||
|
||
def __init__( | ||
self, size, dtype, mask=None, offset=0, null_count=None, children=(), | ||
): | ||
|
@@ -177,14 +181,16 @@ class ListMethods(ColumnMethodsMixin): | |
List methods for Series | ||
""" | ||
|
||
def __init__(self, column, parent=None): | ||
_column: ListColumn | ||
|
||
def __init__(self, column: ListColumn, parent: ParentType = None): | ||
if not is_list_dtype(column.dtype): | ||
raise AttributeError( | ||
"Can only use .list accessor with a 'list' dtype" | ||
) | ||
super().__init__(column=column, parent=parent) | ||
|
||
def get(self, index): | ||
def get(self, index: int) -> ParentType: | ||
""" | ||
Extract element at the given index from each component | ||
|
||
|
@@ -216,10 +222,10 @@ def get(self, index): | |
else: | ||
raise IndexError("list index out of range") | ||
|
||
def contains(self, search_key): | ||
def contains(self, search_key: ScalarLike) -> ParentType: | ||
""" | ||
Creates a column of bool values indicating whether the specified scalar | ||
is an element of each row of a list column. | ||
Returns boolean values indicating whether the specified scalar | ||
is an element of each row. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -228,7 +234,7 @@ def contains(self, search_key): | |
|
||
Returns | ||
------- | ||
Column | ||
Series or Index | ||
|
||
Examples | ||
-------- | ||
|
@@ -256,14 +262,14 @@ def contains(self, search_key): | |
return res | ||
|
||
@property | ||
def leaves(self): | ||
def leaves(self) -> ParentType: | ||
""" | ||
From a Series of (possibly nested) lists, obtain the elements from | ||
the innermost lists as a flat Series (one value per row). | ||
|
||
Returns | ||
------- | ||
Series | ||
Series or Index | ||
|
||
Examples | ||
-------- | ||
|
@@ -284,7 +290,7 @@ def leaves(self): | |
self._column.elements, retain_index=False | ||
) | ||
|
||
def len(self): | ||
def len(self) -> ParentType: | ||
""" | ||
Computes the length of each element in the Series/Index. | ||
|
||
|
@@ -308,18 +314,18 @@ def len(self): | |
""" | ||
return self._return_or_inplace(count_elements(self._column)) | ||
|
||
def take(self, lists_indices): | ||
def take(self, lists_indices: ColumnLike) -> ParentType: | ||
""" | ||
Collect list elements based on given indices. | ||
|
||
Parameters | ||
---------- | ||
lists_indices: List type arrays | ||
lists_indices: Series-like of lists | ||
Specifies what to collect from each row | ||
|
||
Returns | ||
------- | ||
ListColumn | ||
Series or Index | ||
|
||
Examples | ||
-------- | ||
|
@@ -363,14 +369,14 @@ def take(self, lists_indices): | |
else: | ||
return res | ||
|
||
def unique(self): | ||
def unique(self) -> ParentType: | ||
""" | ||
Returns unique element for each list in the column, order for each | ||
unique element is not guaranteed. | ||
Returns the unique elements in each list. | ||
The ordering of elements is not guaranteed. | ||
|
||
Returns | ||
------- | ||
ListColumn | ||
Series or Index | ||
|
||
Examples | ||
-------- | ||
|
@@ -400,12 +406,12 @@ def unique(self): | |
|
||
def sort_values( | ||
self, | ||
ascending=True, | ||
inplace=False, | ||
kind="quicksort", | ||
na_position="last", | ||
ignore_index=False, | ||
): | ||
ascending: bool = True, | ||
inplace: bool = False, | ||
kind: str = "quicksort", | ||
na_position: str = "last", | ||
ignore_index: bool = False, | ||
) -> ParentType: | ||
""" | ||
Sort each list by the values. | ||
|
||
|
@@ -422,7 +428,7 @@ def sort_values( | |
|
||
Returns | ||
------- | ||
ListColumn with each list sorted | ||
Series or Index with each list sorted | ||
|
||
Notes | ||
----- | ||
|
@@ -451,3 +457,56 @@ def sort_values( | |
sort_lists(self._column, ascending, na_position), | ||
retain_index=not ignore_index, | ||
) | ||
|
||
def ravel(self) -> ParentType: | ||
""" | ||
Removes one level of nesting from each row of the list Series. | ||
|
||
Returns | ||
------- | ||
Series or Index with one level of nesting removed from each row. | ||
|
||
Examples | ||
-------- | ||
>>> s1 | ||
0 [[1.0, 2.0], [3.0, 4.0, 5.0]] | ||
1 [[6.0, nan], [7.0], [8.0, 9.0]] | ||
dtype: list | ||
>>> s1.list.ravel() | ||
0 [1.0, 2.0, 3.0, 4.0, 5.0] | ||
1 [6.0, nan, 7.0, 8.0, 9.0] | ||
dtype: list | ||
|
||
Null values at the top-level in each row are dropped: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this desirable? If not, what should our behaviour be? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems ok to me... This may introduce ambiguity of
Even though, when unwrapping nested lists it sounds reasonable to assume both empty list and null item do not contribute to concrete elements. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I exposed a (these are the options available at the libcudf level) |
||
|
||
>>> s2 | ||
0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]] | ||
1 [[6.0, nan], [7.0], [8.0, 9.0]] | ||
dtype: list | ||
>>> s2.list.ravel() | ||
0 [1.0, 2.0, 3.0, 4.0, 5.0] | ||
1 [6.0, nan, 7.0, 8.0, 9.0] | ||
dtype: list | ||
""" | ||
result_dtype = self._column.dtype.element_type | ||
if not isinstance(result_dtype, ListDtype): | ||
return self._return_or_inplace(self._column) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May want to mention in the docstring that this API is designed to work on Lists of Lists and that if you only give it one level it doesn't return an integral type. Or we could decide to allow it to return an integral type. |
||
|
||
self_offsets = self._column.children[0] | ||
child_offsets = self._column.children[1].children[0] | ||
kkraus14 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
result_offsets = child_offsets[self_offsets] | ||
result_children = ( | ||
result_offsets, | ||
self._column.children[1].children[1], | ||
) | ||
|
||
return self._return_or_inplace( | ||
ListColumn( | ||
self._column.size, | ||
self._column.dtype.element_type, | ||
self._column.mask, | ||
self._column.offset, | ||
self._column.null_count, | ||
result_children, | ||
) | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,8 @@ | |
if TYPE_CHECKING: | ||
from cudf.core.column import ColumnBase | ||
|
||
ParentType = Union["cudf.Series", "cudf.Index"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this become There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm changing this in #8306. |
||
|
||
|
||
class ColumnMethodsMixin: | ||
_column: ColumnBase | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
# Copyright (c) 2020-2021, NVIDIA CORPORATION. | ||
import functools | ||
import itertools | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
@@ -315,3 +316,31 @@ def test_contains_null_search_key(data, expect): | |
expect = cudf.Series(expect, dtype="bool") | ||
got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) | ||
assert_eq(expect, got) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"row", | ||
[ | ||
[[]], | ||
[[1]], | ||
[[1, 2]], | ||
[[1, 2], [3, 4, 5]], | ||
[[1, 2, None], [3, 4, 5]], | ||
[[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], | ||
[[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Try a few empty list items in here as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
], | ||
) | ||
def test_ravel(row): | ||
def ravel(row): | ||
return list( | ||
itertools.chain.from_iterable([x for x in row if x is not None]) | ||
) | ||
|
||
expect = pd.Series([ravel(row)]) | ||
got = cudf.Series([row]).list.ravel() | ||
assert_eq(expect, got) | ||
|
||
|
||
def test_ravel_no_nesting(): | ||
s = cudf.Series([[1, 2], [3, 4, 5]]) | ||
assert_eq(s, s.list.ravel()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could use some help here with naming/docstring here.
Unlike
np.ravel
, this function only removes one level of nesting from each row. What's a better name for that?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unpack
,unbox
? (along with a parametern
for how many levels of nesting to be removed? - but maybe this is something we can do in a future PR)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like
unbox
, and I also like the suggestion for ann
parameterThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about
flatten
? That's what Spark uses: https://spark.apache.org/docs/latest/api/sql/index.html#flattenThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we still want the
n
parameter withflatten
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like Spark
flatten
only flattens one level, whereas numpyflatten
does similar toravel
where it flattens down to 1d: https://numpy.org/doc/stable/reference/generated/numpy.ndarray.flatten.html#numpy.ndarray.flattenAny thoughts on what makes the most sense for us?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I went with
lists.concat()
as that's really what we're doing (concatenating the inner lists). If we only have a single inner list, nesting is removed: