Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable passing a sequence for the index argument to .list.get() #10564

Merged
merged 22 commits into from
Apr 12, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions python/cudf/cudf/_lib/cpp/lists/extract.pxd
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column cimport column, column_view
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.types cimport size_type

Expand All @@ -12,3 +12,7 @@ cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil:
const lists_column_view,
size_type
) except +
# Overload of extract_list_element that takes the index as a column_view
# instead of a single size_type.
cdef unique_ptr[column] extract_list_element(
const lists_column_view,
column_view
) except +
20 changes: 18 additions & 2 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
Expand Down Expand Up @@ -126,7 +126,7 @@ def sort_lists(Column col, bool ascending, str na_position):
return Column.from_unique_ptr(move(c_result))


def extract_element(Column col, size_type index):
def extract_element_scalar(Column col, int index):
vyasr marked this conversation as resolved.
Show resolved Hide resolved
# shared_ptr required because lists_column_view has no default
# ctor
cdef shared_ptr[lists_column_view] list_view = (
Expand All @@ -142,6 +142,22 @@ def extract_element(Column col, size_type index):
return result


def extract_element_column(Column col, Column index):
    """
    Extract elements from the list column ``col`` using the indices
    held in the column ``index``.

    Wraps the ``column_view`` overload of
    ``cudf::lists::extract_list_element`` and returns the result as a
    new ``Column``.
    """
    # shared_ptr required because lists_column_view has no default
    # ctor
    cdef shared_ptr[lists_column_view] lists_view = (
        make_shared[lists_column_view](col.view())
    )
    cdef column_view indices_view = index.view()
    cdef unique_ptr[column] c_out

    # The C++ call does not need the GIL.
    with nogil:
        c_out = move(extract_list_element(lists_view.get()[0], indices_view))

    return Column.from_unique_ptr(move(c_out))


def contains_scalar(Column col, object py_search_key):

cdef DeviceScalar search_key = py_search_key.device_value
Expand Down
74 changes: 60 additions & 14 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pickle
from functools import cached_property
from typing import List, Sequence
from typing import List, Optional, Sequence, Union

import numpy as np
import pyarrow as pa
Expand All @@ -15,12 +15,17 @@
contains_scalar,
count_elements,
drop_list_duplicates,
extract_element,
extract_element_column,
extract_element_scalar,
sort_lists,
)
from cudf._lib.strings.convert.convert_lists import format_list_column
from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
from cudf.api.types import _is_non_decimal_numeric_dtype, is_list_dtype
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
is_list_dtype,
is_scalar,
)
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethods, ParentType
Expand Down Expand Up @@ -337,16 +342,26 @@ def __init__(self, parent: ParentType):
)
super().__init__(parent=parent)

def get(self, index: int) -> ParentType:
def get(
self,
index: int,
default: Optional[Union[ScalarLike, ColumnLike]] = None,
) -> ParentType:
"""
Extract element at the given index from each component
Extract element at the given index from each list.

`index` can be an integer or a sequence of integers.
Passing a sequence enables extracting values at
different indexes for different lists.

Extract element from lists, tuples, or strings in
each element in the Series/Index.
If the index is out of bounds for any list,
return <NA> or, if provided, ``default``.
Thus, this method never raises an ``IndexError``.

Parameters
----------
index : int
index : int or sequence of ints
default : scalar, optional

Returns
-------
Expand All @@ -360,14 +375,45 @@ def get(self, index: int) -> ParentType:
1 5
2 6
dtype: int64

>>> s.list.get(2)
0 <NA>
1 5
2 6
dtype: int64

>>> s.list.get(2, default=0)
0 0
1 5
2 6
dtype: int64

>>> s.list.get([0, 1, 2])
0 1
1 4
2 6
dtype: int64
"""
min_col_list_len = self.len().min()
if -min_col_list_len <= index < min_col_list_len:
return self._return_or_inplace(
extract_element(self._column, index)
)
if is_scalar(index):
out = extract_element_scalar(self._column, cudf.Scalar(index))
else:
raise IndexError("list index out of range")
index = as_column(index)
out = extract_element_column(self._column, as_column(index))
bdice marked this conversation as resolved.
Show resolved Hide resolved

if not (default is None or default is cudf.NA):
# determine rows for which `index` is out-of-bounds
lengths = count_elements(self._column)
out_of_bounds_mask = (np.negative(index) > lengths) | (
shwina marked this conversation as resolved.
Show resolved Hide resolved
index >= lengths
)

# replace the value in those rows (should be NA) with `default`
if out_of_bounds_mask.any():
out = out._scatter_by_column(
out_of_bounds_mask, cudf.Scalar(default)
)

return self._return_or_inplace(out)

def contains(self, search_key: ScalarLike) -> ParentType:
"""
Expand Down
33 changes: 29 additions & 4 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,35 @@ def test_get_nested_lists():
assert_eq(expect, got)


def test_get_nulls():
    """Expect ``IndexError`` when index 100 is requested from empty lists."""
    with pytest.raises(IndexError, match="list index out of range"):
        empty_lists = cudf.Series([[], [], []])
        empty_lists.list.get(100)
def test_get_default():
    """Exercise ``.list.get`` with a scalar index, with and without
    ``default``."""
    int_lists = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]])

    # Without a default (or with an explicit NA default) the row whose
    # list is too short comes back null.
    expect = cudf.Series([cudf.NA, 5, 8])
    got = int_lists.list.get(2)
    assert_eq(expect, got)

    expect = cudf.Series([cudf.NA, 5, 8])
    got = int_lists.list.get(2, default=cudf.NA)
    assert_eq(expect, got)

    # A non-null default fills the out-of-bounds row, for positive and
    # negative indexes alike.
    expect = cudf.Series([0, 5, 8])
    got = int_lists.list.get(2, default=0)
    assert_eq(expect, got)

    expect = cudf.Series([0, 3, 7])
    got = int_lists.list.get(-3, default=0)
    assert_eq(expect, got)

    # In-bounds negative index: last element of every list.
    expect = cudf.Series([2, 5, 9])
    got = int_lists.list.get(-1)
    assert_eq(expect, got)

    # Same check with a string default on lists of strings.
    word_lists = cudf.Series(
        [["apple", "banana"], ["carrot", "daffodil", "elephant"]]
    )
    expect = cudf.Series(["default", "elephant"])
    got = word_lists.list.get(2, default="default")
    assert_eq(expect, got)

    # A null that is actually stored at an in-bounds position is kept;
    # only the out-of-bounds row receives the default.
    lists_with_null = cudf.Series([[0, cudf.NA], [1]])
    expect = cudf.Series([cudf.NA, 0])
    got = lists_with_null.list.get(1, default=0)
    assert_eq(expect, got)


def test_get_ind_sequence():
    """Exercise ``.list.get`` when ``index`` is a sequence, one entry per
    list."""
    int_lists = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]])

    # A plain Python list and a cudf.Series of indexes give the same
    # result.
    expect = cudf.Series([1, 4, 8])
    got = int_lists.list.get([0, 1, 2])
    assert_eq(expect, got)

    expect = cudf.Series([1, 4, 8])
    got = int_lists.list.get(cudf.Series([0, 1, 2]))
    assert_eq(expect, got)

    # Rows whose index is out of bounds (too large or too negative) are
    # null, or take ``default`` when one is supplied.
    expect = cudf.Series([cudf.NA, 5, cudf.NA])
    got = int_lists.list.get([2, 2, -5])
    assert_eq(expect, got)

    expect = cudf.Series([0, 5, 0])
    got = int_lists.list.get([2, 2, -5], default=0)
    assert_eq(expect, got)


@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion python/dask_cudf/dask_cudf/tests/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def test_contains(data, search_key):
"data, index, expectation",
[
(data_test_1(), 1, does_not_raise()),
(data_test_2(), 2, pytest.raises(IndexError)),
(data_test_2(), 2, does_not_raise()),
],
)
def test_get(data, index, expectation):
shwina marked this conversation as resolved.
Show resolved Hide resolved
Expand Down