From 0907225c0f79a9b34696e484ccaaa02c6cc49b4c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 30 Mar 2022 16:50:38 -0400 Subject: [PATCH 01/20] Add default= kwarg to .list.get() accessor method --- python/cudf/cudf/core/column/lists.py | 41 ++++++++++++++++++--------- python/cudf/cudf/tests/test_list.py | 10 ++++--- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 60d13150b39..f94c084ed71 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -2,7 +2,7 @@ import pickle from functools import cached_property -from typing import List, Sequence +from typing import List, Optional, Sequence import numpy as np import pyarrow as pa @@ -337,16 +337,18 @@ def __init__(self, parent: ParentType): ) super().__init__(parent=parent) - def get(self, index: int) -> ParentType: + def get( + self, index: int, default: Optional[ScalarLike] = None + ) -> ParentType: """ - Extract element at the given index from each component - - Extract element from lists, tuples, or strings in - each element in the Series/Index. + Extract element at the given index from each list. + If the index is out of bounds for any list, + return or, if provided, ``default``. 
Parameters ---------- index : int + default : scalar, optional Returns ------- @@ -360,14 +362,27 @@ def get(self, index: int) -> ParentType: 1 5 2 6 dtype: int64 + + >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]]) + >>> s.list.get(2) + 0 + 1 5 + 2 6 + dtype: int64 + + >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]]) + >>> s.list.get(2, default=0) + 0 0 + 1 5 + 2 6 + dtype: int64 """ - min_col_list_len = self.len().min() - if -min_col_list_len <= index < min_col_list_len: - return self._return_or_inplace( - extract_element(self._column, index) - ) - else: - raise IndexError("list index out of range") + out = extract_element(self._column, index) + return self._return_or_inplace( + out + if (default is None or default is cudf.NA) + else out.fillna(default) + ) def contains(self, search_key: ScalarLike) -> ParentType: """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 6a665a2b43c..82ee4a1d41e 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -292,10 +292,12 @@ def test_get_nested_lists(): assert_eq(expect, got) -def test_get_nulls(): - with pytest.raises(IndexError, match="list index out of range"): - sr = cudf.Series([[], [], []]) - sr.list.get(100) +def test_get_default(): + sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) + + assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2)) + assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) + assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) @pytest.mark.parametrize( From eda60a14a377fec9b38ecd28aca58edcd440b45a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 30 Mar 2022 18:03:02 -0400 Subject: [PATCH 02/20] Add more tests from review --- python/cudf/cudf/tests/test_list.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 82ee4a1d41e..956acc96452 100644 --- 
a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -298,6 +298,16 @@ def test_get_default(): assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2)) assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) + assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) + assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) + + string_sr = cudf.Series( + [["apple", "banana"], ["carrot", "daffodil", "elephant"]] + ) + assert_eq( + cudf.Series(["default", "elephant"]), + string_sr.list.get(2, default="default"), + ) @pytest.mark.parametrize( From 91cb7e540da292dfee659e3a0d6a27a7dc575992 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 07:40:32 -0400 Subject: [PATCH 03/20] Summary line + note about IndexError --- python/cudf/cudf/core/column/lists.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index f94c084ed71..097514476bd 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -342,8 +342,10 @@ def get( ) -> ParentType: """ Extract element at the given index from each list. + If the index is out of bounds for any list, return or, if provided, ``default``. + Thus, this method never raises an ``IndexError``. 
Parameters ---------- From 0c4a4ee1c21494f8d22bb4c564f3abcb0be417f7 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 07:54:26 -0400 Subject: [PATCH 04/20] Change dask_cudf test not to expect an IndexError --- python/dask_cudf/dask_cudf/tests/test_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 84c0e0e9b39..95cf0c8d56d 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -384,7 +384,7 @@ def test_contains(data, search_key): "data, index, expectation", [ (data_test_1(), 1, does_not_raise()), - (data_test_2(), 2, pytest.raises(IndexError)), + (data_test_2(), 2, does_not_raise()), ], ) def test_get(data, index, expectation): From 1693316f6840ceeaa5cd18a7806726493abffd0d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 08:21:13 -0400 Subject: [PATCH 05/20] Add failing test --- python/cudf/cudf/tests/test_list.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 956acc96452..344ded4d6bd 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -310,6 +310,11 @@ def test_get_default(): ) +def test_get_default_with_null(): + sr = cudf.Series([[0, cudf.NA], [1]]) + assert_eq(cudf.Series([cudf.NA, 0]), sr.list.get(1, default=0)) + + @pytest.mark.parametrize( "data, scalar, expect", [ From 1763e522413931c4f8d8452b9e0ef4040378b714 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 16:54:24 -0400 Subject: [PATCH 06/20] Can't use fillna since we might already have nulls --- python/cudf/cudf/core/column/lists.py | 18 +++++++++++++----- python/cudf/cudf/tests/test_list.py | 7 +++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py 
b/python/cudf/cudf/core/column/lists.py index 097514476bd..aba56a83cc0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -380,11 +380,19 @@ def get( dtype: int64 """ out = extract_element(self._column, index) - return self._return_or_inplace( - out - if (default is None or default is cudf.NA) - else out.fillna(default) - ) + + if not (default is None or default is cudf.NA): + # determine rows for which `index` is out-of-bounds + lengths = count_elements(self._column) + out_of_bounds_indexes = (-index > lengths) | (index >= lengths) + + # replace the value in those rows (should be NA) with ``default`` + if out_of_bounds_indexes.any(): + out = out._scatter_by_column( + out_of_bounds_indexes, cudf.Scalar(default) + ) + + return self._return_or_inplace(out) def contains(self, search_key: ScalarLike) -> ParentType: """ diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 344ded4d6bd..b099d9decc8 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -299,6 +299,7 @@ def test_get_default(): assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) + assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) string_sr = cudf.Series( @@ -309,10 +310,8 @@ def test_get_default(): string_sr.list.get(2, default="default"), ) - -def test_get_default_with_null(): - sr = cudf.Series([[0, cudf.NA], [1]]) - assert_eq(cudf.Series([cudf.NA, 0]), sr.list.get(1, default=0)) + sr_with_null = cudf.Series([[0, cudf.NA], [1]]) + assert_eq(cudf.Series([cudf.NA, 0]), sr_with_null.list.get(1, default=0)) @pytest.mark.parametrize( From b73900510700e988100cc699cd2ab9e0b51906ec Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 17:01:56 -0400 Subject: [PATCH 07/20] 
Single backquote? --- python/cudf/cudf/core/column/lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index aba56a83cc0..b5f772e4b71 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -386,7 +386,7 @@ def get( lengths = count_elements(self._column) out_of_bounds_indexes = (-index > lengths) | (index >= lengths) - # replace the value in those rows (should be NA) with ``default`` + # replace the value in those rows (should be NA) with `default` if out_of_bounds_indexes.any(): out = out._scatter_by_column( out_of_bounds_indexes, cudf.Scalar(default) From 8e64c04f2547683aa0c6b994bd32fa93b620621e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 17:45:01 -0400 Subject: [PATCH 08/20] Allow passing a sequence for the `index` argument to `.list.get()` --- python/cudf/cudf/_lib/cpp/lists/extract.pxd | 8 ++++++-- python/cudf/cudf/_lib/lists.pyx | 20 ++++++++++++++++++-- python/cudf/cudf/core/column/lists.py | 14 +++++++++++--- python/cudf/cudf/tests/test_list.py | 9 +++++++++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/lists/extract.pxd b/python/cudf/cudf/_lib/cpp/lists/extract.pxd index a023f728989..93a886d7268 100644 --- a/python/cudf/cudf/_lib/cpp/lists/extract.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/extract.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr -from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.types cimport size_type @@ -12,3 +12,7 @@ cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: const lists_column_view, size_type ) except + + cdef unique_ptr[column] extract_list_element( + const lists_column_view, + column_view + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index ef759a21132..d80dfd85b2e 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport make_shared, shared_ptr, unique_ptr @@ -126,7 +126,7 @@ def sort_lists(Column col, bool ascending, str na_position): return Column.from_unique_ptr(move(c_result)) -def extract_element(Column col, size_type index): +def extract_element_scalar(Column col, int index): # shared_ptr required because lists_column_view has no default # ctor cdef shared_ptr[lists_column_view] list_view = ( @@ -142,6 +142,22 @@ def extract_element(Column col, size_type index): return result +def extract_element_column(Column col, Column index): + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + + cdef column_view index_view = index.view() + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(extract_list_element(list_view.get()[0], index_view)) + + result = Column.from_unique_ptr(move(c_result)) + return result + + def contains_scalar(Column col, object py_search_key): cdef DeviceScalar search_key = py_search_key.device_value diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b5f772e4b71..034c1adbe8f 100644 --- 
a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -15,12 +15,17 @@ contains_scalar, count_elements, drop_list_duplicates, - extract_element, + extract_element_column, + extract_element_scalar, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.api.types import _is_non_decimal_numeric_dtype, is_list_dtype +from cudf.api.types import ( + _is_non_decimal_numeric_dtype, + is_list_dtype, + is_scalar, +) from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType @@ -379,7 +384,10 @@ def get( 2 6 dtype: int64 """ - out = extract_element(self._column, index) + if is_scalar(index): + out = extract_element_scalar(self._column, cudf.Scalar(index)) + else: + out = extract_element_column(self._column, as_column(index)) if not (default is None or default is cudf.NA): # determine rows for which `index` is out-of-bounds diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index b099d9decc8..51c6a450329 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -314,6 +314,15 @@ def test_get_default(): assert_eq(cudf.Series([cudf.NA, 0]), sr_with_null.list.get(1, default=0)) +def test_get_ind_sequence(): + # test .list.get() when `index` is a sequence + sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) + + assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) + assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) + assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) + + @pytest.mark.parametrize( "data, scalar, expect", [ From f92c736c1fef1232f32d03b6eb6aa6bcf4186b97 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 17:47:29 -0400 Subject: [PATCH 09/20] Updates --- python/cudf/cudf/tests/test_list.py | 
1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 51c6a450329..046a209a8c5 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -321,6 +321,7 @@ def test_get_ind_sequence(): assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) + assert_eq(cudf.Series([0, 5, 0]), sr.list.get([2, 2, -5], default=0)) @pytest.mark.parametrize( From 9e86646f311a4859ce2e200fe477cdd4ab380a84 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 17:55:08 -0400 Subject: [PATCH 10/20] Updates... --- python/cudf/cudf/core/column/lists.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 034c1adbe8f..0cf220c4c48 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -387,12 +387,15 @@ def get( if is_scalar(index): out = extract_element_scalar(self._column, cudf.Scalar(index)) else: + index = as_column(index) out = extract_element_column(self._column, as_column(index)) if not (default is None or default is cudf.NA): # determine rows for which `index` is out-of-bounds lengths = count_elements(self._column) - out_of_bounds_indexes = (-index > lengths) | (index >= lengths) + out_of_bounds_indexes = (np.negative(index) > lengths) | ( + index >= lengths + ) # replace the value in those rows (should be NA) with `default` if out_of_bounds_indexes.any(): From 02fa7ad0938b2927cd076f9225b8642592d53428 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 31 Mar 2022 18:03:34 -0400 Subject: [PATCH 11/20] Doc --- python/cudf/cudf/core/column/lists.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py 
b/python/cudf/cudf/core/column/lists.py index 0cf220c4c48..34871377a09 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -2,7 +2,7 @@ import pickle from functools import cached_property -from typing import List, Optional, Sequence +from typing import List, Optional, Sequence, Union import numpy as np import pyarrow as pa @@ -343,18 +343,24 @@ def __init__(self, parent: ParentType): super().__init__(parent=parent) def get( - self, index: int, default: Optional[ScalarLike] = None + self, + index: int, + default: Optional[Union[ScalarLike, ColumnLike]] = None, ) -> ParentType: """ Extract element at the given index from each list. + `index` can be an integer or a sequence of integers. + Passing a sequence enables extracting values at + different indexes for different lists. + If the index is out of bounds for any list, return or, if provided, ``default``. Thus, this method never raises an ``IndexError``. Parameters ---------- - index : int + index : int or sequence of ints default : scalar, optional Returns @@ -370,19 +376,23 @@ def get( 2 6 dtype: int64 - >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]]) >>> s.list.get(2) 0 1 5 2 6 dtype: int64 - >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]]) >>> s.list.get(2, default=0) 0 0 1 5 2 6 dtype: int64 + + >>> s.list.get([0, 1, 2]) + 0 1 + 1 4 + 2 6 + dtype: int64 """ if is_scalar(index): out = extract_element_scalar(self._column, cudf.Scalar(index)) From b56ce6fd42550033b3b669716221fd40a225ee91 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 1 Apr 2022 13:27:43 -0400 Subject: [PATCH 12/20] indexes->mask --- python/cudf/cudf/core/column/lists.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 34871377a09..b4873fab518 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -403,14 +403,14 @@ def get( if not 
(default is None or default is cudf.NA): # determine rows for which `index` is out-of-bounds lengths = count_elements(self._column) - out_of_bounds_indexes = (np.negative(index) > lengths) | ( + out_of_bounds_mask = (np.negative(index) > lengths) | ( index >= lengths ) # replace the value in those rows (should be NA) with `default` - if out_of_bounds_indexes.any(): + if out_of_bounds_mask.any(): out = out._scatter_by_column( - out_of_bounds_indexes, cudf.Scalar(default) + out_of_bounds_mask, cudf.Scalar(default) ) return self._return_or_inplace(out) From ccccbb28cbfb39162db13d73b34b0fb94d74d6dc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 1 Apr 2022 13:28:16 -0400 Subject: [PATCH 13/20] Duplicate test --- python/cudf/cudf/tests/test_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 046a209a8c5..4e17b945fbd 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -298,7 +298,6 @@ def test_get_default(): assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2)) assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) - assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) From 73de94da95b2664adfff667711789c096f6cbdce Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Apr 2022 14:10:41 -0400 Subject: [PATCH 14/20] Remove expectation from test --- python/dask_cudf/dask_cudf/tests/test_accessor.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 95cf0c8d56d..aad5ed2078a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -381,19 
+381,16 @@ def test_contains(data, search_key): @pytest.mark.parametrize( - "data, index, expectation", + "data, index", [ - (data_test_1(), 1, does_not_raise()), - (data_test_2(), 2, does_not_raise()), + (data_test_1(), 1), + (data_test_2(), 2), ], ) def test_get(data, index, expectation): - with expectation: - expect = Series(data).list.get(index) - - if expectation == does_not_raise(): - ds = dgd.from_cudf(Series(data), 5) - assert_eq(expect, ds.list.get(index).compute()) + expect = Series(data).list.get(index) + ds = dgd.from_cudf(Series(data), 5) + assert_eq(expect, ds.list.get(index).compute()) @pytest.mark.parametrize( From 3b6c96f7ae196973eef159719a4e0d6c882409f1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Apr 2022 14:36:01 -0400 Subject: [PATCH 15/20] Remove expectation --- python/dask_cudf/dask_cudf/tests/test_accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index aad5ed2078a..f83800bf6b0 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -387,7 +387,7 @@ def test_contains(data, search_key): (data_test_2(), 2), ], ) -def test_get(data, index, expectation): +def test_get(data, index): expect = Series(data).list.get(index) ds = dgd.from_cudf(Series(data), 5) assert_eq(expect, ds.list.get(index).compute()) From b950b7652d7cdaead3bb7af9e64b04245ff36770 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 6 Apr 2022 16:16:19 -0400 Subject: [PATCH 16/20] Clarify docs --- python/cudf/cudf/core/column/lists.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 056a66e9a59..6a41fdde20e 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -348,15 +348,18 @@ def get( default: 
Optional[Union[ScalarLike, ColumnLike]] = None, ) -> ParentType: """ - Extract element at the given index from each list. - - `index` can be an integer or a sequence of integers. - Passing a sequence enables extracting values at - different indexes for different lists. - - If the index is out of bounds for any list, - return or, if provided, ``default``. - Thus, this method never raises an ``IndexError``. + Extract element at the given index from each list in a Series of lists. + + ``index`` can be an integer or a sequence of integers. If + ``index`` is an integer, the element at position ``index`` is + extracted from each list. If ``index`` is a sequence, it must + be of the same length as the Series, and ``index[i]`` + specifies the position of the element to extract from the + ``i``-th list in the Series. + + If the index is out of bounds for any list, return <NA> or, if + provided, ``default``. Thus, this method never raises an + ``IndexError``. Parameters ---------- From 5658fcd4baf55568ba25fbd3330df8df4197b1f0 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 7 Apr 2022 10:34:30 -0400 Subject: [PATCH 17/20] Add a test with Series input --- python/cudf/cudf/tests/test_list.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 6e992bcb165..814803adff0 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -315,6 +315,9 @@ def test_get_default(): sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1)) assert_eq(cudf.Series([[5, 6], cudf.NA]), sr_nested.list.get(2)) + assert_eq( + cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) + ) def test_get_ind_sequence(): @@ -322,13 +325,11 @@ def test_get_ind_sequence(): sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) 
assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) + assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) assert_eq(cudf.Series([0, 5, 0]), sr.list.get([2, 2, -5], default=0)) - sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) - assert_eq( - cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) - ) + assert_eq(cudf.Series([[1, 2], [7, 8]]), sr_nested.list.get([0, 1])) @pytest.mark.parametrize( From cb2c46cdcd7c09cedeab6237487c26e05cefeb98 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 8 Apr 2022 15:11:21 -0400 Subject: [PATCH 18/20] Duplicate test --- python/cudf/cudf/tests/test_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 814803adff0..09d40a24a49 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -325,7 +325,6 @@ def test_get_ind_sequence(): sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) - assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) assert_eq(cudf.Series([0, 5, 0]), sr.list.get([2, 2, -5], default=0)) sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) From aee243d6b85904b4573e0dfbe43a0807516177cf Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 8 Apr 2022 15:14:10 -0400 Subject: [PATCH 19/20] Undo change from size_type->int --- python/cudf/cudf/_lib/lists.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index d80dfd85b2e..06663b5b0af 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -126,7 +126,7 @@ def 
sort_lists(Column col, bool ascending, str na_position): return Column.from_unique_ptr(move(c_result)) -def extract_element_scalar(Column col, int index): +def extract_element_scalar(Column col, size_type index): # shared_ptr required because lists_column_view has no default # ctor cdef shared_ptr[lists_column_view] list_view = ( From 1e5a2a8de305cd0cf0e7d202af1f79ac902c6cfc Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 11 Apr 2022 18:11:44 -0400 Subject: [PATCH 20/20] Remove old import --- python/cudf/cudf/core/column/lists.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 92e46a8c1ae..8578bfe8147 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -17,7 +17,6 @@ drop_list_duplicates, extract_element_column, extract_element_scalar, - extract_element, index_of, sort_lists, )