From 84b06b35476394c8d1d70ae791b472cda09b91cc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 12 Mar 2021 19:06:18 -0800 Subject: [PATCH 1/8] Initial --- python/cudf/cudf/_lib/copying.pyx | 20 ++++++++++- python/cudf/cudf/_lib/cpp/lists/gather.pxd | 13 ++++++++ python/cudf/cudf/core/column/lists.py | 39 +++++++++++++++++++++- 3 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/lists/gather.pxd diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ad798a73ed2..0582f5b37c7 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -3,7 +3,7 @@ import pandas as pd from libcpp cimport bool -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared from libcpp.vector cimport vector from libcpp.utility cimport move from libc.stdint cimport int32_t, int64_t @@ -24,6 +24,8 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view +from cudf._lib.cpp.lists.gather cimport segmented_gather as cpp_segmented_gather cimport cudf._lib.cpp.copying as cpp_copying # workaround for https://github.com/cython/cython/issues/3885 @@ -704,3 +706,19 @@ def sample(Table input, size_type n, else input._index_names ) ) + +def segmented_gather(Column source_column, Column gather_map): + cdef shared_ptr[lists_column_view] source_LCV = ( + make_shared[lists_column_view](source_column.view()) + ) + cdef shared_ptr[lists_column_view] gather_map_LCV = ( + make_shared[lists_column_view](gather_map.view()) + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_segmented_gather(source_LCV.get()[0], + gather_map_LCV.get()[0])) + + result = Column.from_unique_ptr(move(c_result)) + return result diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd new file mode 100644 index 00000000000..ea664eee82e --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + + +cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] segmented_gather( + const lists_column_view source_column, + const lists_column_view gather_map_list + ) except + diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index a60fe627acb..7a84eb67af3 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -6,8 +6,9 @@ import cudf from cudf._lib.lists import count_elements +from cudf._lib.copying import segmented_gather from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, column +from cudf.core.column import ColumnBase, column, as_column from cudf.core.column.methods import ColumnMethodsMixin from cudf.utils.dtypes import is_list_dtype @@ -228,3 +229,39 @@ def len(self): dtype: int32 """ return self._return_or_inplace(count_elements(self._column)) + + def take(self, lists_indices): + """ + Collect list elements based on given indices. + + Parameters + ---------- + lists_indices: List of non-nullable lists of position types + Specifies what to collect from each row + + Returns + ------- + ListColumn + + Examples + -------- + >>> s = cudf.Series([[1, 2, 3], None, [4, 5]]) + >>> s + 0 [1, 2, 3] + 1 None + 2 [4, 5] + dtype: list + >>> s.take([[0, 1], [], []]) + 0 [1, 2] + 1 None + 2 [] + dtype: list + """ + + pa_lists_indices = pa.array(lists_indices) + if not pa_lists_indices.type == pa.ListType or not pa.types.is_integer(pa_lists_indices.value.type) or pa_lists_indices.null_count > 0: + raise ValueError("lists_indices is not a list of non-nullable" + "lists of position types.") + lists_indices_col = as_column(pa_lists_indices) + + return self._return_or_inplace(segmented_gather(self._column, lists_indices_col)) From e0f3dede798b216779ecbba3d1dad90c42dfc9bc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 15 Mar 2021 11:12:09 -0700 Subject: [PATCH 2/8] Tests --- python/cudf/cudf/core/column/lists.py | 33 +++++++++++------ python/cudf/cudf/tests/test_list.py | 52 +++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 7a84eb67af3..e74a32ab6af 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -5,10 +5,10 @@ import pyarrow as pa import cudf -from cudf._lib.lists import count_elements from cudf._lib.copying import segmented_gather +from cudf._lib.lists import count_elements from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, column, as_column +from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin from cudf.utils.dtypes import is_list_dtype @@ -236,7 +236,7 @@ def take(self, lists_indices): Parameters ---------- - lists_indices: List of non-nullable lists of position types + lists_indices: List type arrays Specifies what to collect from each row Returns @@ -251,17 +251,28 @@ def take(self, lists_indices): 1 None 2 [4, 5] dtype: list - >>> s.take([[0, 1], [], []]) + >>> s.list.take([[0, 1], [], []]) 0 [1, 2] 1 None 2 [] dtype: list """ - pa_lists_indices = pa.array(lists_indices) - if not pa_lists_indices.type == pa.ListType or not pa.types.is_integer(pa_lists_indices.value.type) or pa_lists_indices.null_count > 0: - raise ValueError("lists_indices is not a list of non-nullable" - "lists of position types.") - lists_indices_col = as_column(pa_lists_indices) - - return self._return_or_inplace(segmented_gather(self._column, lists_indices_col)) + lists_indices_col = as_column(lists_indices) + if not isinstance(lists_indices_col, ListColumn): + raise ValueError("lists_indices should be list type array.") + if not len(lists_indices_col) == self._column.size: + raise ValueError( + "lists_indices and list column is of different " "size." + ) + try: + res = segmented_gather(self._column, lists_indices_col) + except RuntimeError as e: + if "of index type" in str(e): + raise ValueError( + "lists_indices should be column of " + "values of index types." + ) + if "contains nulls" in str(e): + raise ValueError("lists_indices contains null elements.") + return self._return_or_inplace(res) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 195d8749ec6..7a974825e4d 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -112,3 +112,55 @@ def test_len(data): got = gsr.list.len() assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + ("data", "idx"), + [ + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]), + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]), + ([[1, 2, 3], []], [[0, 1], []]), + ([[1, 2, 3], [None]], [[0, 1], []]), + ([[1, None, 3], None], [[0, 1], []]), + ], +) +def test_take(data, idx): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + + pdf = pd.DataFrame({"data": ps, "idx": idx}) + expected = pdf.apply( + lambda x: [x["data"][i] for i in x["idx"]] + if x["data"] is not None + else None, + axis=1, + ) + + got = gs.list.take(idx) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + ("invalid", "exception"), + [ + ([[0]], pytest.raises(ValueError, match="different size")), + ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), + ( + [["a", "b"], ["c"]], + pytest.raises( + ValueError, match="should be column of values of index types" + ), + ), + ( + [[[1], [0]], [[0]]], + pytest.raises( + ValueError, match="should be column of values of index types" + ), + ), + ([[0, 1], None], pytest.raises(ValueError, match="contains null")), + ], +) +def test_take_invalid(invalid, exception): + gs = cudf.Series([[0, 1], [2, 3]]) + with exception: + gs.list.take(invalid) From 457540deecff25f959ea02ad908738834f88d06b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 15 Mar 2021 12:53:10 -0700 Subject: [PATCH 3/8] Improving preliminary checks --- python/cudf/cudf/core/column/lists.py | 30 +++++++++++++++------------ python/cudf/cudf/tests/test_list.py | 4 ++-- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index e74a32ab6af..158cf599af3 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -2,6 +2,7 @@ import pickle +import numpy as np import pyarrow as pa import cudf @@ -10,7 +11,7 @@ from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin -from cudf.utils.dtypes import is_list_dtype +from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype class ListColumn(ColumnBase): @@ -261,18 +262,21 @@ def take(self, lists_indices): lists_indices_col = as_column(lists_indices) if not isinstance(lists_indices_col, ListColumn): raise ValueError("lists_indices should be list type array.") - if not len(lists_indices_col) == self._column.size: + if not lists_indices_col.size == self._column.size: raise ValueError( "lists_indices and list column is of different " "size." ) - try: - res = segmented_gather(self._column, lists_indices_col) - except RuntimeError as e: - if "of index type" in str(e): - raise ValueError( - "lists_indices should be column of " - "values of index types." - ) - if "contains nulls" in str(e): - raise ValueError("lists_indices contains null elements.") - return self._return_or_inplace(res) + if lists_indices_col.null_count > 0: + raise ValueError("lists_indices contains null elements.") + if not is_numerical_dtype( + lists_indices_col.children[1].dtype + ) or not np.issubdtype( + lists_indices_col.children[1].dtype, np.integer + ): + raise TypeError( + "lists_indices should be column of values of index types." + ) + + return self._return_or_inplace( + segmented_gather(self._column, lists_indices_col) + ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7a974825e4d..488ec00697b 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -148,13 +148,13 @@ def test_take(data, idx): ( [["a", "b"], ["c"]], pytest.raises( - ValueError, match="should be column of values of index types" + TypeError, match="should be column of values of index types" ), ), ( [[[1], [0]], [[0]]], pytest.raises( - ValueError, match="should be column of values of index types" + TypeError, match="should be column of values of index types" ), ), ([[0, 1], None], pytest.raises(ValueError, match="contains null")), From 6aea4d857f8076c265e900d07b21f6402b196daf Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 15 Mar 2021 12:57:26 -0700 Subject: [PATCH 4/8] Simpler pd test file --- python/cudf/cudf/tests/test_list.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 488ec00697b..33812cfa7a7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -128,14 +128,9 @@ def test_take(data, idx): ps = pd.Series(data) gs = cudf.from_pandas(ps) - pdf = pd.DataFrame({"data": ps, "idx": idx}) - expected = pdf.apply( - lambda x: [x["data"][i] for i in x["idx"]] - if x["data"] is not None - else None, - axis=1, + expected = pd.Series(zip(ps, idx)).map( + lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None ) - got = gs.list.take(idx) assert_eq(expected, got) From 61e9237cc568de5c7369bb5c73ff4d2d7abb52f4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 15 Mar 2021 13:06:27 -0700 Subject: [PATCH 5/8] Cython style --- python/cudf/cudf/_lib/copying.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 0582f5b37c7..e5501428624 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -25,7 +25,9 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view -from cudf._lib.cpp.lists.gather cimport segmented_gather as cpp_segmented_gather +from cudf._lib.cpp.lists.gather cimport ( + segmented_gather as cpp_segmented_gather +) cimport cudf._lib.cpp.copying as cpp_copying # workaround for https://github.com/cython/cython/issues/3885 @@ -707,6 +709,7 @@ def sample(Table input, size_type n, ) ) + def segmented_gather(Column source_column, Column gather_map): cdef shared_ptr[lists_column_view] source_LCV = ( make_shared[lists_column_view](source_column.view()) @@ -717,8 +720,10 @@ def segmented_gather(Column source_column, Column gather_map): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_segmented_gather(source_LCV.get()[0], - gather_map_LCV.get()[0])) + c_result = move( + cpp_segmented_gather( + source_LCV.get()[0], gather_map_LCV.get()[0]) + ) result = Column.from_unique_ptr(move(c_result)) return result From f331f4150b614b5291e8ece0a3df695437549f5c Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Mon, 15 Mar 2021 14:25:02 -0700 Subject: [PATCH 6/8] Rev: Fix docs bug --- python/cudf/cudf/core/column/lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 158cf599af3..222abcf1ce4 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -238,7 +238,7 @@ def take(self, lists_indices): Parameters ---------- lists_indices: List type arrays - Specifies what to collect from each row + Specifies what to collect from each row Returns ------- From cab305911dece511b2ba617fe1cae504c85e9652 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 16 Mar 2021 01:27:46 -0700 Subject: [PATCH 7/8] Rev: proper indent for docs Co-authored-by: Keith Kraus --- python/cudf/cudf/core/column/lists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 222abcf1ce4..96edc669ecb 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -238,7 +238,7 @@ def take(self, lists_indices): Parameters ---------- lists_indices: List type arrays - Specifies what to collect from each row + Specifies what to collect from each row Returns ------- From 098c099865f5d99c863f5dc0cabae9fe7c0df7ca Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 16:36:52 -0700 Subject: [PATCH 8/8] Use try block for null detection --- python/cudf/cudf/core/column/lists.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 222abcf1ce4..b1c3a520714 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -266,8 +266,6 @@ def take(self, lists_indices): raise ValueError( "lists_indices and list column is of different " "size." ) - if lists_indices_col.null_count > 0: - raise ValueError("lists_indices contains null elements.") if not is_numerical_dtype( lists_indices_col.children[1].dtype ) or not np.issubdtype( @@ -277,6 +275,13 @@ def take(self, lists_indices): "lists_indices should be column of values of index types." ) - return self._return_or_inplace( - segmented_gather(self._column, lists_indices_col) - ) + try: + res = self._return_or_inplace( + segmented_gather(self._column, lists_indices_col) + ) + except RuntimeError as e: + if "contains nulls" in str(e): + raise ValueError("lists_indices contains null.") from e + raise + else: + return res