From 84b06b35476394c8d1d70ae791b472cda09b91cc Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Fri, 12 Mar 2021 19:06:18 -0800
Subject: [PATCH 1/8] Initial

---
 python/cudf/cudf/_lib/copying.pyx          | 20 ++++++++++-
 python/cudf/cudf/_lib/cpp/lists/gather.pxd | 13 ++++++++
 python/cudf/cudf/core/column/lists.py      | 39 +++++++++++++++++++++-
 3 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/lists/gather.pxd

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index ad798a73ed2..0582f5b37c7 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -3,7 +3,7 @@
 import pandas as pd
 
 from libcpp cimport bool
-from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared
 from libcpp.vector cimport vector
 from libcpp.utility cimport move
 from libc.stdint cimport int32_t, int64_t
@@ -24,6 +24,8 @@ from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.cpp.lists.gather cimport segmented_gather as cpp_segmented_gather
 cimport cudf._lib.cpp.copying as cpp_copying
 
 # workaround for https://github.com/cython/cython/issues/3885
@@ -704,3 +706,19 @@ def sample(Table input, size_type n,
             else input._index_names
         )
     )
+
+def segmented_gather(Column source_column, Column gather_map):
+    cdef shared_ptr[lists_column_view] source_LCV = (
+        make_shared[lists_column_view](source_column.view())
+    )
+    cdef shared_ptr[lists_column_view] gather_map_LCV = (
+        make_shared[lists_column_view](gather_map.view())
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_segmented_gather(source_LCV.get()[0],
+                                        gather_map_LCV.get()[0]))
+
+    result = Column.from_unique_ptr(move(c_result))
+    return result
diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
new file mode 100644
index 00000000000..ea664eee82e
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+
+
+cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] segmented_gather(
+        const lists_column_view source_column,
+        const lists_column_view gather_map_list
+    ) except +
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index a60fe627acb..7a84eb67af3 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -6,8 +6,9 @@
 
 import cudf
 from cudf._lib.lists import count_elements
+from cudf._lib.copying import segmented_gather
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, column
+from cudf.core.column import ColumnBase, column, as_column
 from cudf.core.column.methods import ColumnMethodsMixin
 from cudf.utils.dtypes import is_list_dtype
 
@@ -228,3 +229,39 @@ def len(self):
         dtype: int32
         """
         return self._return_or_inplace(count_elements(self._column))
+
+    def take(self, lists_indices):
+        """
+        Collect list elements based on given indices.
+
+        Parameters
+        ----------
+        lists_indices: List of non-nullable lists of position types
+                       Specifies what to collect from each row
+
+        Returns
+        -------
+        ListColumn
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
+        >>> s
+        0    [1, 2, 3]
+        1         None
+        2       [4, 5]
+        dtype: list
+        >>> s.take([[0, 1], [], []])
+        0    [1, 2]
+        1      None
+        2        []
+        dtype: list
+        """
+
+        pa_lists_indices = pa.array(lists_indices)
+        if not pa_lists_indices.type == pa.ListType or not pa.types.is_integer(pa_lists_indices.value.type) or pa_lists_indices.null_count > 0:
+            raise ValueError("lists_indices is not a list of non-nullable"
+                                "lists of position types.")
+        lists_indices_col = as_column(pa_lists_indices)
+
+        return self._return_or_inplace(segmented_gather(self._column, lists_indices_col))

From e0f3dede798b216779ecbba3d1dad90c42dfc9bc Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 15 Mar 2021 11:12:09 -0700
Subject: [PATCH 2/8] Tests

---
 python/cudf/cudf/core/column/lists.py | 33 +++++++++++------
 python/cudf/cudf/tests/test_list.py   | 52 +++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 7a84eb67af3..e74a32ab6af 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -5,10 +5,10 @@
 import pyarrow as pa
 
 import cudf
-from cudf._lib.lists import count_elements
 from cudf._lib.copying import segmented_gather
+from cudf._lib.lists import count_elements
 from cudf.core.buffer import Buffer
-from cudf.core.column import ColumnBase, column, as_column
+from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
 from cudf.utils.dtypes import is_list_dtype
 
@@ -236,7 +236,7 @@ def take(self, lists_indices):
 
         Parameters
         ----------
-        lists_indices: List of non-nullable lists of position types
+        lists_indices: List type arrays
                        Specifies what to collect from each row
 
         Returns
@@ -251,17 +251,28 @@ def take(self, lists_indices):
         1         None
         2       [4, 5]
         dtype: list
-        >>> s.take([[0, 1], [], []])
+        >>> s.list.take([[0, 1], [], []])
         0    [1, 2]
         1      None
         2        []
         dtype: list
         """
 
-        pa_lists_indices = pa.array(lists_indices)
-        if not pa_lists_indices.type == pa.ListType or not pa.types.is_integer(pa_lists_indices.value.type) or pa_lists_indices.null_count > 0:
-            raise ValueError("lists_indices is not a list of non-nullable"
-                                "lists of position types.")
-        lists_indices_col = as_column(pa_lists_indices)
-
-        return self._return_or_inplace(segmented_gather(self._column, lists_indices_col))
+        lists_indices_col = as_column(lists_indices)
+        if not isinstance(lists_indices_col, ListColumn):
+            raise ValueError("lists_indices should be list type array.")
+        if not len(lists_indices_col) == self._column.size:
+            raise ValueError(
+                "lists_indices and list column is of different " "size."
+            )
+        try:
+            res = segmented_gather(self._column, lists_indices_col)
+        except RuntimeError as e:
+            if "of index type" in str(e):
+                raise ValueError(
+                    "lists_indices should be column of "
+                    "values of index types."
+                )
+            if "contains nulls" in str(e):
+                raise ValueError("lists_indices contains null elements.")
+        return self._return_or_inplace(res)
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 195d8749ec6..7a974825e4d 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -112,3 +112,55 @@ def test_len(data):
     got = gsr.list.len()
 
     assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("data", "idx"),
+    [
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]),
+        ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]),
+        ([[1, 2, 3], []], [[0, 1], []]),
+        ([[1, 2, 3], [None]], [[0, 1], []]),
+        ([[1, None, 3], None], [[0, 1], []]),
+    ],
+)
+def test_take(data, idx):
+    ps = pd.Series(data)
+    gs = cudf.from_pandas(ps)
+
+    pdf = pd.DataFrame({"data": ps, "idx": idx})
+    expected = pdf.apply(
+        lambda x: [x["data"][i] for i in x["idx"]]
+        if x["data"] is not None
+        else None,
+        axis=1,
+    )
+
+    got = gs.list.take(idx)
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    ("invalid", "exception"),
+    [
+        ([[0]], pytest.raises(ValueError, match="different size")),
+        ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")),
+        (
+            [["a", "b"], ["c"]],
+            pytest.raises(
+                ValueError, match="should be column of values of index types"
+            ),
+        ),
+        (
+            [[[1], [0]], [[0]]],
+            pytest.raises(
+                ValueError, match="should be column of values of index types"
+            ),
+        ),
+        ([[0, 1], None], pytest.raises(ValueError, match="contains null")),
+    ],
+)
+def test_take_invalid(invalid, exception):
+    gs = cudf.Series([[0, 1], [2, 3]])
+    with exception:
+        gs.list.take(invalid)

From 457540deecff25f959ea02ad908738834f88d06b Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 15 Mar 2021 12:53:10 -0700
Subject: [PATCH 3/8] Improving preliminary checks

---
 python/cudf/cudf/core/column/lists.py | 30 +++++++++++++++------------
 python/cudf/cudf/tests/test_list.py   |  4 ++--
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index e74a32ab6af..158cf599af3 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -2,6 +2,7 @@
 
 import pickle
 
+import numpy as np
 import pyarrow as pa
 
 import cudf
@@ -10,7 +11,7 @@
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethodsMixin
-from cudf.utils.dtypes import is_list_dtype
+from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype
 
 
 class ListColumn(ColumnBase):
@@ -261,18 +262,21 @@ def take(self, lists_indices):
         lists_indices_col = as_column(lists_indices)
         if not isinstance(lists_indices_col, ListColumn):
             raise ValueError("lists_indices should be list type array.")
-        if not len(lists_indices_col) == self._column.size:
+        if not lists_indices_col.size == self._column.size:
             raise ValueError(
                 "lists_indices and list column is of different " "size."
             )
-        try:
-            res = segmented_gather(self._column, lists_indices_col)
-        except RuntimeError as e:
-            if "of index type" in str(e):
-                raise ValueError(
-                    "lists_indices should be column of "
-                    "values of index types."
-                )
-            if "contains nulls" in str(e):
-                raise ValueError("lists_indices contains null elements.")
-        return self._return_or_inplace(res)
+        if lists_indices_col.null_count > 0:
+            raise ValueError("lists_indices contains null elements.")
+        if not is_numerical_dtype(
+            lists_indices_col.children[1].dtype
+        ) or not np.issubdtype(
+            lists_indices_col.children[1].dtype, np.integer
+        ):
+            raise TypeError(
+                "lists_indices should be column of values of index types."
+            )
+
+        return self._return_or_inplace(
+            segmented_gather(self._column, lists_indices_col)
+        )
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 7a974825e4d..488ec00697b 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -148,13 +148,13 @@ def test_take(data, idx):
         (
             [["a", "b"], ["c"]],
             pytest.raises(
-                ValueError, match="should be column of values of index types"
+                TypeError, match="should be column of values of index types"
             ),
         ),
         (
             [[[1], [0]], [[0]]],
             pytest.raises(
-                ValueError, match="should be column of values of index types"
+                TypeError, match="should be column of values of index types"
             ),
         ),
         ([[0, 1], None], pytest.raises(ValueError, match="contains null")),

From 6aea4d857f8076c265e900d07b21f6402b196daf Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 15 Mar 2021 12:57:26 -0700
Subject: [PATCH 4/8] Simpler pd test file

---
 python/cudf/cudf/tests/test_list.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 488ec00697b..33812cfa7a7 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -128,14 +128,9 @@ def test_take(data, idx):
     ps = pd.Series(data)
     gs = cudf.from_pandas(ps)
 
-    pdf = pd.DataFrame({"data": ps, "idx": idx})
-    expected = pdf.apply(
-        lambda x: [x["data"][i] for i in x["idx"]]
-        if x["data"] is not None
-        else None,
-        axis=1,
+    expected = pd.Series(zip(ps, idx)).map(
+        lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None
     )
-
     got = gs.list.take(idx)
     assert_eq(expected, got)
 

From 61e9237cc568de5c7369bb5c73ff4d2d7abb52f4 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 15 Mar 2021 13:06:27 -0700
Subject: [PATCH 5/8] Cython style

---
 python/cudf/cudf/_lib/copying.pyx | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 0582f5b37c7..e5501428624 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -25,7 +25,9 @@ from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.lists.gather cimport segmented_gather as cpp_segmented_gather
+from cudf._lib.cpp.lists.gather cimport (
+    segmented_gather as cpp_segmented_gather
+)
 cimport cudf._lib.cpp.copying as cpp_copying
 
 # workaround for https://github.com/cython/cython/issues/3885
@@ -707,6 +709,7 @@ def sample(Table input, size_type n,
         )
     )
 
+
 def segmented_gather(Column source_column, Column gather_map):
     cdef shared_ptr[lists_column_view] source_LCV = (
         make_shared[lists_column_view](source_column.view())
@@ -717,8 +720,10 @@ def segmented_gather(Column source_column, Column gather_map):
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = move(cpp_segmented_gather(source_LCV.get()[0],
-                                        gather_map_LCV.get()[0]))
+        c_result = move(
+            cpp_segmented_gather(
+                source_LCV.get()[0], gather_map_LCV.get()[0])
+        )
 
     result = Column.from_unique_ptr(move(c_result))
     return result

From f331f4150b614b5291e8ece0a3df695437549f5c Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 15 Mar 2021 14:25:02 -0700
Subject: [PATCH 6/8] Rev: Fix docs bug

---
 python/cudf/cudf/core/column/lists.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 158cf599af3..222abcf1ce4 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -238,7 +238,7 @@ def take(self, lists_indices):
         Parameters
         ----------
         lists_indices: List type arrays
-                       Specifies what to collect from each row
+        Specifies what to collect from each row
 
         Returns
         -------

From cab305911dece511b2ba617fe1cae504c85e9652 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 16 Mar 2021 01:27:46 -0700
Subject: [PATCH 7/8] Rev: proper indent for docs

Co-authored-by: Keith Kraus <keith.j.kraus@gmail.com>
---
 python/cudf/cudf/core/column/lists.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 222abcf1ce4..96edc669ecb 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -238,7 +238,7 @@ def take(self, lists_indices):
         Parameters
         ----------
         lists_indices: List type arrays
-        Specifies what to collect from each row
+            Specifies what to collect from each row
 
         Returns
         -------

From 098c099865f5d99c863f5dc0cabae9fe7c0df7ca Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 17 Mar 2021 16:36:52 -0700
Subject: [PATCH 8/8] Use try block for null detection

---
 python/cudf/cudf/core/column/lists.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 222abcf1ce4..b1c3a520714 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -266,8 +266,6 @@ def take(self, lists_indices):
             raise ValueError(
                 "lists_indices and list column is of different " "size."
             )
-        if lists_indices_col.null_count > 0:
-            raise ValueError("lists_indices contains null elements.")
         if not is_numerical_dtype(
             lists_indices_col.children[1].dtype
         ) or not np.issubdtype(
@@ -277,6 +275,13 @@ def take(self, lists_indices):
                 "lists_indices should be column of values of index types."
             )
 
-        return self._return_or_inplace(
-            segmented_gather(self._column, lists_indices_col)
-        )
+        try:
+            res = self._return_or_inplace(
+                segmented_gather(self._column, lists_indices_col)
+            )
+        except RuntimeError as e:
+            if "contains nulls" in str(e):
+                raise ValueError("lists_indices contains null.") from e
+            raise
+        else:
+            return res