From ca21f5fb8ac798c4577240383fefcaca2809a1f8 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 9 Mar 2021 16:11:48 -0500
Subject: [PATCH 01/33] Make sure rmm::rmm CMake target is visible to cudf
 users (#7524)

Presume that a project is using `cudf` via CPM like the following, and the machine doesn't have cudf installed, but does have rmm.
```
CPMAddPackage(NAME  cudf
        VERSION         "0.19.0"
        GIT_REPOSITORY  https://github.com/rapidsai/cudf.git
        GIT_TAG         branch-0.19
        GIT_SHALLOW     TRUE
        SOURCE_SUBDIR   cpp
        OPTIONS         "BUILD_TESTS OFF"
                        "BUILD_BENCHMARKS OFF"
                        "ARROW_STATIC_LIB ON"
                        "JITIFY_USE_CACHE ON"
                        "CUDA_STATIC_RUNTIME ON"
                        "DISABLE_DEPRECATION_WARNING ON"
                        "AUTO_DETECT_CUDA_ARCHITECTURES ON"
    )

add_library(cudf_example cudf_example.cu)
target_link_libraries(cudf_example PRIVATE cudf::cudf)

add_library(rmm_example rmm_example.cu)
target_link_libraries(rmm_example PRIVATE rmm::rmm)
```

While CPM will fail to find `cudf`, it will find the local install of `rmm` and use it. This poses a problem because CMake imported targets have different default visibility compared to 'real' targets: an imported target is only visible in the directory in which it is created and below, whereas a 'real' target is visible globally. This means that while `cudf::cudf` can see and resolve `rmm::rmm`, the `rmm_example` library won't be able to.

This change makes it possible for users of cudf via CPM to directly access the `rmm::rmm` target.

Authors:
  - Robert Maynard (@robertmaynard)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7524
---
 cpp/cmake/thirdparty/CUDF_GetRMM.cmake | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
index b4a6a67c24d..ccefaf2ff33 100644
--- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
@@ -49,6 +49,13 @@ function(find_and_configure_rmm VERSION)
     cudf_restore_if_enabled(BUILD_TESTS)
     cudf_restore_if_enabled(BUILD_BENCHMARKS)
 
+    #Make sure consumers of cudf can also see rmm::rmm
+    if(TARGET rmm::rmm)
+        get_target_property(rmm_is_imported rmm::rmm IMPORTED)
+        if(rmm_is_imported)
+            set_target_properties(rmm::rmm PROPERTIES IMPORTED_GLOBAL TRUE)
+        endif()
+    endif()
     if(NOT rmm_BINARY_DIR IN_LIST CMAKE_PREFIX_PATH)
         list(APPEND CMAKE_PREFIX_PATH "${rmm_BINARY_DIR}")
         set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} PARENT_SCOPE)

From 2c9ef52e0ec876b3d67e01b4813345165945be7d Mon Sep 17 00:00:00 2001
From: Dillon Cullinan <dcullinan92@gmail.com>
Date: Tue, 9 Mar 2021 16:49:58 -0500
Subject: [PATCH 02/33] FIX Retry conda output location (#7540)

Utilize `gpuci_conda_retry` when grabbing file output locations. These commands can still result in `JSON Decode` errors, which are typically rerun in other conda build calls. `gpuci_conda_retry` outputs to `stderr` so that any failure messages will not be incorrectly stored in the variable.

This PR also stops showing the upload progress in branch builds. Over time, the upload progress output has become much more spammy and annoying in CI build outputs.

Authors:
  - Dillon Cullinan (@dillon-cullinan)

Approvers:
  - Ray Douglass (@raydouglass)
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7540
---
 ci/cpu/upload.sh | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
index ca8ee1d75ac..d75e45dc406 100755
--- a/ci/cpu/upload.sh
+++ b/ci/cpu/upload.sh
@@ -28,12 +28,13 @@ fi
 ################################################################################
 
 gpuci_logger "Get conda file output locations"
-export LIBCUDF_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf --output`
-export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf_kafka --output`
-export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output`
-export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output`
-export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output`
-export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output`
+
+export LIBCUDF_FILE=`gpuci_conda_retry build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf --output`
+export LIBCUDF_KAFKA_FILE=`gpuci_conda_retry build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf_kafka --output`
+export CUDF_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output`
+export DASK_CUDF_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output`
+export CUDF_KAFKA_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output`
+export CUSTREAMZ_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output`
 
 ################################################################################
 # UPLOAD - Conda packages
@@ -44,36 +45,36 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then
   test -e ${LIBCUDF_FILE}
   echo "Upload libcudf"
   echo ${LIBCUDF_FILE}
-  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_FILE}
+  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_FILE}
 fi
 
 if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then
   test -e ${CUDF_FILE}
   echo "Upload cudf"
   echo ${CUDF_FILE}
-  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE}
+  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE}
 
   test -e ${DASK_CUDF_FILE}
   echo "Upload dask-cudf"
   echo ${DASK_CUDF_FILE}
-  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE}
+  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE}
 
   test -e ${CUSTREAMZ_FILE}
   echo "Upload custreamz"
   echo ${CUSTREAMZ_FILE}
-  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUSTREAMZ_FILE}
+  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUSTREAMZ_FILE}
 fi
 
 if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then
   test -e ${LIBCUDF_KAFKA_FILE}
   echo "Upload libcudf_kafka"
   echo ${LIBCUDF_KAFKA_FILE}
-  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_KAFKA_FILE}
+  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_KAFKA_FILE}
 fi
 
 if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF_KAFKA" == "1" ]]; then
   test -e ${CUDF_KAFKA_FILE}
   echo "Upload cudf_kafka"
   echo ${CUDF_KAFKA_FILE}
-  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_KAFKA_FILE}
+  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_KAFKA_FILE}
 fi

From 850548d9b705b999c7010ded1e2d7573820228e6 Mon Sep 17 00:00:00 2001
From: Raza Jafri <razajafri@users.noreply.github.com>
Date: Tue, 9 Mar 2021 14:34:55 -0800
Subject: [PATCH 03/33] Decimal32 Build Fix (#7544)

This is a replacement of #7542 which was held up because the changes weren't being reflected on it.

Authors:
  - Raza Jafri (@razajafri)

Approvers:
  - Gera Shegalov (@gerashegalov)
  - Niranjan Artal (@nartal1)

URL: https://github.com/rapidsai/cudf/pull/7544
---
 java/src/main/java/ai/rapids/cudf/ColumnView.java       | 8 ++++++--
 java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java | 9 ++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index e0cc96263b3..f36896a3c96 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -1394,9 +1394,13 @@ public ColumnView replaceChildrenWithViews(int[] indices,
     List<ColumnView> newChildren = new ArrayList<>(getNumChildren());
     IntStream.range(0, getNumChildren()).forEach(i -> {
       ColumnView view = map.remove(i);
+      ColumnView child = getChildColumnView(i);
       if (view == null) {
-        newChildren.add(getChildColumnView(i));
+        newChildren.add(child);
       } else {
+        if (child.getRowCount() != view.getRowCount()) {
+          throw new IllegalArgumentException("Child row count doesn't match the old child");
+        }
         newChildren.add(view);
       }
     });
@@ -1431,7 +1435,7 @@ public ColumnView replaceChildrenWithViews(int[] indices,
    */
   public ColumnView replaceListChild(ColumnView child) {
     assert(type == DType.LIST);
-    return replaceChildrenWithViews(new int[]{1}, new ColumnView[]{child});
+    return replaceChildrenWithViews(new int[]{0}, new ColumnView[]{child});
   }
 
   /**
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 0675ece4863..d224543e574 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4005,7 +4005,7 @@ void testReplaceLeafNodeInList() {
 
   @Test
   void testReplaceLeafNodeInListWithIllegal() {
-    assertThrows(IllegalArgumentException.class, () -> {
+    Exception e = assertThrows(IllegalArgumentException.class, () -> {
       try (ColumnVector child1 =
                ColumnVector.decimalFromDoubles(DType.create(DType.DTypeEnum.DECIMAL64, 3),
                    RoundingMode.HALF_UP, 770.892, 961.110);
@@ -4023,6 +4023,7 @@ void testReplaceLeafNodeInListWithIllegal() {
            ColumnView replacedView = created.replaceListChild(newChild)) {
       }
     });
+    assertTrue(e.getMessage().contains("Child row count doesn't match the old child"));
   }
 
   @Test
@@ -4049,7 +4050,7 @@ void testReplaceColumnInStruct() {
 
   @Test
   void testReplaceIllegalIndexColumnInStruct() {
-    assertThrows(IllegalArgumentException.class, () -> {
+    Exception e = assertThrows(IllegalArgumentException.class, () -> {
       try (ColumnVector child1 = ColumnVector.fromInts(1, 4);
            ColumnVector child2 = ColumnVector.fromInts(2, 5);
            ColumnVector child3 = ColumnVector.fromInts(3, 6);
@@ -4059,11 +4060,12 @@ void testReplaceIllegalIndexColumnInStruct() {
                new ColumnVector[]{replaceWith})) {
       }
     });
+    assertTrue(e.getMessage().contains("One or more invalid child indices passed to be replaced"));
   }
 
   @Test
   void testReplaceSameIndexColumnInStruct() {
-    assertThrows(IllegalArgumentException.class, () -> {
+    Exception e = assertThrows(IllegalArgumentException.class, () -> {
       try (ColumnVector child1 = ColumnVector.fromInts(1, 4);
            ColumnVector child2 = ColumnVector.fromInts(2, 5);
            ColumnVector child3 = ColumnVector.fromInts(3, 6);
@@ -4073,5 +4075,6 @@ void testReplaceSameIndexColumnInStruct() {
                new ColumnVector[]{replaceWith, replaceWith})) {
       }
     });
+    assertTrue(e.getMessage().contains("Duplicate mapping found for replacing child index"));
   }
 }

From 2e4b5a61f7b27a796d6eaa7a11f7cdfc5b9fbb77 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 10 Mar 2021 01:17:44 -0600
Subject: [PATCH 04/33] Update missing docstring examples in python public APIs
 (#7546)

Fully resolves: #5290

This PR:

- [x] Adds missing docstring examples in python public APIs.
- [x] Adds some missing alias APIs.
- [x] Fixes issue in `Series.take` where the index was not correctly being removed when `keep_index=False`.
- [x] **Removes** `Series.values_to_string`. This API seems to have last been touched 4 years ago, and since we have removed support for iterating over GPU objects, it seemed best to remove this API altogether.

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7546
---
 python/cudf/cudf/core/dataframe.py      |  301 ++++-
 python/cudf/cudf/core/index.py          |   45 +-
 python/cudf/cudf/core/series.py         | 1530 ++++++++++++++++++++++-
 python/cudf/cudf/tests/test_indexing.py |   26 +-
 4 files changed, 1845 insertions(+), 57 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 18a7f052d62..5ab058ff495 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -583,7 +583,32 @@ def deserialize(cls, header, frames):
 
     @property
     def dtypes(self):
-        """Return the dtypes in this object."""
+        """
+        Return the dtypes in this object.
+
+        Returns
+        -------
+        pandas.Series
+            The data type of each column.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import pandas as pd
+        >>> df = cudf.DataFrame({'float': [1.0],
+        ...                    'int': [1],
+        ...                    'datetime': [pd.Timestamp('20180310')],
+        ...                    'string': ['foo']})
+        >>> df
+           float  int   datetime string
+        0    1.0    1 2018-03-10    foo
+        >>> df.dtypes
+        float              float64
+        int                  int64
+        datetime    datetime64[us]
+        string              object
+        dtype: object
+        """
         return cudf.utils.utils._create_pandas_series(
             data=[x.dtype for x in self._data.columns], index=self._data.names,
         )
@@ -1133,6 +1158,39 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):
         Returns
         -------
         casted : DataFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]})
+        >>> df
+            a  b
+        0  10  1
+        1  20  2
+        2  30  3
+        >>> df.dtypes
+        a    int64
+        b    int64
+        dtype: object
+
+        Cast all columns to `int32`:
+
+        >>> df.astype('int32').dtypes
+        a    int32
+        b    int32
+        dtype: object
+
+        Cast `a` to `float32` using a dictionary:
+
+        >>> df.astype({'a': 'float32'}).dtypes
+        a    float32
+        b      int64
+        dtype: object
+        >>> df.astype({'a': 'float32'})
+              a  b
+        0  10.0  1
+        1  20.0  2
+        2  30.0  3
         """
         result = DataFrame(index=self.index)
 
@@ -3360,7 +3418,71 @@ def drop_duplicates(
         """
         Return DataFrame with duplicate rows removed, optionally only
         considering certain subset of columns.
-        """
+
+        Parameters
+        ----------
+        subset : column label or sequence of labels, optional
+            Only consider certain columns for identifying duplicates, by
+            default use all of the columns.
+        keep : {'first', 'last', False}, default 'first'
+            Determines which duplicates (if any) to keep.
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        inplace : bool, default False
+            Whether to drop duplicates in place or to return a copy.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with duplicates removed or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({
+        ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
+        ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
+        ...     'rating': [4, 4, 3.5, 15, 5]
+        ... })
+        >>> df
+             brand style  rating
+        0  Yum Yum   cup     4.0
+        1  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        3  Indomie  pack    15.0
+        4  Indomie  pack     5.0
+
+        By default, it removes duplicate rows based
+        on all columns. Note that order of
+        the rows being returned is not guaranteed
+        to be sorted.
+
+        >>> df.drop_duplicates()
+             brand style  rating
+        2  Indomie   cup     3.5
+        4  Indomie  pack     5.0
+        3  Indomie  pack    15.0
+        0  Yum Yum   cup     4.0
+
+        To remove duplicates on specific column(s),
+        use `subset`.
+
+        >>> df.drop_duplicates(subset=['brand'])
+             brand style  rating
+        2  Indomie   cup     3.5
+        0  Yum Yum   cup     4.0
+
+        To remove duplicates and keep last occurrences, use `keep`.
+
+        >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
+             brand style  rating
+        2  Indomie   cup     3.5
+        4  Indomie  pack     5.0
+        1  Yum Yum   cup     4.0
+        """  # noqa: E501
         outdf = super().drop_duplicates(
             subset=subset, keep=keep, ignore_index=ignore_index
         )
@@ -3439,6 +3561,32 @@ def rename(
 
         Rename will not overwite column names. If a list with duplicates is
         passed, column names will be postfixed with a number.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        >>> df
+           A  B
+        0  1  4
+        1  2  5
+        2  3  6
+
+        Rename columns using a mapping:
+
+        >>> df.rename(columns={"A": "a", "B": "c"})
+           a  c
+        0  1  4
+        1  2  5
+        2  3  6
+
+        Rename index using a mapping:
+
+        >>> df.rename(index={0: 10, 1: 20, 2: 30})
+            A  B
+        10  1  4
+        20  2  5
+        30  3  6
         """
         if errors != "ignore":
             raise NotImplementedError(
@@ -3663,6 +3811,21 @@ def label_encoding(
         Returns
         -------
         a new dataframe with a new column append for the coded values.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]})
+        >>> df
+           a   b
+        0  1  10
+        1  2  10
+        2  3  20
+        >>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20])
+           a   b  b_col_labels
+        0  1  10             0
+        1  2  10             0
+        2  3  20             1
         """
 
         newname = prefix_sep.join([prefix, "labels"])
@@ -3992,20 +4155,131 @@ def agg(self, aggs, axis=None):
     def nlargest(self, n, columns, keep="first"):
         """Get the rows of the DataFrame sorted by the n largest value of *columns*
 
+        Parameters
+        ----------
+        n : int
+            Number of rows to return.
+        columns : label or list of labels
+            Column label(s) to order by.
+        keep : {'first', 'last'}, default 'first'
+            Where there are duplicate values:
+
+            - `first` : prioritize the first occurrence(s)
+            - `last` : prioritize the last occurrence(s)
+
+        Returns
+        -------
+        DataFrame
+            The first `n` rows ordered by the given columns in descending
+            order.
+
         Notes
         -----
         Difference from pandas:
             - Only a single column is supported in *columns*
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000,
+        ...                                   434000, 434000, 337000, 11300,
+        ...                                   11300, 11300],
+        ...                    'GDP': [1937894, 2583560 , 12011, 4520, 12128,
+        ...                            17036, 182, 38, 311],
+        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+        ...                                "IS", "NR", "TV", "AI"]},
+        ...                   index=["Italy", "France", "Malta",
+        ...                          "Maldives", "Brunei", "Iceland",
+        ...                          "Nauru", "Tuvalu", "Anguilla"])
+        >>> df
+                  population      GDP alpha-2
+        Italy       59000000  1937894      IT
+        France      65000000  2583560      FR
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+        Iceland       337000    17036      IS
+        Nauru          11300      182      NR
+        Tuvalu         11300       38      TV
+        Anguilla       11300      311      AI
+        >>> df.nlargest(3, 'population')
+                population      GDP alpha-2
+        France    65000000  2583560      FR
+        Italy     59000000  1937894      IT
+        Malta       434000    12011      MT
+        >>> df.nlargest(3, 'population', keep='last')
+                population      GDP alpha-2
+        France    65000000  2583560      FR
+        Italy     59000000  1937894      IT
+        Brunei      434000    12128      BN
         """
         return self._n_largest_or_smallest("nlargest", n, columns, keep)
 
     def nsmallest(self, n, columns, keep="first"):
         """Get the rows of the DataFrame sorted by the n smallest value of *columns*
 
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve.
+        columns : list or str
+            Column name or names to order by.
+        keep : {'first', 'last'}, default 'first'
+            Where there are duplicate values:
+
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
+
+        Returns
+        -------
+        DataFrame
+
         Notes
         -----
         Difference from pandas:
             - Only a single column is supported in *columns*
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000,
+        ...                                   434000, 434000, 337000, 337000,
+        ...                                   11300, 11300],
+        ...                    'GDP': [1937894, 2583560 , 12011, 4520, 12128,
+        ...                            17036, 182, 38, 311],
+        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
+        ...                                "IS", "NR", "TV", "AI"]},
+        ...                   index=["Italy", "France", "Malta",
+        ...                          "Maldives", "Brunei", "Iceland",
+        ...                          "Nauru", "Tuvalu", "Anguilla"])
+        >>> df
+                  population      GDP alpha-2
+        Italy       59000000  1937894      IT
+        France      65000000  2583560      FR
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+        Iceland       337000    17036      IS
+        Nauru         337000      182      NR
+        Tuvalu         11300       38      TV
+        Anguilla       11300      311      AI
+
+        In the following example, we will use ``nsmallest`` to select the
+        three rows having the smallest values in column "population".
+
+        >>> df.nsmallest(3, 'population')
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+
+        When using ``keep='last'``, ties are resolved in reverse order:
+
+        >>> df.nsmallest(3, 'population', keep='last')
+                  population  GDP alpha-2
+        Anguilla       11300  311      AI
+        Tuvalu         11300   38      TV
+        Nauru         337000  182      NR
         """
         return self._n_largest_or_smallest("nsmallest", n, columns, keep)
 
@@ -5608,7 +5882,28 @@ def quantile(
         non-numeric types and result is expected to be a Series in case of
         Pandas. cuDF will return a DataFrame as it doesn't support mixed
         types under Series.
-        """
+
+        Examples
+        --------
+        >>> import cupy as cp
+        >>> import cudf
+        >>> df = cudf.DataFrame(cp.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
+        ...                   columns=['a', 'b'])
+        >>> df
+           a    b
+        0  1    1
+        1  2   10
+        2  3  100
+        3  4  100
+        >>> df.quantile(0.1)
+        a    1.3
+        b    3.7
+        Name: 0.1, dtype: float64
+        >>> df.quantile([.1, .5])
+               a     b
+        0.1  1.3   3.7
+        0.5  2.5  55.0
+        """  # noqa: E501
         if axis not in (0, None):
             raise NotImplementedError("axis is not implemented yet")
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 8c86352b2a7..2a5d2647e95 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+
 from __future__ import annotations, division, print_function
 
 import pickle
@@ -156,7 +157,16 @@ def drop_duplicates(self, keep="first"):
         Returns
         -------
         deduplicated : Index
-        """
+
+        Examples
+        --------
+        >>> import cudf
+        >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
+        >>> idx
+        StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object')
+        >>> idx.drop_duplicates()
+        StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object')
+        """  # noqa: E501
         return super().drop_duplicates(keep=keep)
 
     @property
@@ -1169,6 +1179,19 @@ def rename(self, name, inplace=False):
         -------
         Index
 
+        Examples
+        --------
+        >>> import cudf
+        >>> index = cudf.Index([1, 2, 3], name='one')
+        >>> index
+        Int64Index([1, 2, 3], dtype='int64', name='one')
+        >>> index.name
+        'one'
+        >>> renamed_index = index.rename('two')
+        >>> renamed_index
+        Int64Index([1, 2, 3], dtype='int64', name='two')
+        >>> renamed_index.name
+        'two'
         """
         if inplace is True:
             self.name = name
@@ -1198,6 +1221,15 @@ def astype(self, dtype, copy=False):
         -------
         Index
             Index with values cast to specified dtype.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> index = cudf.Index([1, 2, 3])
+        >>> index
+        Int64Index([1, 2, 3], dtype='int64')
+        >>> index.astype('float64')
+        Float64Index([1.0, 2.0, 3.0], dtype='float64')
         """
         if pd.api.types.is_dtype_equal(dtype, self.dtype):
             return self.copy(deep=copy)
@@ -1290,6 +1322,15 @@ def empty(self):
         -------
         out : bool
             If Index is empty, return True, if not return False.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> index = cudf.Index([])
+        >>> index
+        Float64Index([], dtype='float64')
+        >>> index.empty
+        True
         """
         return not self.size
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index f80c6a9b452..2a990eef32e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
+
 from __future__ import annotations
 
 import pickle
@@ -79,8 +80,37 @@ def _constructor_expanddim(self):
     def from_categorical(cls, categorical, codes=None):
         """Creates from a pandas.Categorical
 
-        If ``codes`` is defined, use it instead of ``categorical.codes``
-        """
+        Parameters
+        ----------
+        categorical : pandas.Categorical
+            Contains data stored in a pandas Categorical.
+
+        codes : array-like, optional.
+            The category codes of this categorical. If ``codes`` are
+            defined, they are used instead of ``categorical.codes``
+
+        Returns
+        -------
+        Series
+            A cudf categorical series.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import pandas as pd
+        >>> pd_categorical = pd.Categorical(pd.Series(['a', 'b', 'c', 'a'], dtype='category'))
+        >>> pd_categorical
+        ['a', 'b', 'c', 'a']
+        Categories (3, object): ['a', 'b', 'c']
+        >>> series = cudf.Series.from_categorical(pd_categorical)
+        >>> series
+        0    a
+        1    b
+        2    c
+        3    a
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']
+        """  # noqa: E501
         col = cudf.core.column.categorical.pandas_categorical_as_column(
             categorical, codes=codes
         )
@@ -106,6 +136,31 @@ def from_masked_array(cls, data, mask, null_count=None):
         null_count : int, optional
             The number of null values.
             If None, it is calculated automatically.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 4, None])
+        >>> a
+        0       1
+        1       2
+        2       3
+        3    <NA>
+        4       4
+        5    <NA>
+        dtype: int64
+        >>> b = cudf.Series([10, 11, 12, 13, 14])
+        >>> cudf.Series.from_masked_array(data=b, mask=a._column.mask)
+        0      10
+        1      11
+        2      12
+        3    <NA>
+        4      14
+        dtype: int64
         """
         col = column.as_column(data).set_mask(mask)
         return cls(data=col)
@@ -595,13 +650,74 @@ def reindex(self, index=None, copy=True):
         Returns
         -------
         A new Series that conforms to the supplied index
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
+        >>> series
+        a    10
+        b    20
+        c    30
+        d    40
+        dtype: int64
+        >>> series.reindex(['a', 'b', 'y', 'z'])
+        a      10
+        b      20
+        y    <NA>
+        z    <NA>
+        dtype: int64
         """
         name = self.name or 0
         idx = self._index if index is None else index
-        return self.to_frame(name).reindex(idx, copy=copy)[name]
+        series = self.to_frame(name).reindex(idx, copy=copy)[name]
+        series.name = self.name
+        return series
 
     def reset_index(self, drop=False, inplace=False):
-        """ Reset index to RangeIndex """
+        """
+        Reset index to RangeIndex
+
+        Parameters
+        ----------
+        drop : bool, default False
+            Just reset the index, without inserting it as a column in
+            the new DataFrame.
+        inplace : bool, default False
+            Modify the Series in place (do not create a new object).
+
+        Returns
+        -------
+        Series or DataFrame or None
+            When `drop` is False (the default), a DataFrame is returned.
+            The newly created columns will come first in the DataFrame,
+            followed by the original Series values.
+            When `drop` is True, a `Series` is returned.
+            In either case, if ``inplace=True``, no value is returned.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13])
+        >>> series
+        10    a
+        11    b
+        12    c
+        13    d
+        dtype: object
+        >>> series.reset_index()
+           index  0
+        0     10  a
+        1     11  b
+        2     12  c
+        3     13  d
+        >>> series.reset_index(drop=True)
+        0    a
+        1    b
+        2    c
+        3    d
+        dtype: object
+        """
         if not drop:
             if inplace is True:
                 raise TypeError(
@@ -622,6 +738,30 @@ def set_index(self, index):
         ----------
         index : Index, Series-convertible
             the new index or values for the new index
+
+        Returns
+        -------
+        Series
+            A new Series with assigned index.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 11, 12, 13, 14])
+        >>> series
+        0    10
+        1    11
+        2    12
+        3    13
+        4    14
+        dtype: int64
+        >>> series.set_index(['a', 'b', 'c', 'd', 'e'])
+        a    10
+        b    11
+        c    12
+        d    13
+        e    14
+        dtype: int64
         """
         index = index if isinstance(index, Index) else as_index(index)
         return self._copy_construct(index=index)
@@ -657,7 +797,26 @@ def to_frame(self, name=None):
         -------
         DataFrame
             cudf DataFrame
-        """
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series(['a', 'b', 'c', None, 'd'], name='sample', index=[10, 11, 12, 13, 15])
+        >>> series
+        10       a
+        11       b
+        12       c
+        13    <NA>
+        15       d
+        Name: sample, dtype: object
+        >>> series.to_frame()
+           sample
+        10      a
+        11      b
+        12      c
+        13   <NA>
+        15      d
+        """  # noqa: E501
 
         if name is not None:
             col = name
@@ -684,6 +843,38 @@ def set_mask(self, mask, null_count=None):
         null_count : int, optional
             The number of null values.
             If None, it is calculated automatically.
+
+        Returns
+        -------
+        Series
+            A new series with the applied mask.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4, 5])
+        >>> ref_array = cudf.Series([10, None, 11, None, 16])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        dtype: int64
+        >>> ref_array
+        0      10
+        1    <NA>
+        2      11
+        3    <NA>
+        4      16
+        dtype: int64
+        >>> series.set_mask(ref_array._column.mask)
+        0       1
+        1    <NA>
+        2       3
+        3    <NA>
+        4       5
+        dtype: int64
         """
         col = self._column.set_mask(mask)
         return self._copy_construct(data=col)
@@ -894,14 +1085,49 @@ def __setitem__(self, key, value):
             self.loc[key] = value
 
     def take(self, indices, keep_index=True):
-        """Return Series by taking values from the corresponding *indices*.
+        """
+        Return Series by taking values from the corresponding *indices*.
+
+        Parameters
+        ----------
+        indices : array-like or scalar
+            Integer position(s) to take, given as an array-like or a scalar.
+        keep_index : bool, default True
+            Whether to retain the index in result Series or not.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 11, 12, 13, 14])
+        >>> series
+        0    10
+        1    11
+        2    12
+        3    13
+        4    14
+        dtype: int64
+        >>> series.take([0, 4])
+        0    10
+        4    14
+        dtype: int64
+
+        If you want to drop the index, pass `keep_index=False`
+
+        >>> series.take([0, 4], keep_index=False)
+        0    10
+        1    14
+        dtype: int64
         """
         if keep_index is True or is_scalar(indices):
             return self.iloc[indices]
         else:
             col_inds = as_column(indices)
             data = self._column.take(col_inds, keep_index=False)
-            return self._copy_construct(data=data)
+            return self._copy_construct(data=data, index=None)
 
     def __bool__(self):
         """Always raise TypeError when converting a Series
@@ -909,16 +1135,6 @@ def __bool__(self):
         """
         raise TypeError(f"can't compute boolean for {type(self)}")
 
-    def values_to_string(self, nrows=None):
-        """Returns a list of string for each element.
-        """
-        values = self[:nrows]
-        if self.dtype == np.dtype("object"):
-            out = [str(v) for v in values]
-        else:
-            out = ["" if v is None else str(v) for v in values]
-        return out
-
     def tolist(self):
 
         raise TypeError(
@@ -1022,7 +1238,26 @@ def to_string(self):
         Uses Pandas formatting internals to produce output identical to Pandas.
         Use the Pandas formatting settings directly in Pandas to control cuDF
         output.
-        """
+
+        Returns
+        -------
+        str
+            String representation of Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series(['a', None, 'b', 'c', None])
+        >>> series
+        0       a
+        1    <NA>
+        2       b
+        3       c
+        4    <NA>
+        dtype: object
+        >>> series.to_string()
+        '0       a\\n1    <NA>\\n2       b\\n3       c\\n4    <NA>\\ndtype: object'
+        """  # noqa: E501
         return self.__repr__()
 
     def __str__(self):
@@ -1189,7 +1424,8 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False):
         return result
 
     def add(self, other, fill_value=None, axis=0):
-        """Addition of series and other, element-wise
+        """
+        Addition of series and other, element-wise
         (binary operator add).
 
         Parameters
@@ -1198,6 +1434,43 @@ def add(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the addition.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       1
+        c       1
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       1
+        e    <NA>
+        dtype: int64
+        >>> a.add(b)
+        a       2
+        b    <NA>
+        c    <NA>
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.add(b, fill_value=0)
+        a       2
+        b       1
+        c       1
+        d       1
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1216,6 +1489,36 @@ def radd(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       2
+        c       3
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       1
+        e    <NA>
+        dtype: int64
+        >>> a.radd(b, fill_value=0)
+        a       2
+        b       2
+        c       3
+        d       1
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1226,7 +1529,7 @@ def radd(self, other, fill_value=None, axis=0):
     def __radd__(self, other):
         return self._binaryop(other, "add", reflect=True)
 
-    def sub(self, other, fill_value=None, axis=0):
+    def subtract(self, other, fill_value=None, axis=0):
         """Subtraction of series and other, element-wise
         (binary operator sub).
 
@@ -1236,11 +1539,44 @@ def sub(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([10, 20, None, 30, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> a
+        a      10
+        b      20
+        c    <NA>
+        d      30
+        e    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 2, 30], index=['a', 'c', 'b', 'd'])
+        >>> b
+        a       1
+        c    <NA>
+        b       2
+        d      30
+        dtype: int64
+        >>> a.subtract(b, fill_value=2)
+        a       9
+        b      18
+        c    <NA>
+        d       0
+        e    <NA>
+        dtype: int64
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
         return self._binaryop(other, "sub", fill_value)
 
+    sub = subtract
+
     def __sub__(self, other):
         return self._binaryop(other, "sub")
 
@@ -1254,6 +1590,36 @@ def rsub(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       2
+        c       3
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       2
+        e    <NA>
+        dtype: int64
+        >>> a.rsub(b, fill_value=10)
+        a       0
+        b       8
+        c       7
+        d      -8
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1262,7 +1628,7 @@ def rsub(self, other, fill_value=None, axis=0):
     def __rsub__(self, other):
         return self._binaryop(other, "sub", reflect=True)
 
-    def mul(self, other, fill_value=None, axis=0):
+    def multiply(self, other, fill_value=None, axis=0):
         """Multiplication of series and other, element-wise
         (binary operator mul).
 
@@ -1272,11 +1638,43 @@ def mul(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       2
+        c       3
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       2
+        e    <NA>
+        dtype: int64
+        >>> a.multiply(b, fill_value=0)
+        a       1
+        b       0
+        c       0
+        d       0
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
         return self._binaryop(other, "mul", fill_value=fill_value)
 
+    mul = multiply
+
     def __mul__(self, other):
         return self._binaryop(other, "mul")
 
@@ -1290,7 +1688,40 @@ def rmul(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e'])
+        >>> a
+        a      10
+        b      20
+        c    <NA>
+        d      30
+        e      40
+        dtype: int64
+        >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f'])
+        >>> b
+        a    <NA>
+        b       1
+        d      20
+        e       5
+        f       4
+        dtype: int64
+        >>> a.rmul(b, fill_value=2)
+        a      20
+        b      20
+        c    <NA>
+        d     600
+        e     200
+        f       8
+        dtype: int64
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
         return self._binaryop(other, "mul", fill_value, True)
@@ -1308,6 +1739,26 @@ def mod(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 20, 30])
+        >>> series
+        0    10
+        1    20
+        2    30
+        dtype: int64
+        >>> series.mod(4)
+        0    2
+        1    0
+        2    2
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1326,7 +1777,40 @@ def rmod(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e'])
+        >>> a
+        a      10
+        b      20
+        c    <NA>
+        d      30
+        e      40
+        dtype: int64
+        >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f'])
+        >>> b
+        a    <NA>
+        b       1
+        d      20
+        e       5
+        f       4
+        dtype: int64
+        >>> a.rmod(b, fill_value=10)
+        a       0
+        b       1
+        c    <NA>
+        d      20
+        e       5
+        f       4
+        dtype: int64
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
         return self._binaryop(other, "mod", fill_value, True)
@@ -1344,6 +1828,36 @@ def pow(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       2
+        c       3
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a      10
+        b    <NA>
+        d      12
+        e    <NA>
+        dtype: int64
+        >>> a.pow(b, fill_value=0)
+        a       1
+        b       1
+        c       1
+        d       0
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1362,6 +1876,36 @@ def rpow(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       2
+        c       3
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a      10
+        b    <NA>
+        d      12
+        e    <NA>
+        dtype: int64
+        >>> a.rpow(b, fill_value=0)
+        a      10
+        b       0
+        c       0
+        d       1
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1380,6 +1924,36 @@ def floordiv(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b       1
+        c       1
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       1
+        e    <NA>
+        dtype: int64
+        >>> a.floordiv(b)
+        a       1
+        b    <NA>
+        c    <NA>
+        d    <NA>
+        e    <NA>
+        dtype: int64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1454,6 +2028,36 @@ def truediv(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 10, 20, None], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a       1
+        b      10
+        c      20
+        d    <NA>
+        dtype: int64
+        >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       2
+        e    <NA>
+        dtype: int64
+        >>> a.truediv(b, fill_value=0)
+        a     1.0
+        b     Inf
+        c     Inf
+        d     0.0
+        e    <NA>
+        dtype: float64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1472,6 +2076,36 @@ def rtruediv(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([10, 20, None, 30], index=['a', 'b', 'c', 'd'])
+        >>> a
+        a      10
+        b      20
+        c    <NA>
+        d      30
+        dtype: int64
+        >>> b = cudf.Series([1, None, 2, 3], index=['a', 'b', 'd', 'e'])
+        >>> b
+        a       1
+        b    <NA>
+        d       2
+        e       3
+        dtype: int64
+        >>> a.rtruediv(b, fill_value=0)
+        a            0.1
+        b            0.0
+        c           <NA>
+        d    0.066666667
+        e            Inf
+        dtype: float64
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
@@ -1783,6 +2417,22 @@ def has_nulls(self):
         out : bool
             If Series has atleast one null value, return True, if not
             return False.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, None, 3, 4])
+        >>> series
+        0       1
+        1       2
+        2    <NA>
+        3       3
+        4       4
+        dtype: int64
+        >>> series.has_nulls
+        True
+        >>> series.dropna().has_nulls
+        False
         """
         return self._column.has_nulls
 
@@ -1867,7 +2517,72 @@ def dropna(self, axis=0, inplace=False, how=None):
 
     def drop_duplicates(self, keep="first", inplace=False, ignore_index=False):
         """
-        Return Series with duplicate values removed
+        Return Series with duplicate values removed.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', ``False``}, default 'first'
+            Method to handle dropping duplicates:
+
+            - 'first' : Drop duplicates except for the first occurrence.
+            - 'last' : Drop duplicates except for the last occurrence.
+            - ``False`` : Drop all duplicates.
+
+        inplace : bool, default ``False``
+            If ``True``, performs operation inplace and returns None.
+
+        Returns
+        -------
+        Series or None
+            Series with duplicates dropped or None if ``inplace=True``.
+
+        Examples
+        --------
+        >>> s = cudf.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],
+        ...               name='animal')
+        >>> s
+        0      lama
+        1       cow
+        2      lama
+        3    beetle
+        4      lama
+        5     hippo
+        Name: animal, dtype: object
+
+        With the `keep` parameter, the selection behaviour of duplicated
+        values can be changed. The value 'first' keeps the first
+        occurrence for each set of duplicated entries.
+        The default value of keep is 'first'. Note that order of
+        the rows being returned is not guaranteed
+        to be sorted.
+
+        >>> s.drop_duplicates()
+        3    beetle
+        1       cow
+        5     hippo
+        0      lama
+        Name: animal, dtype: object
+
+        The value 'last' for parameter `keep` keeps the last occurrence
+        for each set of duplicated entries.
+
+        >>> s.drop_duplicates(keep='last')
+        3    beetle
+        1       cow
+        5     hippo
+        4      lama
+        Name: animal, dtype: object
+
+        The value `False` for parameter `keep` discards all sets
+        of duplicated entries. Setting the value of `inplace` to
+        `True` performs the operation inplace and returns `None`.
+
+        >>> s.drop_duplicates(keep=False, inplace=True)
+        >>> s
+        3    beetle
+        1       cow
+        5     hippo
+        Name: animal, dtype: object
         """
         result = super().drop_duplicates(keep=keep, ignore_index=ignore_index)
 
@@ -1909,17 +2624,62 @@ def to_array(self, fillna=None):
             If it equals "pandas", null values are filled with NaNs.
             Non integral dtype is promoted to np.float64.
 
+        Returns
+        -------
+        numpy.ndarray
+            A numpy array representation of the elements in the Series.
+
         Notes
         -----
-
         If ``fillna`` is ``None``, null values are skipped.  Therefore, the
         output size could be smaller.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 11, 12, 13, 14])
+        >>> series
+        0    10
+        1    11
+        2    12
+        3    13
+        4    14
+        dtype: int64
+        >>> array = series.to_array()
+        >>> array
+        array([10, 11, 12, 13, 14])
+        >>> type(array)
+        <class 'numpy.ndarray'>
         """
         return self._column.to_array(fillna=fillna)
 
     def nans_to_nulls(self):
         """
         Convert nans (if any) to nulls
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False)
+        >>> series
+        0     1.0
+        1     2.0
+        2     NaN
+        3    <NA>
+        4    10.0
+        dtype: float64
+        >>> series.nans_to_nulls()
+        0     1.0
+        1     2.0
+        2    <NA>
+        3    <NA>
+        4    10.0
+        dtype: float64
         """
         result_col = self._column.nans_to_nulls()
         return self._copy_construct(data=result_col)
@@ -2040,6 +2800,24 @@ def to_gpu_array(self, fillna=None):
 
         if ``fillna`` is ``None``, null values are skipped.  Therefore, the
         output size could be smaller.
+
+        Returns
+        -------
+        numba DeviceNDArray
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([10, 20, 30, 40, 50])
+        >>> s
+        0    10
+        1    20
+        2    30
+        3    40
+        4    50
+        dtype: int64
+        >>> s.to_gpu_array()
+        <numba.cuda.cudadrv.devicearray.DeviceNDArray object at 0x7f1840858890>
         """
         return self._column.to_gpu_array(fillna=fillna)
 
@@ -2110,7 +2888,27 @@ def to_pandas(self, index=True, nullable=False, **kwargs):
     @property
     def data(self):
         """The gpu buffer for the data
-        """
+
+        Returns
+        -------
+        out : The GPU buffer of the Series.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        dtype: int64
+        >>> series.data
+        <cudf.core.buffer.Buffer object at 0x7f23c192d110>
+        >>> series.data.to_host_array()
+        array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
+               0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)
+        """  # noqa: E501
         return self._column.data
 
     @property
@@ -2131,6 +2929,18 @@ def loc(self):
         See also
         --------
         cudf.core.dataframe.DataFrame.loc
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 11, 12], index=['a', 'b', 'c'])
+        >>> series
+        a    10
+        b    11
+        c    12
+        dtype: int64
+        >>> series.loc['b']
+        11
         """
         return _SeriesLocIndexer(self)
 
@@ -2142,6 +2952,18 @@ def iloc(self):
         See also
         --------
         cudf.core.dataframe.DataFrame.iloc
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([10, 20, 30])
+        >>> s
+        0    10
+        1    20
+        2    30
+        dtype: int64
+        >>> s.iloc[2]
+        30
         """
         return _SeriesIlocIndexer(self)
 
@@ -2157,7 +2979,26 @@ def as_mask(self):
         Returns
         -------
         device array
-        """
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([True, False, True])
+        >>> s.as_mask()
+        <cudf.core.buffer.Buffer object at 0x7f23c3eed0d0>
+        >>> s.as_mask().to_host_array()
+        array([  5,   0,   0,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,
+                 0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0, 181, 164,
+               188,   1,   0,   0,   0,   0, 255, 255, 255, 255, 255, 255, 255,
+               127, 253, 214,  62, 241,   1,   0,   0,   0,   0,   0,   0,   0,
+                 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
+             dtype=uint8)
+        """
+        if not pd.api.types.is_bool_dtype(self.dtype):
+            raise TypeError(
+                f"Series must of boolean dtype, found: {self.dtype}"
+            )
+
         return self._column.as_mask()
 
     def astype(self, dtype, copy=False, errors="raise"):
@@ -2177,17 +3018,64 @@ def astype(self, dtype, copy=False, errors="raise"):
             values then may propagate to other cudf objects.
         errors : {'raise', 'ignore', 'warn'}, default 'raise'
             Control raising of exceptions on invalid data for provided dtype.
+
             - ``raise`` : allow exceptions to be raised
             - ``ignore`` : suppress exceptions. On error return original
-            object.
+              object.
             - ``warn`` : prints last exceptions as warnings and
-            return original object.
+              return original object.
 
         Returns
         -------
         out : Series
             Returns ``self.copy(deep=copy)`` if ``dtype`` is the same
             as ``self.dtype``.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2], dtype='int32')
+        >>> series
+        0    1
+        1    2
+        dtype: int32
+        >>> series.astype('int64')
+        0    1
+        1    2
+        dtype: int64
+
+        Convert to categorical type:
+
+        >>> series.astype('category')
+        0    1
+        1    2
+        dtype: category
+        Categories (2, int64): [1, 2]
+
+        Convert to ordered categorical type with custom ordering:
+
+        >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 1], ordered=True)
+        >>> series.astype(cat_dtype)
+        0    1
+        1    2
+        dtype: category
+        Categories (2, int64): [2 < 1]
+
+        Note that using ``copy=False`` (enabled by default)
+        and changing data on a new Series will
+        propagate changes:
+
+        >>> s1 = cudf.Series([1, 2])
+        >>> s1
+        0    1
+        1    2
+        dtype: int64
+        >>> s2 = s1.astype('int64', copy=False)
+        >>> s2[0] = 10
+        >>> s1
+        0    10
+        1     2
+        dtype: int64
         """
         if errors not in ("ignore", "raise", "warn"):
             raise ValueError("invalid error value specified")
@@ -2229,11 +3117,68 @@ def argsort(self, ascending=True, na_position="last"):
         Returns
         -------
         result: Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([3, 1, 2])
+        >>> s
+        0    3
+        1    1
+        2    2
+        dtype: int64
+        >>> s.argsort()
+        0    1
+        1    2
+        2    0
+        dtype: int32
+        >>> s[s.argsort()]
+        1    1
+        2    2
+        0    3
+        dtype: int64
         """
         return self._sort(ascending=ascending, na_position=na_position)[1]
 
     def sort_index(self, ascending=True):
-        """Sort by the index.
+        """
+        Sort by the index.
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            Sort ascending vs. descending.
+
+        Returns
+        -------
+        Series
+            The original Series sorted by the labels.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4])
+        >>> series
+        3    a
+        2    b
+        1    c
+        4    d
+        dtype: object
+        >>> series.sort_index()
+        1    c
+        2    b
+        3    a
+        4    d
+        dtype: object
+
+        Sort Descending
+
+        >>> series.sort_index(ascending=False)
+        4    d
+        3    a
+        2    b
+        1    c
+        dtype: object
         """
         inds = self.index.argsort(ascending=ascending)
         return self.take(inds)
@@ -2316,11 +3261,138 @@ def _n_largest_or_smallest(self, largest, n, keep):
 
     def nlargest(self, n=5, keep="first"):
         """Returns a new Series of the *n* largest element.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Return this many descending sorted values.
+        keep : {'first', 'last'}, default 'first'
+            When there are duplicate values that cannot all fit in a
+            Series of `n` elements:
+
+            - ``first`` : return the first `n` occurrences in order
+              of appearance.
+            - ``last`` : return the last `n` occurrences in reverse
+              order of appearance.
+
+        Returns
+        -------
+        Series
+            The `n` largest values in the Series, sorted in decreasing order.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> countries_population = {"Italy": 59000000, "France": 65000000,
+        ...                         "Malta": 434000, "Maldives": 434000,
+        ...                         "Brunei": 434000, "Iceland": 337000,
+        ...                         "Nauru": 11300, "Tuvalu": 11300,
+        ...                         "Anguilla": 11300, "Montserrat": 5200}
+        >>> series = cudf.Series(countries_population)
+        >>> series
+        Italy         59000000
+        France        65000000
+        Malta           434000
+        Maldives        434000
+        Brunei          434000
+        Iceland         337000
+        Nauru            11300
+        Tuvalu           11300
+        Anguilla         11300
+        Montserrat        5200
+        dtype: int64
+        >>> series.nlargest()
+        France      65000000
+        Italy       59000000
+        Malta         434000
+        Maldives      434000
+        Brunei        434000
+        dtype: int64
+        >>> series.nlargest(3)
+        France    65000000
+        Italy     59000000
+        Malta       434000
+        dtype: int64
+        >>> series.nlargest(3, keep='last')
+        France    65000000
+        Italy     59000000
+        Brunei      434000
+        dtype: int64
         """
         return self._n_largest_or_smallest(n=n, keep=keep, largest=True)
 
     def nsmallest(self, n=5, keep="first"):
-        """Returns a new Series of the *n* smallest element.
+        """
+        Returns a new Series of the *n* smallest elements.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Return this many ascending sorted values.
+        keep : {'first', 'last'}, default 'first'
+            When there are duplicate values that cannot all fit in a
+            Series of `n` elements:
+
+            - ``first`` : return the first `n` occurrences in order
+              of appearance.
+            - ``last`` : return the last `n` occurrences in reverse
+              order of appearance.
+
+        Returns
+        -------
+        Series
+            The `n` smallest values in the Series, sorted in increasing order.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> countries_population = {"Italy": 59000000, "France": 65000000,
+        ...                         "Brunei": 434000, "Malta": 434000,
+        ...                         "Maldives": 434000, "Iceland": 337000,
+        ...                         "Nauru": 11300, "Tuvalu": 11300,
+        ...                         "Anguilla": 11300, "Montserrat": 5200}
+        >>> s = cudf.Series(countries_population)
+        >>> s
+        Italy       59000000
+        France      65000000
+        Brunei        434000
+        Malta         434000
+        Maldives      434000
+        Iceland       337000
+        Nauru          11300
+        Tuvalu         11300
+        Anguilla       11300
+        Montserrat      5200
+        dtype: int64
+
+        The `n` smallest elements where ``n=5`` by default.
+
+        >>> s.nsmallest()
+        Montserrat    5200
+        Nauru        11300
+        Tuvalu       11300
+        Anguilla     11300
+        Iceland     337000
+        dtype: int64
+
+        The `n` smallest elements where ``n=3``. Default `keep` value is
+        'first' so Nauru and Tuvalu will be kept.
+
+        >>> s.nsmallest(3)
+        Montserrat   5200
+        Nauru       11300
+        Tuvalu      11300
+        dtype: int64
+
+        The `n` smallest elements where ``n=3`` and keeping the last
+        duplicates. Anguilla and Tuvalu will be kept since they are the last
+        with value 11300 based on the index order.
+
+        >>> s.nsmallest(3, keep='last')
+        Montserrat   5200
+        Anguilla    11300
+        Tuvalu      11300
+        dtype: int64
         """
         return self._n_largest_or_smallest(n=n, keep=keep, largest=False)
 
@@ -2505,7 +3577,34 @@ def replace(
         return self._mimic_inplace(result, inplace=inplace)
 
     def reverse(self):
-        """Reverse the Series
+        """
+        Reverse the Series
+
+        Returns
+        -------
+        Series
+            A reversed Series.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4, 5, 6])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        5    6
+        dtype: int64
+        >>> series.reverse()
+        5    6
+        4    5
+        3    4
+        2    3
+        1    2
+        0    1
+        dtype: int64
         """
         rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32)
         col = self._column[rinds]
@@ -2527,6 +3626,31 @@ def one_hot_encoding(self, cats, dtype="float64"):
         Sequence
             A sequence of new series for each category. Its length is
             determined by the length of ``cats``.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series(['a', 'b', 'c', 'a'])
+        >>> s
+        0    a
+        1    b
+        2    c
+        3    a
+        dtype: object
+        >>> s.one_hot_encoding(['a', 'c', 'b'])
+        [0    1.0
+        1    0.0
+        2    0.0
+        3    1.0
+        dtype: float64, 0    0.0
+        1    0.0
+        2    1.0
+        3    0.0
+        dtype: float64, 0    0.0
+        1    1.0
+        2    0.0
+        3    0.0
+        dtype: float64]
         """
         if hasattr(cats, "to_arrow"):
             cats = cats.to_pandas()
@@ -3407,6 +4531,22 @@ def std(
         -----
         Parameters currently not supported are `axis`, `level` and
         `numeric_only`
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 10, 20, 30, 40])
+        >>> series
+        0    10
+        1    10
+        2    20
+        3    30
+        4    40
+        dtype: int64
+        >>> series.std()
+        13.038404810405298
+        >>> series.std(ddof=2)
+        15.05545305418162
         """
 
         if axis not in (None, 0):
@@ -3456,6 +4596,20 @@ def var(
         -----
         Parameters currently not supported are `axis`, `level` and
         `numeric_only`
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 11, 12, 0, 1])
+        >>> series
+        0    10
+        1    11
+        2    12
+        3     0
+        4     1
+        dtype: int64
+        >>> series.var()
+        33.7
         """
 
         if axis not in (None, 0):
@@ -3647,6 +4801,13 @@ def kurtosis(
         -----
         Parameters currently not supported are `axis`, `level` and
         `numeric_only`
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4])
+        >>> series.kurtosis()
+        -1.1999999999999904
         """
         if axis not in (None, 0):
             raise NotImplementedError("axis parameter is not implemented yet")
@@ -3683,6 +4844,22 @@ def skew(
         -----
         Parameters currently not supported are `axis`, `level` and
         `numeric_only`
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        4    5
+        5    6
+        6    6
+        dtype: int64
+        >>> series.skew()
+        -0.288195490292614
         """
 
         if axis not in (None, 0):
@@ -3834,6 +5011,31 @@ def isin(self, values):
     def unique(self):
         """
         Returns unique values of this Series.
+
+        Returns
+        -------
+        Series
+            A series with only the unique values.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series(['a', 'a', 'b', None, 'b', None, 'c'])
+        >>> series
+        0       a
+        1       a
+        2       b
+        3    <NA>
+        4       b
+        5    <NA>
+        6       c
+        dtype: object
+        >>> series.unique()
+        0    <NA>
+        1       a
+        2       b
+        3       c
+        dtype: object
         """
         res = self._column.unique()
         return Series(res, name=self.name)
@@ -3841,6 +5043,31 @@ def unique(self):
     def nunique(self, method="sort", dropna=True):
         """Returns the number of unique values of the Series: approximate version,
         and exact version to be moved to libgdf
+
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include NA values in the count.
+
+        Returns
+        -------
+        int
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([1, 3, 5, 7, 7])
+        >>> s
+        0    1
+        1    3
+        2    5
+        3    7
+        4    7
+        dtype: int64
+        >>> s.nunique()
+        4
         """
         if method != "sort":
             msg = "non sort based distinct_count() not implemented yet"
@@ -3973,7 +5200,32 @@ def value_counts(
         return res
 
     def scale(self):
-        """Scale values to [0, 1] in float64
+        """
+        Scale values to [0, 1] in float64
+
+        Returns
+        -------
+        Series
+            A new series with values scaled to [0, 1].
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 11, 12, 0.5, 1])
+        >>> series
+        0    10.0
+        1    11.0
+        2    12.0
+        3     0.5
+        4     1.0
+        dtype: float64
+        >>> series.scale()
+        0    0.826087
+        1    0.913043
+        2    1.000000
+        3    0.000000
+        4    0.043478
+        dtype: float64
         """
         vmin = self.min()
         vmax = self.max()
@@ -3984,7 +5236,27 @@ def scale(self):
     def abs(self):
         """Absolute value of each element of the series.
 
-        Returns a new Series.
+        Returns
+        -------
+        Series
+            Series containing the absolute value of each element.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([-1.10, 2, -3.33, 4])
+        >>> series
+        0   -1.10
+        1    2.00
+        2   -3.33
+        3    4.00
+        dtype: float64
+        >>> series.abs()
+        0    1.10
+        1    2.00
+        2    3.33
+        3    4.00
+        dtype: float64
         """
         return self._unaryop("abs")
 
@@ -3993,10 +5265,31 @@ def __abs__(self):
 
     # Rounding
     def ceil(self):
-        """Rounds each value upward to the smallest integral value not less
+        """
+        Rounds each value upward to the smallest integral value not less
         than the original.
 
-        Returns a new Series.
+        Returns
+        -------
+        Series
+            Returns a new Series with ceiling value of each element.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1.1, 2.8, 3.5, 4.5])
+        >>> series
+        0    1.1
+        1    2.8
+        2    3.5
+        3    4.5
+        dtype: float64
+        >>> series.ceil()
+        0    2.0
+        1    3.0
+        2    4.0
+        3    5.0
+        dtype: float64
         """
         return self._unaryop("ceil")
 
@@ -4004,12 +5297,53 @@ def floor(self):
         """Rounds each value downward to the largest integral value not greater
         than the original.
 
-        Returns a new Series.
+        Returns
+        -------
+        Series
+            Returns a new Series with floor of each element.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([-1.9, 2, 0.2, 1.5, 0.0, 3.0])
+        >>> series
+        0   -1.9
+        1    2.0
+        2    0.2
+        3    1.5
+        4    0.0
+        5    3.0
+        dtype: float64
+        >>> series.floor()
+        0   -2.0
+        1    2.0
+        2    0.0
+        3    1.0
+        4    0.0
+        5    3.0
+        dtype: float64
         """
         return self._unaryop("floor")
 
     def hash_values(self):
         """Compute the hash of values in this column.
+
+        Returns
+        -------
+        cupy array
+            A cupy array with hash values.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 120, 30])
+        >>> series
+        0     10
+        1    120
+        2     30
+        dtype: int64
+        >>> series.hash_values()
+        array([-1930516747,   422619251,  -941520876], dtype=int32)
         """
         return Series(self._hash()).values
 
@@ -4030,6 +5364,25 @@ def hash_encode(self, stop, use_name=False):
         -------
         result : Series
             The encoded Series.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 120, 30])
+        >>> series.hash_encode(stop=200)
+        0     53
+        1     51
+        2    124
+        dtype: int32
+
+        You can choose to include name while hash
+        encoding by specifying `use_name=True`
+
+        >>> series.hash_encode(stop=200, use_name=True)
+        0    131
+        1     29
+        2     76
+        dtype: int32
         """
         assert stop > 0
 
@@ -4069,6 +5422,24 @@ def quantile(
             If ``q`` is an array, a Series will be returned where the
             index is ``q`` and the values are the quantiles, otherwise
             a float will be returned.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 2, 3, 4])
+        >>> series
+        0    1
+        1    2
+        2    3
+        3    4
+        dtype: int64
+        >>> series.quantile(0.5)
+        2.5
+        >>> series.quantile([0.25, 0.5, 0.75])
+        0.25    1.75
+        0.50    2.50
+        0.75    3.25
+        dtype: float64
         """
 
         result = self._column.quantile(q, interpolation, exact)
@@ -4252,6 +5623,19 @@ def digitize(self, bins, right=False):
         Returns
         -------
         A new Series containing the indices.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([0.2, 6.4, 3.0, 1.6])
+        >>> bins = cudf.Series([0.0, 1.0, 2.5, 4.0, 10.0])
+        >>> inds = s.digitize(bins)
+        >>> inds
+        0    1
+        1    4
+        2    3
+        3    2
+        dtype: int32
         """
         return Series(
             cudf.core.column.numerical.digitize(self._column, bins, right)
@@ -4261,10 +5645,61 @@ def diff(self, periods=1):
         """Calculate the difference between values at positions i and i - N in
         an array and store the output in a new array.
 
+        Returns
+        -------
+        Series
+            First differences of the Series.
+
         Notes
         -----
         Diff currently only supports float and integer dtype columns with
         no null values.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([1, 1, 2, 3, 5, 8])
+        >>> series
+        0    1
+        1    1
+        2    2
+        3    3
+        4    5
+        5    8
+        dtype: int64
+
+        Difference with previous row
+
+        >>> series.diff()
+        0    <NA>
+        1       0
+        2       1
+        3       1
+        4       2
+        5       3
+        dtype: int64
+
+        Difference with 3rd previous row
+
+        >>> series.diff(periods=3)
+        0    <NA>
+        1    <NA>
+        2    <NA>
+        3       2
+        4       4
+        5       6
+        dtype: int64
+
+        Difference with following row
+
+        >>> series.diff(periods=-1)
+        0       0
+        1      -1
+        2      -1
+        3      -2
+        4      -3
+        5    <NA>
+        dtype: int64
         """
         if self.has_nulls:
             raise AssertionError(
@@ -4388,6 +5823,25 @@ def rename(self, index=None, copy=True):
         Difference from pandas:
           - Supports scalar values only for changing name attribute
           - Not supporting : inplace, level
+
+        Examples
+        --------
+        >>> import cudf
+        >>> series = cudf.Series([10, 20, 30])
+        >>> series
+        0    10
+        1    20
+        2    30
+        dtype: int64
+        >>> series.name
+        >>> renamed_series = series.rename('numeric_series')
+        >>> renamed_series
+        0    10
+        1    20
+        2    30
+        Name: numeric_series, dtype: int64
+        >>> renamed_series.name
+        'numeric_series'
         """
         out = self.copy(deep=False)
         out = out.set_index(self.index)
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 73a074c0376..15d504799e4 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -709,20 +709,18 @@ def test_series_take(ntake, keep_index):
     np.random.seed(0)
     nelem = 123
 
-    data = np.random.randint(0, 20, nelem)
-    sr = cudf.Series(data)
-
-    take_indices = np.random.randint(0, len(sr), ntake)
-
-    if keep_index is True:
-        out = sr.take(take_indices)
-        np.testing.assert_array_equal(out.to_array(), data[take_indices])
-    elif keep_index is False:
-        out = sr.take(take_indices, keep_index=False)
-        np.testing.assert_array_equal(out.to_array(), data[take_indices])
-        np.testing.assert_array_equal(
-            out.index.to_array(), sr.index.to_array()
-        )
+    psr = pd.Series(np.random.randint(0, 20, nelem))
+    gsr = cudf.Series(psr)
+
+    take_indices = np.random.randint(0, len(gsr), ntake)
+
+    actual = gsr.take(take_indices, keep_index=keep_index)
+    expected = psr.take(take_indices)
+
+    if not keep_index:
+        expected = expected.reset_index(drop=True)
+
+    assert_eq(actual, expected)
 
 
 def test_series_take_positional():

From 8c44d62282c9a1e483b44de56f6f820be1cb37e6 Mon Sep 17 00:00:00 2001
From: ChrisJar <chris.jarrett.0@gmail.com>
Date: Wed, 10 Mar 2021 10:07:48 -0600
Subject: [PATCH 05/33] Enable type conversion from float to decimal type
 (#7450)

This implements typecasting between `decimal` and `float` types.

Addresses half of #7440

Authors:
  - @ChrisJar

Approvers:
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)
  - Ashwin Srinath (@shwina)

URL: https://github.com/rapidsai/cudf/pull/7450
---
 python/cudf/cudf/_lib/unary.pyx           |  26 +++--
 python/cudf/cudf/core/column/decimal.py   |  25 ++++-
 python/cudf/cudf/core/column/numerical.py |  13 +++
 python/cudf/cudf/tests/test_decimal.py    | 123 +++++++++++++++++++++-
 4 files changed, 177 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx
index 70cbc56e525..6e20dcaf299 100644
--- a/python/cudf/cudf/_lib/unary.pyx
+++ b/python/cudf/cudf/_lib/unary.pyx
@@ -1,6 +1,7 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 from enum import IntEnum
+from cudf.utils.dtypes import is_decimal_dtype
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -27,6 +28,7 @@ from cudf._lib.cpp.unary cimport (
 from cudf._lib.types cimport underlying_type_t_type_id
 
 cimport cudf._lib.cpp.unary as libcudf_unary
+cimport cudf._lib.cpp.types as libcudf_types
 
 
 class UnaryOp(IntEnum):
@@ -93,14 +95,24 @@ def is_valid(Column input):
 
 def cast(Column input, object dtype=np.float64):
     cdef column_view c_input = input.view()
-    cdef type_id tid = (
-        <type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(dtype)]
+    cdef type_id tid
+    cdef data_type c_dtype
+
+    # TODO: Use dtype_to_data_type when it becomes available
+    # to simplify this conversion
+    if is_decimal_dtype(dtype):
+        tid = libcudf_types.type_id.DECIMAL64
+        c_dtype = data_type(tid, -dtype.scale)
+    else:
+        tid = (
+            <type_id> (
+                <underlying_type_t_type_id> (
+                    np_to_cudf_types[np.dtype(dtype)]
+                )
             )
         )
-    )
-    cdef data_type c_dtype = data_type(tid)
+        c_dtype = data_type(tid)
+
     cdef unique_ptr[column] c_result
 
     with nogil:
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 0056b3a8454..4766426892a 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -4,7 +4,7 @@
 import cupy as cp
 import numpy as np
 import pyarrow as pa
-
+from pandas.api.types import is_integer_dtype
 from typing import cast
 
 from cudf import _lib as libcudf
@@ -12,10 +12,11 @@
 from cudf.core.column import ColumnBase
 from cudf.core.dtypes import Decimal64Dtype
 from cudf.utils.utils import pa_mask_buffer_to_mask
+
+from cudf._typing import Dtype
 from cudf._lib.strings.convert.convert_fixed_point import (
     from_decimal as cpp_from_decimal,
 )
-from cudf._typing import Dtype
 from cudf.core.column import as_column
 
 
@@ -67,6 +68,26 @@ def binary_operator(self, op, other, reflect=False):
         result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
         return result
 
+    def as_decimal_column(
+        self, dtype: Dtype, **kwargs
+    ) -> "cudf.core.column.DecimalColumn":
+        if dtype == self.dtype:
+            return self
+        result = libcudf.unary.cast(self, dtype)
+        if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
+            result.dtype.precision = dtype.precision
+        return result
+
+    def as_numerical_column(
+        self, dtype: Dtype
+    ) -> "cudf.core.column.NumericalColumn":
+        if is_integer_dtype(dtype):
+            raise NotImplementedError(
+                "Casting from decimal types to integer "
+                "types not currently supported"
+            )
+        return libcudf.unary.cast(self, dtype)
+
     def as_string_column(
         self, dtype: Dtype, format=None
     ) -> "cudf.core.column.StringColumn":
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f9b695e9ce3..6fae8c644e3 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -205,6 +205,19 @@ def as_timedelta_column(
             ),
         )
 
+    def as_decimal_column(
+        self, dtype: Dtype, **kwargs
+    ) -> "cudf.core.column.DecimalColumn":
+        if is_integer_dtype(self.dtype):
+            raise NotImplementedError(
+                "Casting from integer types to decimal "
+                "types not currently supported"
+            )
+        result = libcudf.unary.cast(self, dtype)
+        if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
+            result.dtype.precision = dtype.precision
+        return result
+
     def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
         dtype = np.dtype(dtype)
         if dtype == self.dtype:
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index f73a785727b..ddf56828c3d 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -2,10 +2,18 @@
 
 from decimal import Decimal
 
+import numpy as np
 import pyarrow as pa
 import pytest
+import cudf
 
-from cudf.core.column import DecimalColumn
+from cudf.core.dtypes import Decimal64Dtype
+from cudf.core.column import DecimalColumn, NumericalColumn
+
+from cudf.tests.utils import (
+    FLOAT_TYPES,
+    assert_eq,
+)
 
 
 @pytest.mark.parametrize(
@@ -41,3 +49,116 @@ def test_from_arrow_max_precision():
         DecimalColumn.from_arrow(
             pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19))
         )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.Series(
+            [
+                14.12302,
+                97938.2,
+                np.nan,
+                0.0,
+                -8.302014,
+                np.nan,
+                94.31304,
+                -112.2314,
+                0.3333333,
+                np.nan,
+            ]
+        ),
+    ],
+)
+@pytest.mark.parametrize("from_dtype", FLOAT_TYPES)
+@pytest.mark.parametrize(
+    "to_dtype",
+    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)],
+)
+def test_typecast_to_decimal(data, from_dtype, to_dtype):
+    actual = data.astype(from_dtype)
+    expected = actual
+
+    actual = actual.astype(to_dtype)
+    pa_arr = expected.to_arrow().cast(
+        pa.decimal128(to_dtype.precision, to_dtype.scale)
+    )
+    expected = cudf.Series(DecimalColumn.from_arrow(pa_arr))
+
+    assert_eq(actual, expected)
+    assert_eq(actual.dtype, expected.dtype)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.Series(
+            [
+                14.12309,
+                2.343942,
+                np.nan,
+                0.0,
+                -8.302082,
+                np.nan,
+                94.31308,
+                -112.2364,
+                -8.029972,
+                np.nan,
+            ]
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "from_dtype",
+    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)],
+)
+@pytest.mark.parametrize(
+    "to_dtype",
+    [Decimal64Dtype(7, 2), Decimal64Dtype(18, 10), Decimal64Dtype(11, 4)],
+)
+def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
+    actual = data.astype(from_dtype)
+    expected = actual
+
+    actual = actual.astype(to_dtype)
+    pa_arr = expected.to_arrow().cast(
+        pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False
+    )
+    expected = cudf.Series(DecimalColumn.from_arrow(pa_arr))
+
+    assert_eq(actual, expected)
+    assert_eq(actual.dtype, expected.dtype)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.Series(
+            [
+                14.12309,
+                2.343942,
+                np.nan,
+                0.0,
+                -8.302082,
+                np.nan,
+                94.31308,
+                -112.2364,
+                -8.029972,
+                np.nan,
+            ]
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "from_dtype",
+    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)],
+)
+@pytest.mark.parametrize("to_dtype", FLOAT_TYPES)
+def test_typecast_from_decimal(data, from_dtype, to_dtype):
+    actual = data.astype(from_dtype)
+    pa_arr = actual.to_arrow().cast(to_dtype, safe=False)
+
+    actual = actual.astype(to_dtype)
+    expected = cudf.Series(NumericalColumn.from_arrow(pa_arr))
+
+    assert_eq(actual, expected)

From 0155bb16e372d0bdd06b93eddbd854065eab91b1 Mon Sep 17 00:00:00 2001
From: AJ Schmidt <ajschmidt8@users.noreply.github.com>
Date: Wed, 10 Mar 2021 11:18:11 -0500
Subject: [PATCH 06/33] Update Changelog Link (#7550)

The tag used for pre-releases was recently changed, so this PR updates the link in the changelog.

Authors:
  - AJ Schmidt (@ajschmidt8)

Approvers:
  - Jordan Jacobelli (@Ethyling)

URL: https://github.com/rapidsai/cudf/pull/7550
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6b08a042615..21ab8ed3274 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # cuDF 0.19.0 (Date TBD)
 
-Please see https://github.com/rapidsai/cudf/releases/tag/branch-0.19-latest for the latest changes to this development branch.
+Please see https://github.com/rapidsai/cudf/releases/tag/v0.19.0a for the latest changes to this development branch.
 
 # cuDF 0.18.0 (24 Feb 2021)
 

From 42c6d15e9b6b918f90e11fccab4005a4f5fbd01c Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 10 Mar 2021 13:03:15 -0600
Subject: [PATCH 07/33] Fix contiguous_split not properly handling output
 partitions > 2 GB. (#7515)

Fixes:
https://github.com/rapidsai/cudf/issues/7514

Related:
https://github.com/NVIDIA/spark-rapids/issues/1861

There were a couple of places where 32-bit values were being used for buffer sizes that needed to be 64-bit.

Authors:
  - @nvdbaranec

Approvers:
  - Vukasin Milovanovic (@vuule)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7515
---
 cpp/src/copying/contiguous_split.cu | 120 ++++++++++++++--------------
 1 file changed, 61 insertions(+), 59 deletions(-)

diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 3f8cd4014f1..9a2f0f26f74 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -38,8 +38,8 @@ namespace {
 
 // align all column size allocations to this boundary so that all output column buffers
 // start at that alignment.
-static constexpr size_t split_align = 64;
-inline __device__ size_t _round_up_safe(size_t number_to_round, size_t modulus)
+static constexpr std::size_t split_align = 64;
+inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::size_t modulus)
 {
   auto remainder = number_to_round % modulus;
   if (remainder == 0) { return number_to_round; }
@@ -88,15 +88,15 @@ struct src_buf_info {
  * M partitions, then we have N*M destination buffers.
  */
 struct dst_buf_info {
-  size_t buf_size;   // total size of buffer, including padding
-  int num_elements;  // # of elements to be copied
-  int element_size;  // size of each element in bytes
+  std::size_t buf_size;  // total size of buffer, including padding
+  int num_elements;      // # of elements to be copied
+  int element_size;      // size of each element in bytes
   int num_rows;  // # of rows (which may be different from num_elements in the case of validity or
                  // offset buffers)
-  int src_row_index;  // row index to start reading from from my associated source buffer
-  int dst_offset;     // my offset into the per-partition allocation
-  int value_shift;    // amount to shift values down by (for offset buffers)
-  int bit_shift;      // # of bits to shift right by (for validity buffers)
+  int src_row_index;       // row index to start reading from in my associated source buffer
+  std::size_t dst_offset;  // my offset into the per-partition allocation
+  int value_shift;         // amount to shift values down by (for offset buffers)
+  int bit_shift;           // # of bits to shift right by (for validity buffers)
   size_type valid_count;
 };
 
@@ -133,13 +133,13 @@ template <int block_size>
 __device__ void copy_buffer(uint8_t* __restrict__ dst,
                             uint8_t* __restrict__ src,
                             int t,
-                            int num_elements,
-                            int element_size,
-                            int src_row_index,
+                            std::size_t num_elements,
+                            std::size_t element_size,
+                            std::size_t src_row_index,
                             uint32_t stride,
                             int value_shift,
                             int bit_shift,
-                            int num_rows,
+                            std::size_t num_rows,
                             size_type* valid_count)
 {
   src += (src_row_index * element_size);
@@ -147,10 +147,10 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
   size_type thread_valid_count = 0;
 
   // handle misalignment. read 16 bytes in 4 byte reads. write in a single 16 byte store.
-  const size_t num_bytes = num_elements * element_size;
+  std::size_t const num_bytes = num_elements * element_size;
   // how many bytes we're misaligned from 4-byte alignment
-  const uint32_t ofs = reinterpret_cast<uintptr_t>(src) % 4;
-  size_t pos         = t * 16;
+  uint32_t const ofs = reinterpret_cast<uintptr_t>(src) % 4;
+  std::size_t pos    = t * 16;
   stride *= 16;
   while (pos + 20 <= num_bytes) {
     // read from the nearest aligned address.
@@ -175,12 +175,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
 
   // copy trailing bytes
   if (t == 0) {
-    size_t remainder;
+    std::size_t remainder;
     if (num_bytes < 16) {
       remainder = num_bytes;
     } else {
-      size_t last_bracket = (num_bytes / 16) * 16;
-      remainder           = num_bytes - last_bracket;
+      std::size_t const last_bracket = (num_bytes / 16) * 16;
+      remainder                      = num_bytes - last_bracket;
       if (remainder < 4) {
         // we had less than 20 bytes for the last possible 16 byte copy, so copy 16 + the extra
         remainder += 16;
@@ -191,12 +191,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
     // alignment must be a multiple of 4. value shifting and bit shifting are mututally exclusive
     // and will never both be true at the same time.
     if (value_shift || bit_shift) {
-      int idx    = (num_bytes - remainder) / 4;
-      uint32_t v = remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx] - value_shift) : 0;
+      std::size_t idx = (num_bytes - remainder) / 4;
+      uint32_t v      = remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx] - value_shift) : 0;
       while (remainder) {
-        uint32_t next =
+        uint32_t const next =
           remainder > 0 ? (reinterpret_cast<uint32_t*>(src)[idx + 1] - value_shift) : 0;
-        uint32_t val = (v >> bit_shift) | (next << (32 - bit_shift));
+        uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift));
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint32_t*>(dst)[idx] = val;
         v                                     = next;
@@ -205,8 +205,8 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
       }
     } else {
       while (remainder) {
-        int idx      = num_bytes - remainder--;
-        uint32_t val = reinterpret_cast<uint8_t*>(src)[idx];
+        std::size_t const idx = num_bytes - remainder--;
+        uint32_t const val    = reinterpret_cast<uint8_t*>(src)[idx];
         if (valid_count) { thread_valid_count += __popc(val); }
         reinterpret_cast<uint8_t*>(dst)[idx] = val;
       }
@@ -224,11 +224,11 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
         // we may have copied more bits than there are actual rows in the output.
         // so we need to subtract off the count of any bits that shouldn't have been
         // considered during the copy step.
-        int max_row     = (num_bytes * 8);
-        int slack_bits  = max_row > num_rows ? max_row - num_rows : 0;
-        auto slack_mask = set_most_significant_bits(slack_bits);
+        std::size_t const max_row    = (num_bytes * 8);
+        std::size_t const slack_bits = max_row > num_rows ? max_row - num_rows : 0;
+        auto const slack_mask        = set_most_significant_bits(slack_bits);
         if (slack_mask > 0) {
-          uint32_t last_word = reinterpret_cast<uint32_t*>(dst + (num_bytes - 4))[0];
+          uint32_t const last_word = reinterpret_cast<uint32_t*>(dst + (num_bytes - 4))[0];
           block_valid_count -= __popc(last_word & slack_mask);
         }
         *valid_count = block_valid_count;
@@ -260,9 +260,9 @@ __global__ void copy_partition(int num_src_bufs,
                                uint8_t** dst_bufs,
                                dst_buf_info* buf_info)
 {
-  int const partition_index = blockIdx.x / num_src_bufs;
-  int const src_buf_index   = blockIdx.x % num_src_bufs;
-  size_t const buf_index    = (partition_index * num_src_bufs) + src_buf_index;
+  int const partition_index   = blockIdx.x / num_src_bufs;
+  int const src_buf_index     = blockIdx.x % num_src_bufs;
+  std::size_t const buf_index = (partition_index * num_src_bufs) + src_buf_index;
 
   // copy, shifting offsets and validity bits as needed
   copy_buffer<block_size>(
@@ -322,7 +322,7 @@ bool is_offset_type(type_id id) { return (id == type_id::STRING or id == type_id
  * @returns Total offset stack size needed for this range of columns.
  */
 template <typename InputIter>
-size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
+std::size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
 {
   return std::accumulate(begin, end, 0, [offset_depth](auto stack_size, column_view const& col) {
     auto const num_buffers = 1 + (col.nullable() ? 1 : 0);
@@ -702,7 +702,7 @@ BufInfo build_output_columns(InputIter begin,
  */
 struct buf_size_functor {
   dst_buf_info const* ci;
-  size_t operator() __device__(int index) { return static_cast<size_t>(ci[index].buf_size); }
+  std::size_t operator() __device__(int index) { return ci[index].buf_size; }
 };
 
 /**
@@ -722,10 +722,10 @@ struct split_key_functor {
  */
 struct dst_offset_output_iterator {
   dst_buf_info* c;
-  using value_type        = int;
-  using difference_type   = int;
-  using pointer           = int*;
-  using reference         = int&;
+  using value_type        = std::size_t;
+  using difference_type   = std::size_t;
+  using pointer           = std::size_t*;
+  using reference         = std::size_t&;
   using iterator_category = thrust::output_device_iterator_tag;
 
   dst_offset_output_iterator operator+ __host__ __device__(int i)
@@ -778,7 +778,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   }
   {
     size_type begin = 0;
-    for (size_t i = 0; i < splits.size(); i++) {
+    for (std::size_t i = 0; i < splits.size(); i++) {
       size_type end = splits[i];
       CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.");
       CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index.");
@@ -787,8 +787,8 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
     }
   }
 
-  size_t const num_partitions   = splits.size() + 1;
-  size_t const num_root_columns = input.num_columns();
+  std::size_t const num_partitions   = splits.size() + 1;
+  std::size_t const num_root_columns = input.num_columns();
 
   // if inputs are empty, just return num_partitions empty tables
   if (input.column(0).size() == 0) {
@@ -810,12 +810,12 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   // compute # of source buffers (column data, validity, children), # of partitions
   // and total # of buffers
   size_type const num_src_bufs = count_src_bufs(input.begin(), input.end());
-  size_t const num_bufs        = num_src_bufs * num_partitions;
+  std::size_t const num_bufs   = num_src_bufs * num_partitions;
 
   // packed block of memory 1. split indices and src_buf_info structs
-  size_t const indices_size =
+  std::size_t const indices_size =
     cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align);
-  size_t const src_buf_info_size =
+  std::size_t const src_buf_info_size =
     cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align);
   // host-side
   std::vector<uint8_t> h_indices_and_source_info(indices_size + src_buf_info_size);
@@ -825,7 +825,8 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   // device-side
   // gpu-only : stack space needed for nested list offset calculation
   int const offset_stack_partition_size = compute_offset_stack_size(input.begin(), input.end());
-  size_t const offset_stack_size = offset_stack_partition_size * num_partitions * sizeof(size_type);
+  std::size_t const offset_stack_size =
+    offset_stack_partition_size * num_partitions * sizeof(size_type);
   rmm::device_buffer d_indices_and_source_info(indices_size + src_buf_info_size + offset_stack_size,
                                                stream,
                                                rmm::mr::get_current_device_resource());
@@ -852,33 +853,33 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
                            stream.value()));
 
   // packed block of memory 2. partition buffer sizes and dst_buf_info structs
-  size_t const buf_sizes_size =
-    cudf::util::round_up_safe(num_partitions * sizeof(size_t), split_align);
-  size_t const dst_buf_info_size =
+  std::size_t const buf_sizes_size =
+    cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align);
+  std::size_t const dst_buf_info_size =
     cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align);
   // host-side
   std::vector<uint8_t> h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size);
-  size_t* h_buf_sizes = reinterpret_cast<size_t*>(h_buf_sizes_and_dst_info.data());
+  std::size_t* h_buf_sizes = reinterpret_cast<std::size_t*>(h_buf_sizes_and_dst_info.data());
   dst_buf_info* h_dst_buf_info =
     reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size);
   // device-side
   rmm::device_buffer d_buf_sizes_and_dst_info(
     buf_sizes_size + dst_buf_info_size, stream, rmm::mr::get_current_device_resource());
-  size_t* d_buf_sizes          = reinterpret_cast<size_t*>(d_buf_sizes_and_dst_info.data());
+  std::size_t* d_buf_sizes     = reinterpret_cast<std::size_t*>(d_buf_sizes_and_dst_info.data());
   dst_buf_info* d_dst_buf_info = reinterpret_cast<dst_buf_info*>(
     static_cast<uint8_t*>(d_buf_sizes_and_dst_info.data()) + buf_sizes_size);
 
   // compute sizes of each column in each partition, including alignment.
   thrust::transform(
     rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_t>(0),
-    thrust::make_counting_iterator<size_t>(num_bufs),
+    thrust::make_counting_iterator<std::size_t>(0),
+    thrust::make_counting_iterator<std::size_t>(num_bufs),
     d_dst_buf_info,
     [num_src_bufs,
      d_indices,
      d_src_buf_info,
      d_offset_stack,
-     offset_stack_partition_size] __device__(size_t t) {
+     offset_stack_partition_size] __device__(std::size_t t) {
       int const split_index   = t / num_src_bufs;
       int const src_buf_index = t % num_src_bufs;
       auto const& src_info    = d_src_buf_info[src_buf_index];
@@ -929,7 +930,8 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
         return num_rows;
       }();
       int const element_size = cudf::type_dispatcher(data_type{src_info.type}, size_of_helper{});
-      size_t const bytes     = num_elements * element_size;
+      std::size_t const bytes =
+        static_cast<std::size_t>(num_elements) * static_cast<std::size_t>(element_size);
       return dst_buf_info{_round_up_safe(bytes, 64),
                           num_elements,
                           element_size,
@@ -969,7 +971,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
                                   keys + num_bufs,
                                   values,
                                   dst_offset_output_iterator{d_dst_buf_info},
-                                  0);
+                                  std::size_t{0});
   }
 
   // DtoH buf sizes and col info back to the host
@@ -986,15 +988,15 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   std::transform(h_buf_sizes,
                  h_buf_sizes + num_partitions,
                  std::back_inserter(out_buffers),
-                 [stream, mr](size_t bytes) {
+                 [stream, mr](std::size_t bytes) {
                    return rmm::device_buffer{bytes, stream, mr};
                  });
 
   // packed block of memory 3. pointers to source and destination buffers (and stack space on the
   // gpu for offset computation)
-  size_t const src_bufs_size =
+  std::size_t const src_bufs_size =
     cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align);
-  size_t const dst_bufs_size =
+  std::size_t const dst_bufs_size =
     cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align);
   // host-side
   std::vector<uint8_t> h_src_and_dst_buffers(src_bufs_size + dst_bufs_size);
@@ -1039,7 +1041,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   std::vector<column_view> cols;
   cols.reserve(num_root_columns);
   auto cur_dst_buf_info = h_dst_buf_info;
-  for (size_t idx = 0; idx < num_partitions; idx++) {
+  for (std::size_t idx = 0; idx < num_partitions; idx++) {
     // traverse the buffers and build the columns.
     cur_dst_buf_info = build_output_columns(
       input.begin(), input.end(), cur_dst_buf_info, std::back_inserter(cols), h_dst_bufs[idx]);

From 3a4dd8e246f2bfcaa515f559b65e3fd4a9c7ceff Mon Sep 17 00:00:00 2001
From: Dillon Cullinan <dcullinan92@gmail.com>
Date: Wed, 10 Mar 2021 14:21:44 -0500
Subject: [PATCH 08/33] FIX Revert gpuci_conda_retry on conda file output
 locations (#7552)

Reverts part of https://github.com/rapidsai/cudf/pull/7540 as it causes some unforeseen issues in uploading.

Authors:
  - Dillon Cullinan (@dillon-cullinan)

Approvers:
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7552
---
 ci/cpu/upload.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
index d75e45dc406..82060ad40ef 100755
--- a/ci/cpu/upload.sh
+++ b/ci/cpu/upload.sh
@@ -29,12 +29,12 @@ fi
 
 gpuci_logger "Get conda file output locations"
 
-export LIBCUDF_FILE=`gpuci_conda_retry build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf --output`
-export LIBCUDF_KAFKA_FILE=`gpuci_conda_retry build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf_kafka --output`
-export CUDF_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output`
-export DASK_CUDF_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output`
-export CUDF_KAFKA_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output`
-export CUSTREAMZ_FILE=`gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output`
+export LIBCUDF_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf --output`
+export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf_kafka --output`
+export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output`
+export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output`
+export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output`
+export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output`
 
 ################################################################################
 # UPLOAD - Conda packages

From e628101f257817fc392487b1c360f3d1fa72939e Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 10 Mar 2021 11:50:56 -0800
Subject: [PATCH 09/33] Add `Series.drop` api (#7304)

Closes #7045

This PR introduces the `Series.drop` API. `Series.drop` allows users to drop certain elements of a series, specified by the `labels` or `index` parameter.

Example:
```python3
>>> s = cudf.Series([1, 2, 3], index=['x', 'y', 'z'])
>>> s.drop(labels=['y'])
x    1
z    3
dtype: int64
```

- [x] Add series test case
- [x] Move common code path from `DataFrame.drop` to helper function
- [x] Add typing annotation
- [x] Add docstring

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Ashwin Srinath (@shwina)
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7304
---
 python/cudf/cudf/core/dataframe.py       |  71 ++++-----
 python/cudf/cudf/core/frame.py           |  84 ++++++++++-
 python/cudf/cudf/core/series.py          | 124 +++++++++++++++-
 python/cudf/cudf/tests/test_dataframe.py |   9 ++
 python/cudf/cudf/tests/test_series.py    | 178 +++++++++++++++++++++++
 5 files changed, 421 insertions(+), 45 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 5ab058ff495..ecdce9443a1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
-from __future__ import division
+from __future__ import annotations, division
 
 import inspect
 import itertools
@@ -10,7 +10,7 @@
 import warnings
 from collections import OrderedDict, defaultdict
 from collections.abc import Iterable, Sequence
-from typing import Any, Set, TypeVar
+from typing import Any, Optional, Set, TypeVar
 
 import cupy
 import numpy as np
@@ -30,7 +30,7 @@
 from cudf.core.abc import Serializable
 from cudf.core.column import as_column, column_empty
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.frame import Frame
+from cudf.core.frame import Frame, _drop_rows_by_labels
 from cudf.core.groupby.groupby import DataFrameGroupBy
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
@@ -495,7 +495,12 @@ def _from_table(cls, table, index=None):
         return out
 
     @classmethod
-    def _from_data(cls, data, index=None, columns=None):
+    def _from_data(
+        cls,
+        data: ColumnAccessor,
+        index: Optional[Index] = None,
+        columns: Any = None,
+    ) -> DataFrame:
         out = cls.__new__(cls)
         out._data = data
         if index is None:
@@ -3364,46 +3369,26 @@ def drop(
             )
 
         if inplace:
-            outdf = self
+            out = self
         else:
-            outdf = self.copy()
+            out = self.copy()
 
         if axis in (1, "columns"):
             target = _get_host_unique(target)
 
-            _drop_columns(outdf, target, errors)
+            _drop_columns(out, target, errors)
         elif axis in (0, "index"):
-            if not isinstance(target, (cudf.Series, cudf.Index)):
-                target = column.as_column(target)
-
-            if isinstance(self._index, cudf.MultiIndex):
-                if level is None:
-                    level = 0
-
-                levels_index = outdf.index.get_level_values(level)
-                if errors == "raise" and not target.isin(levels_index).all():
-                    raise KeyError("One or more values not found in axis")
-
-                # TODO : Could use anti-join as a future optimization
-                sliced_df = outdf.take(~levels_index.isin(target))
-                sliced_df._index.names = self._index.names
-            else:
-                if errors == "raise" and not target.isin(outdf.index).all():
-                    raise KeyError("One or more values not found in axis")
-
-                sliced_df = outdf.join(
-                    cudf.DataFrame(index=target), how="leftanti"
-                )
+            dropped = _drop_rows_by_labels(out, target, level, errors)
 
             if columns is not None:
                 columns = _get_host_unique(columns)
-                _drop_columns(sliced_df, columns, errors)
+                _drop_columns(dropped, columns, errors)
 
-            outdf._data = sliced_df._data
-            outdf._index = sliced_df._index
+            out._data = dropped._data
+            out._index = dropped._index
 
         if not inplace:
-            return outdf
+            return out
 
     def _drop_column(self, name):
         """Drop a column by *name*
@@ -7967,17 +7952,6 @@ def _get_union_of_series_names(series_list):
     return names_list
 
 
-def _drop_columns(df, columns, errors):
-    for c in columns:
-        try:
-            df._drop_column(c)
-        except KeyError as e:
-            if errors == "ignore":
-                pass
-            else:
-                raise e
-
-
 def _get_host_unique(array):
     if isinstance(
         array, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
@@ -7987,3 +7961,14 @@ def _get_host_unique(array):
         return [array]
     else:
         return set(array)
+
+
+def _drop_columns(df: DataFrame, columns: Iterable, errors: str):
+    for c in columns:
+        try:
+            df._drop_column(c)
+        except KeyError as e:
+            if errors == "ignore":
+                pass
+            else:
+                raise e
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 926aad368b0..275d085ef5d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import functools
 import warnings
 from collections import OrderedDict, abc as abc
-from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload
 
 import cupy
 import numpy as np
@@ -18,6 +18,7 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf._typing import ColumnLike, DataFrameOrSeries
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils.dtypes import (
     is_categorical_dtype,
@@ -3852,3 +3853,84 @@ def _is_series(obj):
     instead of checking for isinstance(obj, cudf.Series)
     """
     return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None
+
+
+def _drop_rows_by_labels(
+    obj: DataFrameOrSeries,
+    labels: Union[ColumnLike, abc.Iterable, str],
+    level: Union[int, str],
+    errors: str,
+) -> DataFrameOrSeries:
+    """Remove rows specified by `labels`. If `errors="raise"`, a KeyError is
+    raised if some items in `labels` do not exist in `obj._index`.
+
+    Raises ValueError if an int `level` is >= the index's nlevels.
+    """
+    if isinstance(level, int) and level >= obj.index.nlevels:
+        raise ValueError("Param level out of bounds.")
+
+    if not isinstance(labels, (cudf.Series, cudf.Index)):
+        labels = as_column(labels)
+
+    if isinstance(obj._index, cudf.MultiIndex):
+        if level is None:
+            level = 0
+
+        levels_index = obj.index.get_level_values(level)
+        if errors == "raise" and not labels.isin(levels_index).all():
+            raise KeyError("One or more values not found in axis")
+
+        if isinstance(level, int):
+            ilevel = level
+        else:
+            ilevel = obj._index.names.index(level)
+
+        # 1. Merge Index df and data df along column axis:
+        # | id | ._index df | data column(s) |
+        idx_nlv = obj._index.nlevels
+        working_df = obj._index._source_data
+        working_df.columns = [i for i in range(idx_nlv)]
+        for i, col in enumerate(obj._data):
+            working_df[idx_nlv + i] = obj._data[col]
+        # 2. Set `level` as common index:
+        # | level | ._index df w/o level | data column(s) |
+        working_df = working_df.set_index(level)
+
+        # 3. Use "leftanti" join to drop
+        # TODO: use internal API with "leftanti" and specify left and right
+        # join keys to bypass logic check
+        to_join = cudf.DataFrame(index=cudf.Index(labels, name=level))
+        join_res = working_df.join(to_join, how="leftanti")
+
+        # 4. Reconstruct original layout, and rename
+        join_res.insert(
+            ilevel, name=join_res._index.name, value=join_res._index
+        )
+        join_res = join_res.reset_index(drop=True)
+
+        midx = cudf.MultiIndex.from_frame(
+            join_res.iloc[:, 0:idx_nlv], names=obj._index.names
+        )
+
+        if isinstance(obj, cudf.Series):
+            return obj.__class__._from_data(
+                join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name
+            )
+        else:
+            return obj.__class__._from_data(
+                join_res.iloc[:, idx_nlv:]._data,
+                index=midx,
+                columns=obj.columns,
+            )
+
+    else:
+        if errors == "raise" and not labels.isin(obj.index).all():
+            raise KeyError("One or more values not found in axis")
+
+        key_df = cudf.DataFrame(index=labels)
+        if isinstance(obj, cudf.Series):
+            res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"]
+            res.name = obj.name
+            return res
+        else:
+            return obj.join(key_df, how="leftanti")
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 2a990eef32e..9bffd28ced5 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -37,7 +37,7 @@
 from cudf.core.column.lists import ListMethods
 from cudf.core.column.string import StringMethods
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.core.frame import Frame
+from cudf.core.frame import Frame, _drop_rows_by_labels
 from cudf.core.groupby.groupby import SeriesGroupBy
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer
@@ -545,6 +545,128 @@ def to_arrow(self):
         """
         return self._column.to_arrow()
 
+    def drop(
+        self,
+        labels=None,
+        axis=0,
+        index=None,
+        columns=None,
+        level=None,
+        inplace=False,
+        errors="raise",
+    ):
+        """
+        Return Series with specified index labels removed.
+
+        Remove elements of a Series based on specifying the index labels.
+        When using a multi-index, labels on different levels can be removed by
+        specifying the level.
+
+        Parameters
+        ----------
+        labels : single label or list-like
+            Index labels to drop.
+        axis : 0, default 0
+            Redundant for application on Series.
+        index : single label or list-like
+            Redundant for application on Series. But ``index`` can be used
+            instead of ``labels``
+        columns : single label or list-like
+            This parameter is ignored. Use ``index`` or ``labels`` to specify.
+        level : int or level name, optional
+            For MultiIndex, level from which the labels will be removed.
+        inplace : bool, default False
+            If False, return a copy. Otherwise, do operation
+            inplace and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress error and only existing labels are
+            dropped.
+
+        Returns
+        -------
+        Series or None
+            Series with specified index labels removed or None if
+            ``inplace=True``
+
+        Raises
+        ------
+        KeyError
+            If any of the labels is not found in the selected axis and
+            ``errors='raise'``
+
+        See Also
+        --------
+        Series.reindex
+            Return only specified index labels of Series
+        Series.dropna
+            Return series without null values
+        Series.drop_duplicates
+            Return series with duplicate values removed
+        cudf.core.dataframe.DataFrame.drop
+            Drop specified labels from rows or columns in dataframe
+
+        Examples
+        --------
+        >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z'])
+        >>> s
+        x    1
+        y    2
+        z    3
+        dtype: int64
+
+        Drop labels x and z
+
+        >>> s.drop(labels=['x', 'z'])
+        y    2
+        dtype: int64
+
+        Drop a label from the second level in MultiIndex Series.
+
+        >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']])
+        >>> s = cudf.Series(range(6), index=midx)
+        >>> s
+        0  x    0
+           y    1
+        1  x    2
+           y    3
+        2  x    4
+           y    5
+        >>> s.drop(labels='y', level=1)
+        0  x    0
+        1  x    2
+        2  x    4
+        """
+        if labels is not None:
+            if index is not None or columns is not None:
+                raise ValueError(
+                    "Cannot specify both 'labels' and 'index'/'columns'"
+                )
+            if axis == 1:
+                raise ValueError("No axis named 1 for object type Series")
+            target = labels
+        elif index is not None:
+            target = index
+        elif columns is not None:
+            target = []  # Ignore parameter columns
+        else:
+            raise ValueError(
+                "Need to specify at least one of 'labels', "
+                "'index' or 'columns'"
+            )
+
+        if inplace:
+            out = self
+        else:
+            out = self.copy()
+
+        dropped = _drop_rows_by_labels(out, target, level, errors)
+
+        out._data = dropped._data
+        out._index = dropped._index
+
+        if not inplace:
+            return out
+
     def __copy__(self, deep=True):
         return self.copy(deep)
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 69f6ecfeb17..ffd66e18314 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -524,6 +524,15 @@ def test_dataframe_drop_raises():
         expected_error_message="One or more values not found in axis",
     )
 
+    # label dtype mismatch
+    assert_exceptions_equal(
+        lfunc=pdf.drop,
+        rfunc=df.drop,
+        lfunc_args_and_kwargs=([3],),
+        rfunc_args_and_kwargs=([3],),
+        expected_error_message="One or more values not found in axis",
+    )
+
     expect = pdf.drop("p", errors="ignore")
     actual = df.drop("p", errors="ignore")
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index ab9d3d91f73..a1b4236719d 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -940,3 +940,181 @@ def test_fillna_with_nan(data, nan_as_null, fill_value):
     actual = gs.fillna(fill_value)
 
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "ps",
+    [
+        pd.Series(["a"] * 20, index=range(0, 20)),
+        pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"),
+    ],
+)
+@pytest.mark.parametrize(
+    "labels",
+    [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_series_drop_labels(ps, labels, inplace):
+    ps = ps.copy()
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.drop(labels=labels, axis=0, inplace=inplace)
+    actual = gs.drop(labels=labels, axis=0, inplace=inplace)
+
+    if inplace:
+        expected = ps
+        actual = gs
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "ps",
+    [
+        pd.Series(["a"] * 20, index=range(0, 20)),
+        pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"),
+    ],
+)
+@pytest.mark.parametrize(
+    "index",
+    [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_series_drop_index(ps, index, inplace):
+    ps = ps.copy()
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.drop(index=index, inplace=inplace)
+    actual = gs.drop(index=index, inplace=inplace)
+
+    if inplace:
+        expected = ps
+        actual = gs
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "ps",
+    [
+        pd.Series(
+            ["a" if i % 2 == 0 else "b" for i in range(0, 10)],
+            index=pd.MultiIndex(
+                levels=[
+                    ["lama", "cow", "falcon"],
+                    ["speed", "weight", "length"],
+                ],
+                codes=[
+                    [0, 0, 0, 1, 1, 1, 2, 2, 2, 1],
+                    [0, 1, 2, 0, 1, 2, 0, 1, 2, 1],
+                ],
+            ),
+            name="abc",
+        )
+    ],
+)
+@pytest.mark.parametrize(
+    "index,level",
+    [
+        ("cow", 0),
+        ("lama", 0),
+        ("falcon", 0),
+        ("speed", 1),
+        ("weight", 1),
+        ("length", 1),
+        pytest.param(
+            "cow",
+            None,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/36293"
+            ),
+        ),
+        pytest.param(
+            "lama",
+            None,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/36293"
+            ),
+        ),
+        pytest.param(
+            "falcon",
+            None,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/36293"
+            ),
+        ),
+    ],
+)
+@pytest.mark.parametrize("inplace", [True, False])
+def test_series_drop_multiindex(ps, index, level, inplace):
+    ps = ps.copy()
+    gs = cudf.from_pandas(ps)
+
+    expected = ps.drop(index=index, inplace=inplace, level=level)
+    actual = gs.drop(index=index, inplace=inplace, level=level)
+
+    if inplace:
+        expected = ps
+        actual = gs
+
+    assert_eq(expected, actual)
+
+
+def test_series_drop_edge_inputs():
+    gs = cudf.Series([42], name="a")
+    ps = gs.to_pandas()
+
+    assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"]))
+
+    assert_eq(ps.drop(columns="b"), gs.drop(columns="b"))
+
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}),
+        rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}),
+        expected_error_message="Cannot specify both",
+    )
+
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=([], {}),
+        rfunc_args_and_kwargs=([], {}),
+        expected_error_message="Need to specify at least one",
+    )
+
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=(["b"], {"axis": 1}),
+        rfunc_args_and_kwargs=(["b"], {"axis": 1}),
+        expected_error_message="No axis named 1",
+    )
+
+
+def test_series_drop_raises():
+    gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c")
+    ps = gs.to_pandas()
+
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=(["p"],),
+        rfunc_args_and_kwargs=(["p"],),
+        expected_error_message="One or more values not found in axis",
+    )
+
+    # label dtype mismatch
+    assert_exceptions_equal(
+        lfunc=ps.drop,
+        rfunc=gs.drop,
+        lfunc_args_and_kwargs=([3],),
+        rfunc_args_and_kwargs=([3],),
+        expected_error_message="One or more values not found in axis",
+    )
+
+    expect = ps.drop("p", errors="ignore")
+    actual = gs.drop("p", errors="ignore")
+
+    assert_eq(actual, expect)

From 85edbfa325c9dba6f107b25ec7a9b084bdb40917 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 10 Mar 2021 12:03:46 -0800
Subject: [PATCH 10/33] Support `Series.__setitem__` with key to a new row
 (#7443)

Closes #7290

Supports assigning to a new row (specified by a new label) in a series.

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - @brandon-b-miller
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7443
---
 python/cudf/cudf/core/indexing.py       | 30 +++++++++++++++++++++++--
 python/cudf/cudf/tests/test_indexing.py | 25 +++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index 653004eaee4..aec931fefbf 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -7,9 +7,12 @@
 from nvtx import annotate
 
 import cudf
+from cudf._lib.concat import concat_columns
 from cudf._lib.scalar import _is_null_host_scalar
-from cudf._typing import DataFrameOrSeries, ScalarLike
+from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike
+from cudf.core.column.column import as_column
 from cudf.utils.dtypes import (
+    find_common_type,
     is_categorical_dtype,
     is_column_like,
     is_list_like,
@@ -142,7 +145,19 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]:
         return self._sr.iloc[arg]
 
     def __setitem__(self, key, value):
-        key = self._loc_to_iloc(key)
+        try:
+            key = self._loc_to_iloc(key)
+        except KeyError as e:
+            if (
+                is_scalar(key)
+                and not isinstance(self._sr.index, cudf.MultiIndex)
+                and is_scalar(value)
+            ):
+                _append_new_row_inplace(self._sr.index._values, key)
+                _append_new_row_inplace(self._sr._column, value)
+                return
+            else:
+                raise e
         if isinstance(value, (pd.Series, cudf.Series)):
             value = cudf.Series(value)
             value = value._align_to_index(self._sr.index, how="right")
@@ -481,3 +496,14 @@ def _normalize_dtypes(df):
         for name, col in df._data.items():
             df[name] = col.astype(normalized_dtype)
     return df
+
+
+def _append_new_row_inplace(col: ColumnLike, value: ScalarLike):
+    """Append a scalar `value` to the end of `col` inplace.
+       Cast to common type if possible
+    """
+    to_type = find_common_type([type(value), col.dtype])
+    val_col = as_column(value, dtype=to_type)
+    old_col = col.astype(to_type)
+
+    col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True)
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 15d504799e4..558700f1f89 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -1043,6 +1043,10 @@ def test_series_setitem_string(key, value):
     [
         ("a", 4),
         ("b", 4),
+        ("b", np.int8(8)),
+        ("d", 4),
+        ("d", np.int8(16)),
+        ("d", np.float32(16)),
         (["a", "b"], 4),
         (["a", "b"], [4, 5]),
         ([True, False, True], 4),
@@ -1058,6 +1062,27 @@ def test_series_setitem_loc(key, value):
     assert_eq(psr, gsr)
 
 
+@pytest.mark.parametrize(
+    "key, value",
+    [
+        (1, "d"),
+        (2, "e"),
+        (4, "f"),
+        ([1, 3], "g"),
+        ([1, 3], ["g", "h"]),
+        ([True, False, True], "i"),
+        ([False, False, False], "j"),
+        ([True, False, True], ["k", "l"]),
+    ],
+)
+def test_series_setitem_loc_numeric_index(key, value):
+    psr = pd.Series(["a", "b", "c"], [1, 2, 3])
+    gsr = cudf.from_pandas(psr)
+    psr.loc[key] = value
+    gsr.loc[key] = value
+    assert_eq(psr, gsr)
+
+
 @pytest.mark.parametrize(
     "key, value",
     [

From 2d055c35b498a59c33e95137376f341817263047 Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Wed, 10 Mar 2021 14:41:25 -0700
Subject: [PATCH 11/33] =?UTF-8?q?Fix=20offset=5Fend=20iterator=20for=20lis?=
 =?UTF-8?q?ts=5Fcolumn=5Fview,=20which=20was=20not=20correctl=E2=80=A6=20(?=
 =?UTF-8?q?#7551)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the `offsets_end` iterator in `lists_column_view`. Since the offsets column has one more element than the column has rows, `offsets_end()` should be computed as `offsets_begin() + size() + 1`, or equivalently as `offsets_begin() + offsets().size()`.

This PR blocks https://github.com/rapidsai/cudf/pull/7528, thus it must be merged before that PR.

Authors:
  - Nghia Truong (@ttnghia)

Approvers:
  - Jake Hemstad (@jrhemstad)
  - Mike Wilson (@hyperbolic2346)
  - Vukasin Milovanovic (@vuule)

URL: https://github.com/rapidsai/cudf/pull/7551
---
 cpp/include/cudf/lists/lists_column_view.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index 8cabf5287c8..6c5cfc37eab 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -107,7 +107,7 @@ class lists_column_view : private column_view {
    *
    * @return int32_t const* Pointer to one past the last offset
    */
-  offset_iterator offsets_end() const noexcept { return offsets_begin() + size(); }
+  offset_iterator offsets_end() const noexcept { return offsets_begin() + offsets().size(); }
 };
 /** @} */  // end of group
 }  // namespace cudf

From 35f3f70436514de3b6fc90372681438567d3bfb6 Mon Sep 17 00:00:00 2001
From: chenrui17 <33319780+chenrui17@users.noreply.github.com>
Date: Thu, 11 Mar 2021 06:01:42 +0800
Subject: [PATCH 12/33] Fix no such file dlpack.h error when build libcudf
 (#7549)

close #7548

Authors:
  - @chenrui17

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7549
---
 cpp/cmake/thirdparty/CUDF_GetDLPack.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake b/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake
index 5aaf5eaa434..b41c6d3b8d2 100644
--- a/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake
@@ -16,7 +16,7 @@
 
 function(find_and_configure_dlpack VERSION)
     if(DLPACK_INCLUDE)
-        set(DLPACK_INCLUDE_DIR "${DLPACK_INCLUDE_DIR}" PARENT_SCOPE)
+        set(DLPACK_INCLUDE_DIR "${DLPACK_INCLUDE}" PARENT_SCOPE)
         return()
     endif()
     find_path(DLPACK_INCLUDE_DIR "dlpack"

From f41c10c3b39240a3f7008e5e38faf61f933c552c Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 10 Mar 2021 17:18:16 -0600
Subject: [PATCH 13/33] Fix index mismatch issue in equality related APIs
 (#7555)

Fixes: #7536

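This PR enables re-indexing in the comparison methods (`eq`, `ne`, `lt`, `le`, `gt`, `ge`). Note that the corresponding dunder methods (`__eq__`, `__ne__`, etc.) will still raise an error when the indexes of the two Series do not match.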

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7555
---
 python/cudf/cudf/core/series.py       | 252 ++++++++++++++++++++++++--
 python/cudf/cudf/tests/test_binops.py |  19 ++
 2 files changed, 257 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 9bffd28ced5..11e32e2285d 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1490,7 +1490,9 @@ def __repr__(self):
         return "\n".join(lines)
 
     @annotate("BINARY_OP", color="orange", domain="cudf_python")
-    def _binaryop(self, other, fn, fill_value=None, reflect=False):
+    def _binaryop(
+        self, other, fn, fill_value=None, reflect=False, can_reindex=False
+    ):
         """
         Internal util to call a binary operator *fn* on operands *self*
         and *other*.  Return the output Series.  The output dtype is
@@ -1505,7 +1507,7 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False):
 
         result_name = utils.get_result_name(self, other)
         if isinstance(other, Series):
-            if fn in cudf.utils.utils._EQUALITY_OPS:
+            if not can_reindex and fn in cudf.utils.utils._EQUALITY_OPS:
                 if not self.index.equals(other.index):
                     raise ValueError(
                         "Can only compare identically-labeled "
@@ -2318,10 +2320,47 @@ def eq(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g'])
+        >>> a
+        a       1
+        c       2
+        d       3
+        e    <NA>
+        f      10
+        g      20
+        dtype: int64
+        >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> b
+        a     -10
+        b      23
+        c      -1
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.eq(b, fill_value=2)
+        a    False
+        b    False
+        c    False
+        d    False
+        e     <NA>
+        f    False
+        g    False
+        dtype: bool
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(other, "eq", fill_value)
+        return self._binaryop(
+            other=other, fn="eq", fill_value=fill_value, can_reindex=True
+        )
 
     def __eq__(self, other):
         return self._binaryop(other, "eq")
@@ -2336,10 +2375,47 @@ def ne(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g'])
+        >>> a
+        a       1
+        c       2
+        d       3
+        e    <NA>
+        f      10
+        g      20
+        dtype: int64
+        >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> b
+        a     -10
+        b      23
+        c      -1
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.ne(b, fill_value=2)
+        a    True
+        b    True
+        c    True
+        d    True
+        e    <NA>
+        f    True
+        g    True
+        dtype: bool
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(other, "ne", fill_value)
+        return self._binaryop(
+            other=other, fn="ne", fill_value=fill_value, can_reindex=True
+        )
 
     def __ne__(self, other):
         return self._binaryop(other, "ne")
@@ -2354,10 +2430,47 @@ def lt(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g'])
+        >>> a
+        a       1
+        c       2
+        d       3
+        e    <NA>
+        f      10
+        g      20
+        dtype: int64
+        >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> b
+        a     -10
+        b      23
+        c      -1
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.lt(b, fill_value=-10)
+        a    False
+        b     True
+        c    False
+        d    False
+        e     <NA>
+        f    False
+        g    False
+        dtype: bool
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(other, "lt", fill_value)
+        return self._binaryop(
+            other=other, fn="lt", fill_value=fill_value, can_reindex=True
+        )
 
     def __lt__(self, other):
         return self._binaryop(other, "lt")
@@ -2372,10 +2485,47 @@ def le(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g'])
+        >>> a
+        a       1
+        c       2
+        d       3
+        e    <NA>
+        f      10
+        g      20
+        dtype: int64
+        >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> b
+        a     -10
+        b      23
+        c      -1
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.le(b, fill_value=-10)
+        a    False
+        b     True
+        c    False
+        d    False
+        e     <NA>
+        f    False
+        g    False
+        dtype: bool
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(other, "le", fill_value)
+        return self._binaryop(
+            other=other, fn="le", fill_value=fill_value, can_reindex=True
+        )
 
     def __le__(self, other):
         return self._binaryop(other, "le")
@@ -2390,10 +2540,47 @@ def gt(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g'])
+        >>> a
+        a       1
+        c       2
+        d       3
+        e    <NA>
+        f      10
+        g      20
+        dtype: int64
+        >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> b
+        a     -10
+        b      23
+        c      -1
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.gt(b)
+        a     True
+        b    False
+        c     True
+        d    False
+        e    False
+        f    False
+        g    False
+        dtype: bool
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(other, "gt", fill_value)
+        return self._binaryop(
+            other=other, fn="gt", fill_value=fill_value, can_reindex=True
+        )
 
     def __gt__(self, other):
         return self._binaryop(other, "gt")
@@ -2408,10 +2595,47 @@ def ge(self, other, fill_value=None, axis=0):
         fill_value : None or value
             Value to fill nulls with before computation. If data in both
             corresponding Series locations is null the result will be null
-        """
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g'])
+        >>> a
+        a       1
+        c       2
+        d       3
+        e    <NA>
+        f      10
+        g      20
+        dtype: int64
+        >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e'])
+        >>> b
+        a     -10
+        b      23
+        c      -1
+        d    <NA>
+        e    <NA>
+        dtype: int64
+        >>> a.ge(b)
+        a     True
+        b    False
+        c     True
+        d    False
+        e    False
+        f    False
+        g    False
+        dtype: bool
+        """  # noqa: E501
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(other, "ge", fill_value)
+        return self._binaryop(
+            other=other, fn="ge", fill_value=fill_value, can_reindex=True
+        )
 
     def __ge__(self, other):
         return self._binaryop(other, "ge")
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 579716f8277..a0b65743180 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -1726,3 +1726,22 @@ def decimal_series(input, dtype):
     got = op(a, b)
     assert expect.dtype == got.dtype
     utils.assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"])
+def test_equality_ops_index_mismatch(fn):
+    a = cudf.Series(
+        [1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"]
+    )
+    b = cudf.Series(
+        [-5, 4, 3, 2, 1, 0, 19, 11],
+        index=["aa", "b", "c", "d", "e", "f", "y", "z"],
+    )
+
+    pa = a.to_pandas()
+    pb = b.to_pandas()
+
+    expected = getattr(pa, fn)(pb)
+    actual = getattr(a, fn)(b)
+
+    utils.assert_eq(expected, actual)

From c76949e5476ce0a190c0f7b1658876846ee7551a Mon Sep 17 00:00:00 2001
From: Dillon Cullinan <dcullinan92@gmail.com>
Date: Wed, 10 Mar 2021 19:49:51 -0500
Subject: [PATCH 14/33] FIX Fix Anaconda upload args (#7558)

Fixes the upload arguments introduced in this PR: https://github.com/rapidsai/cudf/pull/7540

Order matters!

Authors:
  - Dillon Cullinan (@dillon-cullinan)

Approvers:
  - Keith Kraus (@kkraus14)
  - AJ Schmidt (@ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/7558
---
 ci/cpu/upload.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
index 82060ad40ef..4f72f6dd772 100755
--- a/ci/cpu/upload.sh
+++ b/ci/cpu/upload.sh
@@ -45,36 +45,36 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then
   test -e ${LIBCUDF_FILE}
   echo "Upload libcudf"
   echo ${LIBCUDF_FILE}
-  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_FILE}
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_FILE} --no-progress
 fi
 
 if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then
   test -e ${CUDF_FILE}
   echo "Upload cudf"
   echo ${CUDF_FILE}
-  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE}
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE} --no-progress
 
   test -e ${DASK_CUDF_FILE}
   echo "Upload dask-cudf"
   echo ${DASK_CUDF_FILE}
-  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE}
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE} --no-progress
 
   test -e ${CUSTREAMZ_FILE}
   echo "Upload custreamz"
   echo ${CUSTREAMZ_FILE}
-  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUSTREAMZ_FILE}
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUSTREAMZ_FILE} --no-progress
 fi
 
 if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then
   test -e ${LIBCUDF_KAFKA_FILE}
   echo "Upload libcudf_kafka"
   echo ${LIBCUDF_KAFKA_FILE}
-  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_KAFKA_FILE}
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${LIBCUDF_KAFKA_FILE} --no-progress
 fi
 
 if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF_KAFKA" == "1" ]]; then
   test -e ${CUDF_KAFKA_FILE}
   echo "Upload cudf_kafka"
   echo ${CUDF_KAFKA_FILE}
-  gpuci_retry anaconda --no-progress -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_KAFKA_FILE}
+  gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_KAFKA_FILE} --no-progress
 fi

From 8cd927f3736424bcc3b3d30c76f6f4c64fef441d Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 11 Mar 2021 09:57:38 -0500
Subject: [PATCH 15/33] Change device_vector to device_uvector in nvtext source
 files (#7512)

Reference #7287
This PR changes `cpp/src/text/*` source files to use `rmm::device_uvector` instead of `rmm::device_vector`. This allows keeping the memory operations on the provided kernel stream.

Authors:
  - David (@davidwendt)

Approvers:
  - Paul Taylor (@trxcllnt)
  - Mark Harris (@harrism)

URL: https://github.com/rapidsai/cudf/pull/7512
---
 cpp/src/text/generate_ngrams.cu | 14 ++++---------
 cpp/src/text/ngrams_tokenize.cu | 35 +++++++++++++++++----------------
 cpp/src/text/tokenize.cu        |  9 ++++++---
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 79154232394..3c583622ed8 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -221,7 +221,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
   auto const d_strings      = *strings_column;
 
   // create a vector of ngram offsets for each string
-  rmm::device_vector<int32_t> ngram_offsets(strings_count + 1);
+  rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
   thrust::transform_exclusive_scan(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
@@ -235,14 +235,8 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
     cudf::size_type{0},
     thrust::plus<cudf::size_type>());
 
-  // total count is the last entry
-  auto const d_ngram_offsets   = ngram_offsets.data().get();
-  cudf::size_type total_ngrams = 0;
-  CUDA_TRY(cudaMemcpyAsync(&total_ngrams,
-                           d_ngram_offsets + strings_count,
-                           sizeof(cudf::size_type),
-                           cudaMemcpyDeviceToHost,
-                           stream.value()));
+  // total ngrams count is the last entry
+  cudf::size_type const total_ngrams = ngram_offsets.back_element(stream);
   CUDF_EXPECTS(total_ngrams > 0,
                "Insufficient number of characters in each string to generate ngrams");
 
@@ -254,7 +248,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
                                                   mr);
   auto d_offsets      = offsets_column->mutable_view().data<int32_t>();
   // compute the size of each ngram -- output goes in d_offsets
-  character_ngram_generator_fn generator{d_strings, ngrams, d_ngram_offsets, d_offsets};
+  character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data(), d_offsets};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<cudf::size_type>(0),
                      strings_count,
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index 18bc86f6478..96b06e7a1eb 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -158,21 +158,22 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
 
   // first, get the number of tokens per string to get the token-offsets
   // Ex. token-counts = [3,2]; token-offsets = [0,3,5]
-  rmm::device_vector<int32_t> token_offsets(strings_count + 1);
-  auto d_token_offsets = token_offsets.data().get();
+  rmm::device_uvector<int32_t> token_offsets(strings_count + 1, stream);
+  auto d_token_offsets = token_offsets.data();
   thrust::transform_inclusive_scan(rmm::exec_policy(stream),
                                    thrust::make_counting_iterator<cudf::size_type>(0),
                                    thrust::make_counting_iterator<cudf::size_type>(strings_count),
                                    d_token_offsets + 1,
                                    strings_tokenizer{d_strings, d_delimiter},
                                    thrust::plus<int32_t>());
-  CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream.value()));
-  auto total_tokens = token_offsets[strings_count];  // Ex. 5 tokens
+  int32_t const zero = 0;
+  token_offsets.set_element_async(0, zero, stream);
+  auto const total_tokens = token_offsets.back_element(stream);  // Ex. 5 tokens
 
   // get the token positions (in bytes) per string
   // Ex. start/end pairs: [(0,1),(2,4),(5,8), (0,2),(3,4)]
-  rmm::device_vector<position_pair> token_positions(total_tokens);
-  auto d_token_positions = token_positions.data().get();
+  rmm::device_uvector<position_pair> token_positions(total_tokens, stream);
+  auto d_token_positions = token_positions.data();
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
@@ -181,8 +182,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
 
   // compute the number of ngrams per string to get the total number of ngrams to generate
   // Ex. ngram-counts = [2,1]; ngram-offsets = [0,2,3]; total = 3 bigrams
-  rmm::device_vector<int32_t> ngram_offsets(strings_count + 1);
-  auto d_ngram_offsets = ngram_offsets.data().get();
+  rmm::device_uvector<int32_t> ngram_offsets(strings_count + 1, stream);
+  auto d_ngram_offsets = ngram_offsets.data();
   thrust::transform_inclusive_scan(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
@@ -193,8 +194,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
       return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
     },
     thrust::plus<int32_t>());
-  CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream.value()));
-  auto total_ngrams = ngram_offsets[strings_count];
+  ngram_offsets.set_element_async(0, zero, stream);
+  auto const total_ngrams = ngram_offsets.back_element(stream);
 
   // Compute the total size of the ngrams for each string (not for each ngram)
   // Ex. 2 bigrams in 1st string total to 10 bytes; 1 bigram in 2nd string is 4 bytes
@@ -204,8 +205,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
   // ngrams for each string.
   // Ex. bigram for first string produces 2 bigrams ("a_bb","bb_ccc") which
   //     is built in memory like this: "a_bbbb_ccc"
-  rmm::device_vector<int32_t> chars_offsets(strings_count + 1);  // output memory offsets
-  auto d_chars_offsets = chars_offsets.data().get();             // per input string
+  rmm::device_uvector<int32_t> chars_offsets(strings_count + 1, stream);  // output memory offsets
+  auto d_chars_offsets = chars_offsets.data();                            // per input string
   thrust::transform_inclusive_scan(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),
@@ -213,11 +214,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
     d_chars_offsets + 1,
     ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions},
     thrust::plus<int32_t>());
-  CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream.value()));
-  auto output_chars_size = chars_offsets[strings_count];  // Ex. 14 output bytes total
+  chars_offsets.set_element_async(0, zero, stream);
+  auto const output_chars_size = chars_offsets.back_element(stream);  // Ex. 14 output bytes total
 
-  rmm::device_vector<int32_t> ngram_sizes(total_ngrams);  // size in bytes of each
-  auto d_ngram_sizes = ngram_sizes.data().get();          // ngram to generate
+  rmm::device_uvector<int32_t> ngram_sizes(total_ngrams, stream);  // size in bytes of each
+  auto d_ngram_sizes = ngram_sizes.data();                         // ngram to generate
 
   // build chars column
   auto chars_column = cudf::strings::detail::create_chars_child_column(
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 0ba51f7639f..1b7e457367e 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -19,6 +19,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
@@ -27,9 +28,10 @@
 #include <text/utilities/tokenize_ops.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/count.h>
+#include <thrust/copy.h>
 #include <thrust/transform.h>
 
 namespace nvtext {
@@ -75,7 +77,8 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
                          d_token_counts.template begin<int32_t>(),
                          d_token_counts.template end<int32_t>(),
                          token_offsets.begin() + 1);
-  CUDA_TRY(cudaMemsetAsync(token_offsets.data(), 0, sizeof(int32_t), stream.value()));
+  int32_t const zero = 0;
+  token_offsets.set_element_async(0, zero, stream);
   auto const total_tokens = token_offsets.back_element(stream);
   // build a list of pointers to each token
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
@@ -87,7 +90,7 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
                      strings_count,
                      tokenizer);
   // create the strings column using the tokens pointers
-  return cudf::make_strings_column(tokens, stream, mr);
+  return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr);
 }
 
 }  // namespace

From 3355e6039c36e36480af2287f1d0dafc5a87cf9b Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 11 Mar 2021 10:03:10 -0500
Subject: [PATCH 16/33] Removed unneeded includes from traits.hpp (#7509)

The `cpp/include/cudf/utilities/traits.hpp` file is parsed when building most libcudf source files (~200). This PR removes a couple of unneeded header includes to help reduce compile-time dependencies. Only a couple of files that relied on `traits.hpp` including these headers for them needed to be updated.

Authors:
  - David (@davidwendt)

Approvers:
  - Paul Taylor (@trxcllnt)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7509
---
 cpp/include/cudf/utilities/traits.hpp | 2 --
 cpp/src/interop/dlpack.cpp            | 2 ++
 cpp/src/io/csv/csv_gpu.cu             | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 1153d4f8ff3..e045476ea77 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include <cudf/lists/list_view.cuh>
-#include <cudf/structs/struct_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/durations.hpp>
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 46d070e14af..84dd41907d2 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -15,6 +15,8 @@
  */
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/interop.hpp>
+#include <cudf/lists/list_view.cuh>
+#include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 67c6a49ed28..86e5f1fdcae 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -26,6 +26,7 @@
 #include <cudf/lists/list_view.cuh>
 #include <cudf/null_mask.hpp>
 #include <cudf/strings/string_view.cuh>
+#include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>

From eaec3db3dfb5dfe38ad51ab6e24d9a5710e5875e Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Thu, 11 Mar 2021 12:08:21 -0700
Subject: [PATCH 17/33] Fix cudf::lists::sort_lists failing for sliced column
 (#7564)

This fixes #7530 (`cudf::lists::sort_lists` fails for sliced column).

I also added more tests for sliced columns to cover the previously failing cases, and added a header `lists/detail/sorting.hpp` to expose the internal `detail::sort_lists` API which accepts a stream parameter.

Authors:
  - Nghia Truong (@ttnghia)

Approvers:
  - David (@davidwendt)
  - AJ Schmidt (@ajschmidt8)
  - Karthikeyan (@karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/7564
---
 conda/recipes/libcudf/meta.yaml           |  1 +
 cpp/include/cudf/lists/detail/sorting.hpp | 39 +++++++++++++++++++++++
 cpp/src/lists/segmented_sort.cu           | 13 ++++----
 cpp/tests/lists/sort_lists_tests.cpp      | 20 +++++++++---
 4 files changed, 62 insertions(+), 11 deletions(-)
 create mode 100644 cpp/include/cudf/lists/detail/sorting.hpp

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index a46712def28..b23977086d3 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -132,6 +132,7 @@ test:
     - test -f $PREFIX/include/cudf/join.hpp
     - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
     - test -f $PREFIX/include/cudf/lists/detail/copying.hpp
+    - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
     - test -f $PREFIX/include/cudf/lists/count_elements.hpp
     - test -f $PREFIX/include/cudf/lists/extract.hpp
     - test -f $PREFIX/include/cudf/lists/contains.hpp
diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp
new file mode 100644
index 00000000000..f68ff872020
--- /dev/null
+++ b/cpp/include/cudf/lists/detail/sorting.hpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/lists/lists_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace lists {
+namespace detail {
+
+/**
+ * @copydoc cudf::lists::sort_lists
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> sort_lists(
+  lists_column_view const& input,
+  order column_order,
+  null_order null_precedence,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+}  // namespace detail
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index 5681f7767e0..3bbbc9b16b7 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -167,6 +167,7 @@ struct SegmentedSortColumn {
         auto const null_replace_T = null_precedence == null_order::AFTER
                                       ? std::numeric_limits<T>::max()
                                       : std::numeric_limits<T>::min();
+
         auto device_child = column_device_view::create(child, stream);
         auto keys_in =
           cudf::detail::make_null_replacement_iterator<T>(*device_child, null_replace_T);
@@ -224,15 +225,13 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
                                    rmm::mr::device_memory_resource* mr)
 {
   if (input.is_empty()) return empty_like(input.parent());
-  auto segment_offsets =
-    cudf::detail::slice(input.offsets(), {input.offset(), input.offsets().size()}, stream)[0];
-  // Copy list offsets.
-  auto output_offset = allocate_like(segment_offsets, mask_allocation_policy::RETAIN, mr);
+  auto output_offset = make_numeric_column(
+    input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr);
   thrust::transform(rmm::exec_policy(stream),
-                    segment_offsets.begin<size_type>(),
-                    segment_offsets.end<size_type>(),
+                    input.offsets_begin(),
+                    input.offsets_end(),
                     output_offset->mutable_view().begin<size_type>(),
-                    [first = segment_offsets.begin<size_type>()] __device__(auto offset_index) {
+                    [first = input.offsets_begin()] __device__(auto offset_index) {
                       return offset_index - *first;
                     });
   // for numeric columns, calls Faster segmented radix sort path
diff --git a/cpp/tests/lists/sort_lists_tests.cpp b/cpp/tests/lists/sort_lists_tests.cpp
index ac73297f088..28fb29c7d3c 100644
--- a/cpp/tests/lists/sort_lists_tests.cpp
+++ b/cpp/tests/lists/sort_lists_tests.cpp
@@ -171,11 +171,23 @@ TEST_F(SortListsInt, Depth)
 TEST_F(SortListsInt, Sliced)
 {
   using T = int;
-  LCW<T> l1{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}};
-  auto sliced_list = cudf::slice(l1, {1, 4})[0];
+  LCW<T> l1{{3, 2, 1, 4}, {7, 5, 6}, {8, 9}, {10}};
 
-  auto results = sort_lists(lists_column_view{sliced_list}, {}, {});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), sliced_list);
+  auto sliced_list = cudf::slice(l1, {0, 4})[0];
+  auto results     = sort_lists(lists_column_view{sliced_list}, {}, {});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW<T>{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}});
+
+  sliced_list = cudf::slice(l1, {1, 4})[0];
+  results     = sort_lists(lists_column_view{sliced_list}, {}, {});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW<T>{{5, 6, 7}, {8, 9}, {10}});
+
+  sliced_list = cudf::slice(l1, {1, 2})[0];
+  results     = sort_lists(lists_column_view{sliced_list}, {}, {});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW<T>{{5, 6, 7}});
+
+  sliced_list = cudf::slice(l1, {0, 2})[0];
+  results     = sort_lists(lists_column_view{sliced_list}, {}, {});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW<T>{{1, 2, 3, 4}, {5, 6, 7}});
 }
 
 }  // namespace test

From 3bcd1afa873448638fac12f38211fc991b0ab15d Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 11 Mar 2021 17:18:33 -0500
Subject: [PATCH 18/33] Remove unneeded step parameter from
 strings::detail::copy_slice (#7525)

This started as a change converting some `device_vector` usages in `cpp/src/strings` source files to use `device_uvector` instead. The `cpp/src/strings/copying/copying.cu` source has the implementation for `cudf::strings::detail::copy_slice()` and used a `device_vector` to handle a `step` parameter. I can no longer find this parameter being used. I believe it was a holdover from porting nvstrings. So this PR mainly includes changes for removing this unneeded parameter, which also removes the need for the `device_vector` or temporary memory in this function.
It also includes changes to `attributes.cu` to use `device_uvector` as well.

~~I'm marking this as non-breaking change since it is a change to a `detail` API and did not seem to be used anywhere in this repo.~~

Reference #7287

Authors:
  - David (@davidwendt)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - Mike Wilson (@hyperbolic2346)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7525
---
 conda/recipes/libcudf/meta.yaml               |  2 +-
 .../cudf/strings/{ => detail}/copying.hpp     | 17 ++--
 cpp/src/column/column.cu                      |  4 +-
 cpp/src/strings/attributes.cu                 | 44 +++++----
 cpp/src/strings/copying/copying.cu            | 90 +++++++++----------
 cpp/tests/strings/array_tests.cu              | 58 +++++-------
 6 files changed, 93 insertions(+), 122 deletions(-)
 rename cpp/include/cudf/strings/{ => detail}/copying.hpp (82%)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index b23977086d3..74f4a20c066 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -168,10 +168,10 @@ test:
     - test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp
     - test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp
     - test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp
-    - test -f $PREFIX/include/cudf/strings/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/combine.hpp
     - test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp
     - test -f $PREFIX/include/cudf/strings/detail/converters.hpp
+    - test -f $PREFIX/include/cudf/strings/detail/copying.hpp
     - test -f $PREFIX/include/cudf/strings/detail/fill.hpp
     - test -f $PREFIX/include/cudf/strings/detail/replace.hpp
     - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp
diff --git a/cpp/include/cudf/strings/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp
similarity index 82%
rename from cpp/include/cudf/strings/copying.hpp
rename to cpp/include/cudf/strings/detail/copying.hpp
index b4455e2c3b4..19dfa193207 100644
--- a/cpp/include/cudf/strings/copying.hpp
+++ b/cpp/include/cudf/strings/detail/copying.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,24 +25,24 @@ namespace strings {
 namespace detail {
 /**
  * @brief Returns a new strings column created from a subset of
- * of the strings column. The subset of strings selected is between
- * start (inclusive) and end (exclusive) with increments of step.
+ * the strings column.
+ *
+ * The subset of strings selected is between
+ * start (inclusive) and end (exclusive).
  *
  * @code{.pseudo}
  * Example:
  * s1 = ["a", "b", "c", "d", "e", "f"]
- * s2 = slice( s1, 2 )
+ * s2 = copy_slice( s1, 2 )
  * s2 is ["c", "d", "e", "f"]
- * s3 = slice( s1, 1, 2 )
- * s3 is ["b", "d", "f"]
+ * s2 = copy_slice( s1, 1, 3 )
+ * s2 is ["b", "c"]
  * @endcode
  *
  * @param strings Strings instance for this operation.
  * @param start Index to first string to select in the column (inclusive).
  * @param end Index to last string to select in the column (exclusive).
  *            Default -1 indicates the last element.
- * @param step Increment value between indices.
- *             Default step is 1.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New strings column of size (end-start)/step.
@@ -51,7 +51,6 @@ std::unique_ptr<cudf::column> copy_slice(
   strings_column_view const& strings,
   size_type start,
   size_type end                       = -1,
-  size_type step                      = 1,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu
index fc1a0871e66..d30e5fc746a 100644
--- a/cpp/src/column/column.cu
+++ b/cpp/src/column/column.cu
@@ -25,7 +25,7 @@
 #include <cudf/lists/detail/copying.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/strings/copying.hpp>
+#include <cudf/strings/detail/copying.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
@@ -193,7 +193,7 @@ struct create_column_from_view {
   std::unique_ptr<column> operator()()
   {
     cudf::strings_column_view sview(view);
-    return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream, mr);
+    return cudf::strings::detail::copy_slice(sview, 0, view.size(), stream, mr);
   }
 
   template <typename ColumnType,
diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu
index c010da94231..bed86544ec7 100644
--- a/cpp/src/strings/attributes.cu
+++ b/cpp/src/strings/attributes.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/transform.h>
@@ -54,28 +54,26 @@ std::unique_ptr<column> counts_fn(strings_column_view const& strings,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
-  auto strings_count  = strings.size();
+  // create output column
+  auto results   = make_numeric_column(data_type{type_id::INT32},
+                                     strings.size(),
+                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
+                                     strings.null_count(),
+                                     stream,
+                                     mr);
+  auto d_lengths = results->mutable_view().data<int32_t>();
+  // input column device view
   auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
   auto d_strings      = *strings_column;
-  // create output column
-  auto results = std::make_unique<cudf::column>(
-    cudf::data_type{type_id::INT32},
-    strings_count,
-    rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr),
-    cudf::detail::copy_bitmask(strings.parent(), stream, mr),  // copy the null mask
-    strings.null_count());
-  auto results_view = results->mutable_view();
-  auto d_lengths    = results_view.data<int32_t>();
   // fill in the lengths
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator<cudf::size_type>(0),
-                    thrust::make_counting_iterator<cudf::size_type>(strings_count),
+                    thrust::make_counting_iterator<cudf::size_type>(strings.size()),
                     d_lengths,
                     [d_strings, ufn] __device__(size_type idx) {
-                      int32_t length = 0;
-                      if (!d_strings.is_null(idx))
-                        length = static_cast<int32_t>(ufn(d_strings.element<string_view>(idx)));
-                      return length;
+                      return d_strings.is_null(idx)
+                               ? 0
+                               : static_cast<int32_t>(ufn(d_strings.element<string_view>(idx)));
                     });
   results->set_null_count(strings.null_count());  // reset null count
   return results;
@@ -140,23 +138,23 @@ std::unique_ptr<column> code_points(
   auto d_column       = *strings_column;
 
   // create offsets vector to account for each string's character length
-  rmm::device_vector<size_type> offsets(strings.size() + 1);
-  size_type* d_offsets = offsets.data().get();
+  rmm::device_uvector<size_type> offsets(strings.size() + 1, stream);
   thrust::transform_inclusive_scan(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_counting_iterator<size_type>(strings.size()),
-    d_offsets + 1,
+    offsets.begin() + 1,
     [d_column] __device__(size_type idx) {
       size_type length = 0;
       if (!d_column.is_null(idx)) length = d_column.element<string_view>(idx).length();
       return length;
     },
     thrust::plus<size_type>());
-  CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream.value()));
+  size_type const zero = 0;
+  offsets.set_element_async(0, zero, stream);
 
   // the total size is the number of characters in the entire column
-  size_type num_characters = offsets.back();
+  size_type num_characters = offsets.back_element(stream);
   // create output column with no nulls
   auto results = make_numeric_column(
     data_type{type_id::INT32}, num_characters, mask_state::UNALLOCATED, stream, mr);
@@ -167,7 +165,7 @@ std::unique_ptr<column> code_points(
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      strings.size(),
-                     code_points_fn{d_column, d_offsets, d_results});
+                     code_points_fn{d_column, offsets.data(), d_results});
 
   results->set_null_count(0);
   return results;
diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu
index 80ef11ec456..cdf188bfdc5 100644
--- a/cpp/src/strings/copying/copying.cu
+++ b/cpp/src/strings/copying/copying.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,77 +14,67 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/gather.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/strings/copying.hpp>
+#include <cudf/strings/detail/copying.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/sequence.h>
-
 namespace cudf {
 namespace strings {
 namespace detail {
-// new strings column from subset of this strings instance
+
 std::unique_ptr<cudf::column> copy_slice(strings_column_view const& strings,
                                          size_type start,
                                          size_type end,
-                                         size_type step,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
-  if (step == 0) step = 1;
-  CUDF_EXPECTS(step > 0, "Parameter step must be positive integer.");
-  if (end < 0 || end > strings_count) end = strings_count;
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
+  if (end < 0 || end > strings.size()) end = strings.size();
   CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value.");
-  strings_count = cudf::util::round_up_safe<size_type>((end - start), step);
-  if (start == 0 && strings.offset() == 0 && step == 1) {
-    // sliced at the beginning and copying every step, so no need to gather
-    auto offsets_column = std::make_unique<cudf::column>(
-      cudf::slice(strings.offsets(), {0, strings_count + 1}).front(), stream, mr);
-    auto data_size =
-      cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
-    auto chars_column = std::make_unique<cudf::column>(
-      cudf::slice(strings.chars(), {0, data_size}).front(), stream, mr);
-    auto null_mask = cudf::detail::copy_bitmask(strings.null_mask(), 0, strings_count, stream, mr);
-    return make_strings_column(strings_count,
-                               std::move(offsets_column),
-                               std::move(chars_column),
-                               UNKNOWN_NULL_COUNT,
-                               std::move(null_mask),
-                               stream,
-                               mr);
+  auto const strings_count  = end - start;
+  auto const offsets_offset = start + strings.offset();
+
+  // slice the offsets child column
+  auto offsets_column = std::make_unique<cudf::column>(
+    cudf::slice(strings.offsets(), {offsets_offset, offsets_offset + strings_count + 1}).front(),
+    stream,
+    mr);
+  auto const chars_offset =
+    offsets_offset == 0 ? 0 : cudf::detail::get_value<int32_t>(offsets_column->view(), 0, stream);
+  if (chars_offset > 0) {
+    // adjust the individual offset values only if needed
+    auto d_offsets = offsets_column->mutable_view();
+    thrust::transform(rmm::exec_policy(stream),
+                      d_offsets.begin<int32_t>(),
+                      d_offsets.end<int32_t>(),
+                      d_offsets.begin<int32_t>(),
+                      [chars_offset] __device__(auto offset) { return offset - chars_offset; });
   }
 
-  // do the gather instead
-  // build indices
-  rmm::device_vector<size_type> indices(strings_count);
-  thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end(), start, step);
-  // create a column_view as a wrapper of these indices
-  column_view indices_view(
-    data_type{type_id::INT32}, strings_count, indices.data().get(), nullptr, 0);
-  // build a new strings column from the indices
-  auto sliced_table = cudf::detail::gather(table_view{{strings.parent()}},
-                                           indices_view,
-                                           cudf::out_of_bounds_policy::DONT_CHECK,
-                                           cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                           stream,
-                                           mr)
-                        ->release();
-  std::unique_ptr<column> output_column(std::move(sliced_table.front()));
-  if (output_column->null_count() == 0)
-    output_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
-  return output_column;
+  // slice the chars child column
+  auto const data_size =
+    cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream);
+  auto chars_column = std::make_unique<cudf::column>(
+    cudf::slice(strings.chars(), {chars_offset, chars_offset + data_size}).front(), stream, mr);
+
+  // slice the null mask
+  auto null_mask = cudf::detail::copy_bitmask(
+    strings.null_mask(), offsets_offset, offsets_offset + strings_count, stream, mr);
+
+  return make_strings_column(strings_count,
+                             std::move(offsets_column),
+                             std::move(chars_column),
+                             UNKNOWN_NULL_COUNT,
+                             std::move(null_mask),
+                             stream,
+                             mr);
 }
 
 }  // namespace detail
diff --git a/cpp/tests/strings/array_tests.cu b/cpp/tests/strings/array_tests.cu
index 26b00d8a548..2d1ae1a862d 100644
--- a/cpp/tests/strings/array_tests.cu
+++ b/cpp/tests/strings/array_tests.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/sorting.hpp>
-#include <cudf/strings/copying.hpp>
+#include <cudf/strings/detail/copying.hpp>
 #include <cudf/strings/detail/scatter.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -71,20 +71,17 @@ TEST_P(SliceParmsTest, Slice)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+
   cudf::size_type start = 3;
   cudf::size_type end   = GetParam();
-  std::vector<const char*> h_expected;
-  if (end > start) {
-    for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
-         ++idx)
-      h_expected.push_back(h_strings[idx]);
-  }
-  auto strings_view = cudf::strings_column_view(strings);
-  auto results      = cudf::strings::detail::copy_slice(strings_view, start, end);
+  auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);
 
-  cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
-  // thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  cudf::test::strings_column_wrapper expected(
+    h_strings.begin() + start,
+    h_strings.begin() + end,
+    thrust::make_transform_iterator(h_strings.begin() + start,
+                                    [](auto str) { return str != nullptr; }));
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
 TEST_P(SliceParmsTest, SliceAllNulls)
@@ -94,42 +91,29 @@ TEST_P(SliceParmsTest, SliceAllNulls)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+
   cudf::size_type start = 3;
   cudf::size_type end   = GetParam();
-  std::vector<const char*> h_expected;
-  if (end > start) {
-    for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
-         ++idx)
-      h_expected.push_back(h_strings[idx]);
-  }
-  auto strings_view = cudf::strings_column_view(strings);
-  auto results      = cudf::strings::detail::copy_slice(strings_view, start, end);
+  auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);
+
   cudf::test::strings_column_wrapper expected(
-    h_expected.begin(),
-    h_expected.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+    h_strings.begin() + start,
+    h_strings.begin() + end,
+    thrust::make_transform_iterator(h_strings.begin() + start,
+                                    [](auto str) { return str != nullptr; }));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
 TEST_P(SliceParmsTest, SliceAllEmpty)
 {
   std::vector<const char*> h_strings{"", "", "", "", "", "", ""};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
+
   cudf::size_type start = 3;
   cudf::size_type end   = GetParam();
-  std::vector<const char*> h_expected;
-  if (end > start) {
-    for (cudf::size_type idx = start; (idx < end) && (idx < (cudf::size_type)h_strings.size());
-         ++idx)
-      h_expected.push_back(h_strings[idx]);
-  }
-  auto strings_view = cudf::strings_column_view(strings);
-  auto results      = cudf::strings::detail::copy_slice(strings_view, start, end);
-  cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
-  // thrust::make_transform_iterator( h_expected.begin(), [] (auto str) { return str!=nullptr; }));
+  auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end);
+
+  cudf::test::strings_column_wrapper expected(h_strings.begin() + start, h_strings.begin() + end);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 

From 2488bc8010a4caa455ba259fd6031f4947ba63df Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Thu, 11 Mar 2021 17:15:11 -0700
Subject: [PATCH 19/33] Another fix for offsets_end() iterator in
 lists_column_view (#7575)

This is another fix for `offsets_end()` iterator in lists_column_view. The last fix (https://github.com/rapidsai/cudf/pull/7551) was still not correct---that iterator should not be computed using the size of the `offsets()` child column, which is also the offsets of the original (non-sliced) column. Instead, it should be computed using the `size()` of the current column.

Interestingly, my previous fix passed all the unit tests, since thrust does not throw anything (like access violation) when the input range is larger than the output range.

Authors:
  - Nghia Truong (@ttnghia)

Approvers:
  - Jake Hemstad (@jrhemstad)
  - David (@davidwendt)

URL: https://github.com/rapidsai/cudf/pull/7575
---
 cpp/include/cudf/lists/lists_column_view.hpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index 6c5cfc37eab..f8facb83975 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -103,11 +103,16 @@ class lists_column_view : private column_view {
   }
 
   /**
-   * @brief Return one past the last offset
+   * @brief Return pointer to the position that is one past the last offset
+   *
+   * This function returns the position that is one past the last offset of the lists column.
+   * Since the current lists column may be a sliced column, this offsets_end() iterator should not
+   * be computed using the size of the offsets() child column, which is also the offsets of the
+   * entire original (non-sliced) lists column.
    *
    * @return int32_t const* Pointer to one past the last offset
    */
-  offset_iterator offsets_end() const noexcept { return offsets_begin() + offsets().size(); }
+  offset_iterator offsets_end() const noexcept { return offsets_begin() + size() + 1; }
 };
 /** @} */  // end of group
 }  // namespace cudf

From 8aeb14eca028e0ec80f57e280d08f5e3da06bd97 Mon Sep 17 00:00:00 2001
From: Nghia Truong <ttnghia@users.noreply.github.com>
Date: Fri, 12 Mar 2021 07:22:49 -0700
Subject: [PATCH 20/33] Implement drop_list_duplicates (#7528)

Closes #7494 and partially addresses #7414.

This is the new implementation for `drop_list_duplicates`, which removes duplicated entries from each list of a lists column. The result is a new lists column in which each list row contains only unique entries. In the current implementation, the entries of each output list are sorted in ascending order (null(s) last).

Example with null_equality=EQUAL:
```
input: { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} }
output: { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} }

```

Example with null_equality=UNEQUAL:
```
input: { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} }
output: { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL, NULL, NULL} }

```

Authors:
  - Nghia Truong (@ttnghia)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - @nvdbaranec
  - David (@davidwendt)
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7528
---
 conda/recipes/libcudf/meta.yaml               |   1 +
 cpp/CMakeLists.txt                            |   1 +
 .../cudf/lists/drop_list_duplicates.hpp       |  63 ++++
 cpp/include/doxygen_groups.h                  |   1 +
 cpp/src/lists/drop_list_duplicates.cu         | 294 ++++++++++++++++++
 cpp/tests/CMakeLists.txt                      |   1 +
 .../lists/drop_list_duplicates_tests.cpp      | 187 +++++++++++
 7 files changed, 548 insertions(+)
 create mode 100644 cpp/include/cudf/lists/drop_list_duplicates.hpp
 create mode 100644 cpp/src/lists/drop_list_duplicates.cu
 create mode 100644 cpp/tests/lists/drop_list_duplicates_tests.cpp

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 74f4a20c066..e709824721c 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -134,6 +134,7 @@ test:
     - test -f $PREFIX/include/cudf/lists/detail/copying.hpp
     - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
     - test -f $PREFIX/include/cudf/lists/count_elements.hpp
+    - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp
     - test -f $PREFIX/include/cudf/lists/extract.hpp
     - test -f $PREFIX/include/cudf/lists/contains.hpp
     - test -f $PREFIX/include/cudf/lists/gather.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2d8e260c0ca..2e0c12d683a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -258,6 +258,7 @@ add_library(cudf
     src/lists/copying/segmented_gather.cu
     src/lists/count_elements.cu
     src/lists/extract.cu
+    src/lists/drop_list_duplicates.cu
     src/lists/lists_column_factories.cu
     src/lists/lists_column_view.cu
     src/lists/segmented_sort.cu
diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp
new file mode 100644
index 00000000000..0939bd7956a
--- /dev/null
+++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/stream_compaction.hpp>
+
+namespace cudf {
+namespace lists {
+/**
+ * @addtogroup lists_drop_duplicates
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Create a new lists column by removing duplicated entries from each list element in the
+ * given lists column
+ *
+ * @throw cudf::logic_error if any row (list element) in the input column is a nested type.
+ *
+ * Given a `lists_column` lists_column_view, the list elements in the column are copied to an
+ * output lists column such that their duplicated entries are dropped to keep only the unique
+ * ones. The order of those entries within each list is not guaranteed to be preserved as in the
+ * input. In the current implementation, entries in the output lists are sorted in ascending
+ * order (nulls last), but this is not guaranteed in future implementations.
+ *
+ * @param lists_column The input lists_column_view
+ * @param nulls_equal  Flag to specify whether null entries should be considered equal
+ * @param mr           Device resource used to allocate memory
+ *
+ * @code{.pseudo}
+ * lists_column = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} }
+ * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} }
+ *
+ * Note that permuting the entries of each list in this output also produces another valid
+ * output.
+ * @endcode
+ *
+ * @return A list column with list elements having unique entries
+ */
+std::unique_ptr<column> drop_list_duplicates(
+  lists_column_view const& lists_column,
+  null_equality nulls_equal           = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of group
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index e732a13e67c..3f3efdb7626 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -146,6 +146,7 @@
  *   @defgroup lists_contains Searching
  *   @defgroup lists_gather Gathering
  *   @defgroup lists_elements Counting
+ *   @defgroup lists_drop_duplicates Filtering
  * @}
  * @defgroup nvtext_apis NVText
  * @{
diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu
new file mode 100644
index 00000000000..1eb105d296d
--- /dev/null
+++ b/cpp/src/lists/drop_list_duplicates.cu
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/lists/detail/sorting.hpp>
+#include <cudf/lists/drop_list_duplicates.hpp>
+#include <cudf/table/row_operators.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/transform.h>
+
+namespace cudf {
+namespace lists {
+namespace detail {
+namespace {
+using offset_type = lists_column_view::offset_type;
+/**
+ * @brief Copy list entries and entry list offsets ignoring duplicates
+ *
+ * Given an array of all entries flattened from a list column and an array that maps each entry to
+ * the offset of the list containing that entry, those entries and list offsets are copied into
+ * new arrays such that the duplicated entries within each list will be ignored.
+ *
+ * @param all_lists_entries    The input array containing all list entries
+ * @param entries_list_offsets A map from list entries to their corresponding list offsets
+ * @param nulls_equal          Flag to specify whether null entries should be considered equal
+ * @param stream               CUDA stream used for device memory operations and kernel launches
+ * @param mr                   Device resource used to allocate memory
+ *
+ * @return A pair of columns, the first one contains unique list entries and the second one
+ * contains their corresponding list offsets
+ */
+template <bool has_nulls>
+std::vector<std::unique_ptr<column>> get_unique_entries_and_list_offsets(
+  column_view const& all_lists_entries,
+  column_view const& entries_list_offsets,
+  null_equality nulls_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  // Create an intermediate table, since the comparator only works on tables
+  auto const device_input_table =
+    cudf::table_device_view::create(table_view{{all_lists_entries}}, stream);
+  auto const comp = row_equality_comparator<has_nulls>(
+    *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL);
+
+  auto const num_entries = all_lists_entries.size();
+  // Allocate memory to store the indices of the unique entries
+  auto const unique_indices = cudf::make_numeric_column(
+    entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream);
+  auto const unique_indices_begin = unique_indices->mutable_view().begin<offset_type>();
+
+  auto const copy_end = thrust::unique_copy(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(num_entries),
+    unique_indices_begin,
+    [list_offsets = entries_list_offsets.begin<offset_type>(), comp] __device__(auto i, auto j) {
+      return list_offsets[i] == list_offsets[j] && comp(i, j);
+    });
+
+  // Collect unique entries and entry list offsets
+  auto const indices = cudf::detail::slice(
+    unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end));
+  return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}},
+                              indices,
+                              cudf::out_of_bounds_policy::DONT_CHECK,
+                              cudf::detail::negative_index_policy::NOT_ALLOWED,
+                              stream,
+                              mr)
+    ->release();
+}
+
+/**
+ * @brief Generate a 0-based offset column for a lists column
+ *
+ * Given a lists_column_view, which may have a non-zero offset, generate a new column containing
+ * 0-based list offsets. This is done by subtracting the first offset from each of the input
+ * list offsets.
+ *
+ * @code{.pseudo}
+ * Given a list column having offsets = { 3, 7, 9, 13 },
+ * then output_offsets = { 0, 4, 6, 10 }
+ * @endcode
+ *
+ * @param lists_column The input lists column
+ * @param stream       CUDA stream used for device memory operations and kernel launches
+ * @param mr           Device resource used to allocate memory
+ *
+ * @return A column containing 0-based list offsets
+ */
+std::unique_ptr<column> generate_clean_offsets(lists_column_view const& lists_column,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  auto output_offsets = make_numeric_column(data_type{type_to_id<offset_type>()},
+                                            lists_column.size() + 1,
+                                            mask_state::UNALLOCATED,
+                                            stream,
+                                            mr);
+  thrust::transform(
+    rmm::exec_policy(stream),
+    lists_column.offsets_begin(),
+    lists_column.offsets_end(),
+    output_offsets->mutable_view().begin<offset_type>(),
+    [first = lists_column.offsets_begin()] __device__(auto offset) { return offset - *first; });
+  return output_offsets;
+}
+
+/**
+ * @brief Populate list offsets for all list entries
+ *
+ * Given an `offsets` column_view containing offsets of a lists column and a number of all list
+ * entries in the column, generate an array that maps from each list entry to the offset of the list
+ * containing that entry.
+ *
+ * @code{.pseudo}
+ * num_entries = 10, offsets = { 0, 4, 6, 10 }
+ * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 }
+ * @endcode
+ *
+ * @param num_entries The number of list entries
+ * @param offsets     Column view to the list offsets
+ * @param stream      CUDA stream used for device memory operations and kernel launches
+ *
+ * @return A column containing entry list offsets, allocated from the current device memory
+ * resource
+ */
+std::unique_ptr<column> generate_entry_list_offsets(size_type num_entries,
+                                                    column_view const& offsets,
+                                                    rmm::cuda_stream_view stream)
+{
+  auto entry_list_offsets = make_numeric_column(offsets.type(),
+                                                num_entries,
+                                                mask_state::UNALLOCATED,
+                                                stream,
+                                                rmm::mr::get_current_device_resource());
+  thrust::upper_bound(rmm::exec_policy(stream),
+                      offsets.begin<offset_type>(),
+                      offsets.end<offset_type>(),
+                      thrust::make_counting_iterator<offset_type>(0),
+                      thrust::make_counting_iterator<offset_type>(num_entries),
+                      entry_list_offsets->mutable_view().begin<offset_type>());
+  return entry_list_offsets;
+}
+
+/**
+ * @brief Generate list offsets from entry offsets
+ *
+ * Generate an array of list offsets for the final result lists column. The list
+ * offsets of the original lists column are also taken into account to make sure the result lists
+ * column will have the same empty list rows (if any) as in the original lists column.
+ *
+ * @param[in] num_entries          The number of unique entries after removing duplicates
+ * @param[in] entries_list_offsets The mapping from list entries to their list offsets
+ * @param[in,out] original_offsets The list offsets of the original lists column, which
+ * will also be used to store the new list offsets
+ * @param[in] stream               CUDA stream used for device memory operations and kernel
+ * launches
+ */
+void generate_offsets(size_type num_entries,
+                      column_view const& entries_list_offsets,
+                      mutable_column_view const& original_offsets,
+                      rmm::cuda_stream_view stream)
+{
+  // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any)
+  // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10,
+  // then new_offsets = { 0, 4, 5, 8, 10 }
+  auto const new_offsets = allocate_like(
+    original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource());
+  thrust::copy_if(rmm::exec_policy(stream),
+                  thrust::make_counting_iterator<offset_type>(0),
+                  thrust::make_counting_iterator<offset_type>(num_entries + 1),
+                  new_offsets->mutable_view().begin<offset_type>(),
+                  [num_entries, offsets_ptr = entries_list_offsets.begin<offset_type>()] __device__(
+                    auto i) -> bool {
+                    return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1];
+                  });
+
+  // Generate a prefix sum of the number of empty lists, storing it in place in the original
+  // list offsets. NOTE(review): the scan input reads the buffer it writes to -- confirm safe.
+  // If the original list offsets are { 0, 0, 5, 5, 6, 6 } (there are 3 empty lists),
+  // and new_offsets = { 0, 4, 5 },
+  // then output = { 0, 1, 1, 2, 2, 3 }
+  auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator(
+    0, [offsets = original_offsets.begin<offset_type>()] __device__(auto i) {
+      return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0;
+    });
+  thrust::inclusive_scan(rmm::exec_policy(stream),
+                         iter_trans_begin,
+                         iter_trans_begin + original_offsets.size(),
+                         original_offsets.begin<offset_type>());
+
+  // Generate the final list offsets
+  // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 5 },
+  //  and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 },
+  //  then output = { 0, 0, 4, 4, 5, 5 }
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator<offset_type>(0),
+                    thrust::make_counting_iterator<offset_type>(original_offsets.size()),
+                    original_offsets.begin<offset_type>(),
+                    [prefix_sum_empty_lists = original_offsets.begin<offset_type>(),
+                     offsets = new_offsets->view().begin<offset_type>()] __device__(auto i) {
+                      return offsets[i - prefix_sum_empty_lists[i]];
+                    });
+}
+/**
+ * @copydoc cudf::lists::drop_list_duplicates
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+std::unique_ptr<column> drop_list_duplicates(lists_column_view const& lists_column,
+                                             null_equality nulls_equal,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent());
+  if (cudf::is_nested(lists_column.child().type())) {
+    CUDF_FAIL("Nested types are not supported in drop_list_duplicates.");
+  }
+
+  // Call segmented sort on the list elements and store them in a temporary column sorted_lists
+  auto const sorted_lists =
+    detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream);
+
+  // Flatten all entries (depth = 1) of the lists column
+  auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream);
+
+  // Generate a 0-based offset column
+  auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr);
+
+  // Generate a mapping from list entries to offsets of the lists containing those entries
+  auto const entries_list_offsets =
+    detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream);
+
+  // Copy non-duplicated entries (along with their list offsets) to new arrays
+  auto unique_entries_and_list_offsets =
+    all_lists_entries.has_nulls()
+      ? detail::get_unique_entries_and_list_offsets<true>(
+          all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr)
+      : detail::get_unique_entries_and_list_offsets<false>(
+          all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr);
+
+  // Generate offsets for the new lists column
+  detail::generate_offsets(unique_entries_and_list_offsets.front()->size(),
+                           unique_entries_and_list_offsets.back()->view(),
+                           lists_offsets->mutable_view(),
+                           stream);
+
+  // Construct a new lists column without duplicated entries
+  return make_lists_column(lists_column.size(),
+                           std::move(lists_offsets),
+                           std::move(unique_entries_and_list_offsets.front()),
+                           lists_column.null_count(),
+                           cudf::detail::copy_bitmask(lists_column.parent(), stream, mr));
+}
+
+}  // anonymous namespace
+}  // namespace detail
+
+/**
+ * @copydoc cudf::lists::drop_list_duplicates
+ */
+std::unique_ptr<column> drop_list_duplicates(lists_column_view const& lists_column,
+                                             null_equality nulls_equal,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 492767c5d2f..40829c74957 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -422,6 +422,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp)
 ConfigureTest(LISTS_TEST
     lists/contains_tests.cpp
     lists/count_elements_tests.cpp
+    lists/drop_list_duplicates_tests.cpp
     lists/extract_tests.cpp
     lists/sort_lists_tests.cpp)
 
diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp
new file mode 100644
index 00000000000..0948ba96f62
--- /dev/null
+++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/lists/drop_list_duplicates.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+using float_type = float;
+using int_type   = int32_t;
+using INT_LCW    = cudf::test::lists_column_wrapper<int_type>;
+using FLT_LCW    = cudf::test::lists_column_wrapper<float_type>;
+using STR_LCW    = cudf::test::lists_column_wrapper<cudf::string_view>;
+
+template <bool equal_test, class LCW>
+void test_once(cudf::column_view const& input,
+               LCW const& expected,
+               cudf::null_equality nulls_equal = cudf::null_equality::EQUAL)
+{
+  auto const results =
+    cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal);
+  if (equal_test) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true);
+  } else {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true);
+  }
+}
+
+struct DropListDuplicatesTest : public cudf::test::BaseFixture {
+};
+
+TEST_F(DropListDuplicatesTest, InvalidCasesTests)
+{
+  // Lists of nested types are not supported
+  EXPECT_THROW(
+    cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}),
+    cudf::logic_error);
+  EXPECT_THROW(
+    cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}),
+    cudf::logic_error);
+  EXPECT_THROW(
+    cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}),
+    cudf::logic_error);
+}
+
+TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull)
+{
+  // Trivial cases
+  test_once<false>(FLT_LCW{{}}, FLT_LCW{{}});
+  test_once<false>(FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}});
+
+  // Multiple empty lists
+  test_once<false>(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}},
+                   FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}});
+
+  auto constexpr p_inf = std::numeric_limits<float_type>::infinity();
+  auto constexpr m_inf = -std::numeric_limits<float_type>::infinity();
+
+  // Lists contain inf
+  // We can't test for lists containing nan because the order of nan is
+  // undefined after sorting
+  test_once<false>(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf},
+                   FLT_LCW{0, 1, 2, p_inf});
+  test_once<false>(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf},
+                   FLT_LCW{m_inf, 0, p_inf});
+}
+
+TEST_F(DropListDuplicatesTest, IntegerTestsNonNull)
+{
+  // Trivial cases
+  test_once<true>(INT_LCW{{}}, INT_LCW{{}});
+  test_once<true>(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}});
+
+  // Multiple empty lists
+  test_once<true>(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}},
+                  INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}});
+
+  // Adjacent lists containing the same entries
+  test_once<true>(
+    INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}},
+    INT_LCW{{1}, {1, 2}, {2, 3}});
+
+  // Sliced list column
+  auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}};
+  auto const list1 = cudf::slice(list0, {0, 5})[0];
+  auto const list2 = cudf::slice(list0, {1, 5})[0];
+  auto const list3 = cudf::slice(list0, {1, 3})[0];
+  auto const list4 = cudf::slice(list0, {0, 3})[0];
+
+  test_once<true>(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}});
+  test_once<true>(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}});
+  test_once<true>(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}});
+  test_once<true>(list3, INT_LCW{{1, 2, 3, 4}, {5}});
+  test_once<true>(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}});
+}
+
+TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls)
+{
+  auto constexpr null = std::numeric_limits<int_type>::max();
+
+  // null lists
+  test_once<true>(INT_LCW{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}},
+                          cudf::detail::make_counting_transform_iterator(
+                            0, [](auto i) { return i != 2 && i != 3; })},
+                  INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}},
+                          cudf::detail::make_counting_transform_iterator(
+                            0, [](auto i) { return i != 2 && i != 3; })});
+
+  // null entries are equal
+  test_once<true>(
+    INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+            cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })},
+    INT_LCW{{1, 3, 5, 7, 9, null},
+            std::initializer_list<bool>{true, true, true, true, true, false}});
+
+  // null entries are not equal
+  test_once<true>(
+    INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+            cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })},
+    INT_LCW{
+      {1, 3, 5, 7, 9, null, null, null, null, null},
+      std::initializer_list<bool>{true, true, true, true, true, false, false, false, false, false}},
+    cudf::null_equality::UNEQUAL);
+}
+
+TEST_F(DropListDuplicatesTest, StringTestsNonNull)
+{
+  // Trivial cases
+  test_once<true>(STR_LCW{{}}, STR_LCW{{}});
+  test_once<true>(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"});
+
+  // One list column
+  test_once<true>(STR_LCW{"this", "is", "is", "is", "a", "string", "string"},
+                  STR_LCW{"a", "is", "string", "this"});
+
+  // Multiple lists column
+  test_once<true>(
+    STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"},
+            STR_LCW{"this", "is", "is", "a", "one duplicate", "string"},
+            STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"},
+            STR_LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}},
+    STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"},
+            STR_LCW{"a", "is", "one duplicate", "string", "this"},
+            STR_LCW{"a", "is", "string", "this", "two duplicates"},
+            STR_LCW{"a", "is", "string", "this", "three duplicates"}});
+}
+
+TEST_F(DropListDuplicatesTest, StringTestsWithNulls)
+{
+  auto const null = std::string("");
+
+  // One list column with null entries
+  test_once<true>(
+    STR_LCW{{"this", null, "is", "is", "is", "a", null, "string", null, "string"},
+            cudf::detail::make_counting_transform_iterator(
+              0, [](auto i) { return i != 1 && i != 6 && i != 8; })},
+    STR_LCW{{"a", "is", "string", "this", null},
+            cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })});
+
+  // Multiple lists column with null lists and null entries
+  test_once<true>(
+    STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"},
+                     cudf::detail::make_counting_transform_iterator(
+                       0, [](auto i) { return i % 2 == 0; })},
+             STR_LCW{},
+             STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}},
+            cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })},
+    STR_LCW{
+      {STR_LCW{{"a", "is", "no duplicate", "string", "this", null},
+               cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })},
+       STR_LCW{},
+       STR_LCW{"a", "is", "one duplicate", "string", "this"}},
+      cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })});
+}

From ff0c5378ae99df797fdeffda9ac8728a8c63d579 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 12 Mar 2021 13:08:46 -0800
Subject: [PATCH 21/33] Fix ORC writer output corruption with string columns
 (#7565)

Closes: #7346

Fixes an issue in ORC writer where null counting would not read the mask for every row.
The issue occurs when the column offset is not divisible by 32, so that two words must always be read to get 32 bits of mask (each read is effectively offset by the column's offset, so when reading the mask for 32 rows, we need to get two words to account for the offset). Namely, the second word is not read when the row is closer than 32 to the end of the chunk. This condition is incorrect for most column offsets, as the current row is not really the first bit of the mask word.
The fix is to adjust the condition under which the second mask word is read (assuming that the mask is padded to a multiple of 32).

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - @nvdbaranec
  - Mike Wilson (@hyperbolic2346)
  - Devavret Makkar (@devavret)

URL: https://github.com/rapidsai/cudf/pull/7565
---
 cpp/src/io/orc/dict_enc.cu | 36 +++++++++++++++++++++---------------
 cpp/tests/io/orc_test.cpp  | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index de20af1bff4..99157a23fcb 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -70,24 +70,28 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
                                           Storage &temp_storage)
 {
   if (t == 0) { s->nnz = 0; }
-  for (uint32_t i = 0; i < s->chunk.num_rows; i += 512) {
+  for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) {
     const uint32_t *valid_map = s->chunk.valid_map_base;
     uint32_t is_valid, nz_pos;
-    if (t < 16) {
+    if (t < block_size / 32) {
       if (!valid_map) {
         s->scratch_red[t] = 0xffffffffu;
       } else {
-        uint32_t row = s->chunk.start_row + i + t * 32;
-        uint32_t v   = (row < s->chunk.start_row + s->chunk.num_rows)
-                       ? valid_map[(row + s->chunk.column_offset) / 32]
-                       : 0;
-        if (row & 0x1f) {
-          uint32_t v1 = (row + 32 < s->chunk.start_row + s->chunk.num_rows)
-                          ? valid_map[((row + s->chunk.column_offset) / 32) + 1]
-                          : 0;
-          v = __funnelshift_r(v, v1, row + s->chunk.column_offset);
+        uint32_t const row   = s->chunk.start_row + i + t * 32;
+        auto const chunk_end = s->chunk.start_row + s->chunk.num_rows;
+
+        auto const valid_map_idx = (row + s->chunk.column_offset) / 32;
+        uint32_t valid           = (row < chunk_end) ? valid_map[valid_map_idx] : 0;
+
+        auto const rows_in_next_word = (row + s->chunk.column_offset) & 0x1f;
+        if (rows_in_next_word != 0) {
+          auto const rows_in_current_word = 32 - rows_in_next_word;
+          // Read next word if any rows are within the chunk
+          uint32_t const valid_next =
+            (row + rows_in_current_word < chunk_end) ? valid_map[valid_map_idx + 1] : 0;
+          valid = __funnelshift_r(valid, valid_next, rows_in_next_word);
         }
-        s->scratch_red[t] = v;
+        s->scratch_red[t] = valid;
       }
     }
     __syncthreads();
@@ -109,7 +113,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s,
  * @param[in] chunks DictionaryChunk device array [rowgroup][column]
  * @param[in] num_columns Number of columns
  */
-// blockDim {512,1,1}
+// blockDim {block_size,1,1}
 template <int block_size>
 __global__ void __launch_bounds__(block_size, 2)
   gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
@@ -411,9 +415,11 @@ void InitDictionaryIndices(DictionaryChunk *chunks,
                            uint32_t num_rowgroups,
                            rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(512, 1);  // 512 threads per chunk
+  static constexpr int block_size = 512;
+  dim3 dim_block(block_size, 1);
   dim3 dim_grid(num_columns, num_rowgroups);
-  gpuInitDictionaryIndices<512><<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_columns);
+  gpuInitDictionaryIndices<block_size>
+    <<<dim_grid, dim_block, 0, stream.value()>>>(chunks, num_columns);
 }
 
 /**
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index cc4c9b700af..b0dc01ea001 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1025,4 +1025,40 @@ TEST_F(OrcStatisticsTest, Basic)
   validate_statistics(stats.stripes_stats[0]);
 }
 
+TEST_F(OrcWriterTest, SlicedValidMask)
+{
+  std::vector<const char*> strings;
+  // Need more than 32 elements to reproduce the issue
+  for (int i = 0; i < 34; ++i)
+    strings.emplace_back("a long string to make sure overflow affects the output");
+  // An element is null only to enforce the output column to be nullable
+  auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 32; });
+
+  column_wrapper<cudf::string_view> col{strings.begin(), strings.end(), validity};
+
+  std::vector<std::unique_ptr<column>> cols;
+  cols.push_back(col.release());
+
+  cudf_io::table_metadata expected_metadata;
+  expected_metadata.column_names.emplace_back("col_string");
+
+  // Bug tested here is easiest to reproduce when column_offset % 32 is 31
+  std::vector<cudf::size_type> indices{31, 34};
+  std::vector<cudf::column_view> sliced_col = cudf::slice(cols[0]->view(), indices);
+  cudf::table_view tbl{sliced_col};
+
+  auto filepath = temp_env->get_temp_filepath("OrcStrings.orc");
+  cudf_io::orc_writer_options out_opts =
+    cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl)
+      .metadata(&expected_metadata);
+  cudf_io::write_orc(out_opts);
+
+  cudf_io::orc_reader_options in_opts =
+    cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false);
+  auto result = cudf_io::read_orc(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view());
+  EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 365e6491f47e6eaa04a72f6c2d5073fa2b4563bf Mon Sep 17 00:00:00 2001
From: Keith Kraus <kkraus@nvidia.com>
Date: Fri, 12 Mar 2021 16:18:24 -0500
Subject: [PATCH 22/33] Fix missing Dask imports (#7580)

https://github.com/dask/dask/pull/7345 removed some imports that we were improperly using from a dask module. Fix the imports to properly target `fsspec`.

Authors:
  - Keith Kraus (@kkraus14)

Approvers:
  - @jakirkham
  - Ashwin Srinath (@shwina)

URL: https://github.com/rapidsai/cudf/pull/7580
---
 python/dask_cudf/dask_cudf/io/orc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index e96219fd23e..5b0d19b737b 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -2,11 +2,12 @@
 
 from io import BufferedWriter, IOBase
 
+from fsspec.core import get_fs_token_paths
+from fsspec.utils import stringify_path
 from pyarrow import orc as orc
 
 from dask import dataframe as dd
 from dask.base import tokenize
-from dask.bytes.core import get_fs_token_paths, stringify_path
 from dask.dataframe.io.utils import _get_pyarrow_dtypes
 
 import cudf

From 5c4fa28f6000a36399bd74af5c938dc34b5f1b3b Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Fri, 12 Mar 2021 17:33:06 -0500
Subject: [PATCH 23/33] Add `__repr__` for Column and ColumnAccessor (#7531)

## Summary:

* Add a `__repr__` for Column (thin wrapper around the `__repr__` of the underlying pa.Array)
* Add a `__repr__` for ColumnAccessor (similar to pa.Table, shows the names/types of the columns of the ColumnAccessor)

## Additional info:

Debugging is sometimes made painful by the fact that we don't have a `__repr__` for columns and column accessors. For example, here's what a `ColumnAccessor` and a `Column` currently look like when printed:

```python
In [2]: cudf.DataFrame({'a': [1, 2, 3], "b": [4, 5, 6], "z_1": [2, 3, 4]})._data
Out[2]: ColumnAccessor(OrderedColumnDict([('a', <cudf.core.column.numerical.NumericalColumn object at 0x7f0306336f80>), ('b', <cudf.core.column.numerical.NumericalColumn object at 0x7f03062a05f0>), ('z_1', <cudf.core.column.numerical.NumericalColumn object at 0x7f03062a0e60>)]), multiindex=False, level_names=(None,))

In [3]: cudf.Series([1, 2, None, 3])._column
Out[3]: <cudf.core.column.numerical.NumericalColumn at 0x7f2190746710>
```

After this PR:

```python
In [2]: cudf.DataFrame({'a': [1, 2, 3], "b": [4, 5, 6], "z_1": [2, 3, 4]})._data
Out[2]:
ColumnAccessor(multiindex=False, level_names=(None,))
a: int64
b: int64
z_1: int64

In [3]: cudf.Series([1, 2, None, 3])._column
Out[3]:
<cudf.core.column.numerical.NumericalColumn object at 0x7f3e90c2ac20>
[
  1,
  2,
  null,
  3
]
dtype: int64
```

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7531
---
 python/cudf/cudf/core/column/column.py   |  7 +++++++
 python/cudf/cudf/core/column_accessor.py | 16 ++++++++--------
 python/cudf/cudf/utils/cudautils.py      |  5 +++--
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7e7b39816d8..2bb35c97d7c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -109,6 +109,13 @@ def mask_array_view(self) -> "cuda.devicearray.DeviceNDArray":
     def __len__(self) -> int:
         return self.size
 
+    def __repr__(self):
+        return (
+            f"{object.__repr__(self)}\n"
+            f"{self.to_arrow().to_string()}\n"
+            f"dtype: {self.dtype}"
+        )
+
     def to_pandas(
         self, index: ColumnLike = None, nullable: bool = False, **kwargs
     ) -> "pd.Series":
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index ad1a0c80ef5..03743e4464b 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -86,15 +86,15 @@ def __len__(self) -> int:
         return len(self._data)
 
     def __repr__(self) -> str:
-        data_repr = self._data.__repr__()
-        multiindex_repr = self.multiindex.__repr__()
-        level_names_repr = self.level_names.__repr__()
-        return "{}({}, multiindex={}, level_names={})".format(
-            self.__class__.__name__,
-            data_repr,
-            multiindex_repr,
-            level_names_repr,
+        type_info = (
+            f"{self.__class__.__name__}("
+            f"multiindex={self.multiindex}, "
+            f"level_names={self.level_names})"
         )
+        column_info = "\n".join(
+            [f"{name}: {col.dtype}" for name, col in self.items()]
+        )
+        return f"{type_info}\n{column_info}"
 
     @property
     def level_names(self) -> Tuple[Any, ...]:
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index f62ca862091..722e0b12183 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2018-2021, NVIDIA CORPORATION.
+from pickle import dumps
+
 import cachetools
 import cupy
 import numpy as np
 from numba import cuda
-from pickle import dumps
 
 import cudf
 from cudf.utils.utils import check_equals_float, check_equals_int
@@ -239,7 +240,7 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset):
 # it can hit for distinct functions that are similar. The lru_cache wrapping
 # compile_udf misses for these similar functions, but doesn't need to serialize
 # closure variables to check for a hit.
-_udf_code_cache = cachetools.LRUCache(maxsize=32)
+_udf_code_cache: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
 
 
 def compile_udf(udf, type_signature):

From 04f90211fe95b99f4ad934f273faf87639b1f6ff Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Fri, 12 Mar 2021 18:44:50 -0500
Subject: [PATCH 24/33] `fixed_point` + `cudf::binary_operation` API Changes
 (#7435)

This resolves https://github.com/rapidsai/cudf/issues/7442

Recently, while working with @razajafri on `fixed_point` binary ops, it became clear that `cudf::binary_operation` is breaking the "easy to use, **hard to misuse**" # 1 design guideline. I knew about this but I slotted it as technical debt to be cleaned up later. Long story short, after discussions with both @razajafri and @jrhemstad, and comments on https://github.com/rapidsai/cudf/issues/7442, we will implement the following:

* [x] For `fixed_point` + `cudf::binary_operation` + `DIV` always **use** the `cudf::data_type output_type` parameter
* [x] ~~For `fixed_point` + `cudf::binary_operation` + `TRUE_DIV`, require that the columns/scalars provided as arguments (`lhs` and `rhs`) will result in the specified `data_type`/`scale`~~
* [x] Provide a convenience function (something like `binary_operation_fixed_point_scale()`) that will compute the "expected" scale given two input columns/scalars and a `binary_operator`
* [x] Remove `TRUE_DIV`
* [x] Add unit tests for different output data_types
* [x] Update Python/Cython

**This will be a breaking change for all `fixed_point` + `cudf::binary_operation`.**

Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - Keith Kraus (@kkraus14)
  - Mike Wilson (@hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/7435
---
 cpp/include/cudf/binaryop.hpp                 |  24 ++
 cpp/src/binaryop/binaryop.cpp                 | 260 +++++----------
 cpp/src/unary/cast_ops.cu                     |   6 +-
 cpp/tests/binaryop/binop-integration-test.cpp | 312 ++++++++++++++----
 cpp/tests/fixed_point/fixed_point_tests.cu    |  18 +-
 python/cudf/cudf/_lib/binaryop.pyx            |  11 +-
 python/cudf/cudf/_lib/column.pyx              |  26 +-
 python/cudf/cudf/_lib/types.pxd               |   4 +-
 python/cudf/cudf/_lib/types.pyx               |  21 +-
 python/cudf/cudf/core/column/decimal.py       |  18 +-
 python/cudf/cudf/core/dtypes.py               |   6 +-
 python/cudf/cudf/utils/dtypes.py              |   7 +-
 12 files changed, 420 insertions(+), 293 deletions(-)

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 72abefef04f..7099c29b9df 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -178,5 +178,29 @@ std::unique_ptr<column> binary_operation(
   data_type output_type,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op`
+ *
+ * @param op           The binary_operator used for two `fixed_point` numbers
+ * @param left_scale   Scale of left `fixed_point` number
+ * @param right_scale  Scale of right `fixed_point` number
+ * @return             The resulting `scale` of the computed `fixed_point` number
+ */
+int32_t binary_operation_fixed_point_scale(binary_operator op,
+                                           int32_t left_scale,
+                                           int32_t right_scale);
+
+/**
+ * @brief Computes the `data_type` for a `fixed_point` number based on given binary operator `op`
+ *
+ * @param op   The binary_operator used for two `fixed_point` numbers
+ * @param lhs  `cudf::data_type` of left `fixed_point` number
+ * @param rhs  `cudf::data_type` of right `fixed_point` number
+ * @return     The resulting `cudf::data_type` of the computed `fixed_point` number
+ */
+cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
+                                                         cudf::data_type const& lhs,
+                                                         cudf::data_type const& rhs);
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index fc697267ca7..6b5afa69300 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -42,6 +42,7 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
@@ -378,25 +379,7 @@ bool is_comparison_binop(binary_operator op)
  */
 bool is_supported_fixed_point_binop(binary_operator op)
 {
-  return is_basic_arithmetic_binop(op) or is_comparison_binop(op) or
-         op == binary_operator::TRUE_DIV;
-}
-
-/**
- * @brief Computes the scale for a `fixed_point` number based on given binary operator `op`
- *
- * @param op The binary_operator used for two `fixed_point` numbers
- * @param left_scale Scale of left `fixed_point` number
- * @param right_scale Scale of right `fixed_point` number
- * @return int32_t The resulting `scale` of the computed `fixed_point` number
- */
-int32_t compute_scale_for_binop(binary_operator op, int32_t left_scale, int32_t right_scale)
-{
-  CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation.");
-  if (op == binary_operator::TRUE_DIV) CUDF_FAIL("TRUE_DIV scale cannot be computed.");
-  if (op == binary_operator::MUL) return left_scale + right_scale;
-  if (op == binary_operator::DIV) return left_scale - right_scale;
-  return std::min(left_scale, right_scale);
+  return is_basic_arithmetic_binop(op) or is_comparison_binop(op);
 }
 
 /**
@@ -411,6 +394,26 @@ bool is_same_scale_necessary(binary_operator op)
   return op != binary_operator::MUL && op != binary_operator::DIV;
 }
 
+template <typename Lhs, typename Rhs>
+void fixed_point_binary_operation_validation(binary_operator op,
+                                             Lhs lhs,
+                                             Rhs rhs,
+                                             thrust::optional<cudf::data_type> output_type = {})
+{
+  CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type.");
+  CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type.");
+  CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation");
+  CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch");
+  if (output_type.has_value()) {
+    if (is_comparison_binop(op))
+      CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8},
+                   "Comparison operations require boolean output type.");
+    else
+      CUDF_EXPECTS(is_fixed_point(output_type.value()),
+                   "fixed_point binary operations require fixed_point output type.");
+  }
+}
+
 /**
  * @brief Function to compute binary operation of one `column_view` and one `scalar`
  *
@@ -424,52 +427,24 @@ bool is_same_scale_necessary(binary_operator op)
 std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
                                                      column_view const& rhs,
                                                      binary_operator op,
-                                                     thrust::optional<cudf::data_type> output_type,
+                                                     cudf::data_type output_type,
                                                      rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
   using namespace numeric;
 
-  CUDF_EXPECTS(is_fixed_point(lhs.type()), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_fixed_point(rhs.type()), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation");
-  CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Data type mismatch");
-  if (op == binary_operator::TRUE_DIV)
-    CUDF_EXPECTS(output_type.has_value(), "TRUE_DIV requires result_type.");
-
-  auto const scale = op == binary_operator::TRUE_DIV
-                       ? output_type.value().scale()
-                       : compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale());
-
-  auto const out_type = output_type.value_or(
-    is_comparison_binop(op) ? data_type{type_id::BOOL8} : data_type{lhs.type().id(), scale});
+  fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type);
 
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, out_type, stream, mr);
-
-  if (rhs.is_empty()) return out;
+  if (rhs.is_empty())
+    return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
+  auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
+  auto const type =
+    is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{rhs.type().id(), scale};
+  auto out      = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
   auto out_view = out->mutable_view();
 
-  if (op == binary_operator::TRUE_DIV) {
-    // Adjust scalar so lhs has the scale needed to get desired output data_type (scale)
-    auto const diff         = (lhs.type().scale() - rhs.type().scale()) - scale;
-    auto const unused_scale = scale_type{0};  // scale of out_view already set
-    if (lhs.type().id() == type_id::DECIMAL32) {
-      auto const factor       = numeric::detail::ipow<int32_t, Radix::BASE_10>(std::abs(diff));
-      auto const val          = static_cast<fixed_point_scalar<decimal32> const&>(lhs).value();
-      auto const scaled_value = diff < 0 ? val / factor : val * factor;
-      auto const scalar       = make_fixed_point_scalar<decimal32>(scaled_value, unused_scale);
-      binops::jit::binary_operation(out_view, *scalar, rhs, binary_operator::DIV, stream);
-      return out;
-    } else {
-      auto const factor       = numeric::detail::ipow<int64_t, Radix::BASE_10>(std::abs(diff));
-      auto const val          = static_cast<fixed_point_scalar<decimal64> const&>(lhs).value();
-      auto const scaled_value = diff < 0 ? val / factor : val * factor;
-      auto const scalar       = make_fixed_point_scalar<decimal64>(scaled_value, unused_scale);
-      binops::jit::binary_operation(out_view, *scalar, rhs, binary_operator::DIV, stream);
-      return out;
-    }
-  } else if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
+  if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
     // Adjust scalar/column so they have they same scale
     if (rhs.type().scale() < lhs.type().scale()) {
       auto const diff = lhs.type().scale() - rhs.type().scale();
@@ -479,7 +454,6 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
         auto const scale  = scale_type{rhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal32>(val * factor, scale);
         binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
-        return out;
       } else {
         CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
         auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
@@ -487,31 +461,27 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
         auto const scale  = scale_type{rhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal64>(val * factor, scale);
         binops::jit::binary_operation(out_view, *scalar, rhs, op, stream);
-        return out;
       }
     } else {
       auto const diff   = rhs.type().scale() - lhs.type().scale();
       auto const result = [&] {
         if (lhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal32>(factor, scale_type{rhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal64>(factor, scale_type{rhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
-      return out;
     }
   } else {
     binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-    return out;
   }
+  return output_type.scale() != scale ? cudf::cast(out_view, output_type) : std::move(out);
 }
 
 /**
@@ -527,52 +497,24 @@ std::unique_ptr<column> fixed_point_binary_operation(scalar const& lhs,
 std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
                                                      scalar const& rhs,
                                                      binary_operator op,
-                                                     thrust::optional<cudf::data_type> output_type,
+                                                     cudf::data_type output_type,
                                                      rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
   using namespace numeric;
 
-  CUDF_EXPECTS(is_fixed_point(lhs.type()), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_fixed_point(rhs.type()), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation");
-  CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Data type mismatch");
-  if (op == binary_operator::TRUE_DIV)
-    CUDF_EXPECTS(output_type.has_value(), "TRUE_DIV requires result_type.");
-
-  auto const scale = op == binary_operator::TRUE_DIV
-                       ? output_type.value().scale()
-                       : compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale());
+  fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type);
 
-  auto const out_type = output_type.value_or(
-    is_comparison_binop(op) ? data_type{type_id::BOOL8} : data_type{lhs.type().id(), scale});
-
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, out_type, stream, mr);
-
-  if (lhs.is_empty()) return out;
+  if (lhs.is_empty())
+    return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
+  auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
+  auto const type =
+    is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{lhs.type().id(), scale};
+  auto out      = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
   auto out_view = out->mutable_view();
 
-  if (op == binary_operator::TRUE_DIV) {
-    // Adjust columns so lhs has the scale needed to get desired output data_type (scale)
-    auto const diff         = (lhs.type().scale() - rhs.type().scale()) - scale;
-    auto const interim_op   = diff < 0 ? binary_operator::DIV : binary_operator::MUL;
-    auto const scalar_scale = scale_type{rhs.type().scale() + scale};
-    auto const result       = [&] {
-      if (lhs.type().id() == type_id::DECIMAL32) {
-        auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(std::abs(diff));
-        auto const scalar = make_fixed_point_scalar<decimal32>(factor, scalar_scale);
-        return binary_operation(lhs, *scalar, interim_op, {}, stream, mr);
-      } else {
-        CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
-        auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(std::abs(diff));
-        auto const scalar = make_fixed_point_scalar<decimal64>(factor, scalar_scale);
-        return binary_operation(lhs, *scalar, interim_op, {}, stream, mr);
-      }
-    }();
-    binops::jit::binary_operation(out_view, result->view(), rhs, binary_operator::DIV, stream);
-    return out;
-  } else if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
+  if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
     // Adjust scalar/column so they have they same scale
     if (rhs.type().scale() > lhs.type().scale()) {
       auto const diff = rhs.type().scale() - lhs.type().scale();
@@ -582,7 +524,6 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
         auto const scale  = scale_type{lhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal32>(val * factor, scale);
         binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
-        return out;
       } else {
         CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
         auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
@@ -590,31 +531,27 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
         auto const scale  = scale_type{rhs.type().scale()};
         auto const scalar = make_fixed_point_scalar<decimal64>(val * factor, scale);
         binops::jit::binary_operation(out_view, lhs, *scalar, op, stream);
-        return out;
       }
     } else {
       auto const diff   = lhs.type().scale() - rhs.type().scale();
       auto const result = [&] {
         if (rhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal32>(factor, scale_type{lhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal64>(factor, scale_type{lhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
-      return out;
     }
   } else {
     binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-    return out;
   }
+  return output_type.scale() != scale ? cudf::cast(out_view, output_type) : std::move(out);
 }
 
 /**
@@ -630,94 +567,59 @@ std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
 std::unique_ptr<column> fixed_point_binary_operation(column_view const& lhs,
                                                      column_view const& rhs,
                                                      binary_operator op,
-                                                     thrust::optional<cudf::data_type> output_type,
+                                                     cudf::data_type output_type,
                                                      rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
   using namespace numeric;
 
-  CUDF_EXPECTS(is_fixed_point(lhs.type()), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_fixed_point(rhs.type()), "Input must have fixed_point data_type.");
-  CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation");
-  CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Data type mismatch");
-  if (op == binary_operator::TRUE_DIV)
-    CUDF_EXPECTS(output_type.has_value(), "TRUE_DIV requires result_type.");
-
-  auto const scale = op == binary_operator::TRUE_DIV
-                       ? output_type.value().scale()
-                       : compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale());
-
-  auto const out_type = output_type.value_or(
-    is_comparison_binop(op) ? data_type{type_id::BOOL8} : data_type{lhs.type().id(), scale});
+  fixed_point_binary_operation_validation(op, lhs.type(), rhs.type(), output_type);
 
-  auto out = make_fixed_width_column_for_output(lhs, rhs, op, out_type, stream, mr);
-
-  if (lhs.is_empty() or rhs.is_empty()) return out;
+  if (lhs.is_empty() or rhs.is_empty())
+    return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
 
+  auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale());
+  auto const type =
+    is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{lhs.type().id(), scale};
+  auto out      = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr);
   auto out_view = out->mutable_view();
 
-  if (op == binary_operator::TRUE_DIV) {
-    // Adjust columns so lhs has the scale needed to get desired output data_type (scale)
-    auto const diff         = (lhs.type().scale() - rhs.type().scale()) - scale;
-    auto const interim_op   = diff < 0 ? binary_operator::DIV : binary_operator::MUL;
-    auto const scalar_scale = scale_type{rhs.type().scale() + scale};
-    auto const result       = [&] {
-      if (lhs.type().id() == type_id::DECIMAL32) {
-        auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(std::abs(diff));
-        auto const scalar = make_fixed_point_scalar<decimal32>(factor, scalar_scale);
-        return binary_operation(lhs, *scalar, interim_op, {}, stream, mr);
-      } else {
-        CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
-        auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(std::abs(diff));
-        auto const scalar = make_fixed_point_scalar<decimal64>(factor, scalar_scale);
-        return binary_operation(lhs, *scalar, interim_op, {}, stream, mr);
-      }
-    }();
-    binops::jit::binary_operation(out_view, result->view(), rhs, binary_operator::DIV, stream);
-    return out;
-  } else if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
-    // Adjust columns so they have they same scale TODO modify comment
+  if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) {
     if (rhs.type().scale() < lhs.type().scale()) {
       auto const diff   = lhs.type().scale() - rhs.type().scale();
       auto const result = [&] {
         if (lhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal32>(factor, scale_type{lhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal64>(factor, scale_type{lhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, result->view(), rhs, op, stream);
-      return out;
     } else {
       auto const diff   = rhs.type().scale() - lhs.type().scale();
       auto const result = [&] {
         if (lhs.type().id() == type_id::DECIMAL32) {
           auto const factor = numeric::detail::ipow<int32_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal32>(factor, scale_type{rhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal32>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         } else {
           CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE");
           auto const factor = numeric::detail::ipow<int64_t, Radix::BASE_10>(diff);
-          auto const scalar =
-            make_fixed_point_scalar<decimal64>(factor, scale_type{rhs.type().scale()});
+          auto const scalar = make_fixed_point_scalar<decimal64>(factor, scale_type{-diff});
           return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr);
         }
       }();
       binops::jit::binary_operation(out_view, lhs, result->view(), op, stream);
-      return out;
     }
   } else {
     binops::jit::binary_operation(out_view, lhs, rhs, op, stream);
-    return out;
   }
+  return output_type.scale() != scale ? cudf::cast(out_view, output_type) : std::move(out);
 }
 
 std::unique_ptr<column> binary_operation(scalar const& lhs,
@@ -730,11 +632,8 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) {
-    auto const type =
-      op == binary_operator::TRUE_DIV ? output_type : thrust::optional<data_type>{thrust::nullopt};
-    return fixed_point_binary_operation(lhs, rhs, op, type, stream, mr);
-  }
+  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
+    return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
@@ -760,11 +659,8 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) {
-    auto const type =
-      op == binary_operator::TRUE_DIV ? output_type : thrust::optional<data_type>{thrust::nullopt};
-    return fixed_point_binary_operation(lhs, rhs, op, type, stream, mr);
-  }
+  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
+    return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
@@ -792,11 +688,8 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
     return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
 
-  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) {
-    auto const type =
-      op == binary_operator::TRUE_DIV ? output_type : thrust::optional<data_type>{thrust::nullopt};
-    return fixed_point_binary_operation(lhs, rhs, op, type, stream, mr);
-  }
+  if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
+    return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
 
   // Check for datatype
   CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype");
@@ -845,6 +738,27 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
 
 }  // namespace detail
 
+int32_t binary_operation_fixed_point_scale(binary_operator op,
+                                           int32_t left_scale,
+                                           int32_t right_scale)
+{
+  CUDF_EXPECTS(cudf::detail::is_supported_fixed_point_binop(op),
+               "Unsupported fixed_point binary operation.");
+  if (op == binary_operator::MUL) return left_scale + right_scale;
+  if (op == binary_operator::DIV) return left_scale - right_scale;
+  return std::min(left_scale, right_scale);
+}
+
+cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
+                                                         cudf::data_type const& lhs,
+                                                         cudf::data_type const& rhs)
+{
+  cudf::detail::fixed_point_binary_operation_validation(op, lhs, rhs);
+
+  auto const scale = binary_operation_fixed_point_scale(op, lhs.scale(), rhs.scale());
+  return cudf::data_type{lhs.id(), scale};
+}
+
 std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          column_view const& rhs,
                                          binary_operator op,
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 15db8e6a3dd..7e3a4050b4f 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -178,11 +178,13 @@ std::unique_ptr<column> rescale(column_view input,
 
   if (input.type().scale() > scale) {
     auto const scalar = make_fixed_point_scalar<T>(0, scale_type{scale});
-    return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, stream, mr);
+    auto const type   = cudf::data_type{cudf::type_to_id<T>(), scale};
+    return detail::binary_operation(input, *scalar, binary_operator::ADD, type, stream, mr);
   } else {
     auto const diff   = input.type().scale() - scale;
     auto const scalar = make_fixed_point_scalar<T>(std::pow(10, -diff), scale_type{diff});
-    return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, stream, mr);
+    auto const type   = cudf::data_type{cudf::type_to_id<T>(), scale};
+    return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr);
   }
 };
 
diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp
index 2d17853a72b..019e72d3d3f 100644
--- a/cpp/tests/binaryop/binop-integration-test.cpp
+++ b/cpp/tests/binaryop/binop-integration-test.cpp
@@ -22,6 +22,7 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <cudf_test/column_utilities.hpp>
@@ -30,6 +31,7 @@
 
 #include <tests/binaryop/assert-binops.h>
 #include <tests/binaryop/binop-fixture.hpp>
+#include "cudf/utilities/error.hpp"
 
 namespace cudf {
 namespace test {
@@ -2043,7 +2045,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd)
   auto const rhs          = wrapper<decimalXX>(vec2.begin(), vec2.end());
   auto const expected_col = wrapper<decimalXX>(expected.begin(), expected.end());
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -2072,7 +2078,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply)
   auto const rhs          = wrapper<decimalXX>(vec2.begin(), vec2.end());
   auto const expected_col = wrapper<decimalXX>(expected.begin(), expected.end());
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view());
 }
@@ -2090,7 +2100,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2)
   auto const rhs      = fp_wrapper<RepType>{{10, 10, 10, 10, 10}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{110, 220, 330, 440, 550}, scale_type{-1}};
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2105,7 +2119,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv)
   auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{2, 7, 12, 17}, scale_type{-1}};
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2120,7 +2138,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2)
   auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{-2}};
   auto const expected = fp_wrapper<RepType>{{2, 7, 12, 17}, scale_type{1}};
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2135,7 +2157,9 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(12, scale_type{-1});
   auto const expected = fp_wrapper<RepType>{{0, 2, 4, 5}, scale_type{0}};
 
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, {});
+  auto const type = cudf::binary_operation_fixed_point_output_type(
+    cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2153,7 +2177,9 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(12, scale_type{-1});
   auto const expected = fp_wrapper<RepType>(result_begin, result_begin + 1000, scale_type{0});
 
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, {});
+  auto const type = cudf::binary_operation_fixed_point_output_type(
+    cudf::binary_operator::DIV, static_cast<cudf::column_view>(lhs).type(), rhs->type());
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2168,7 +2194,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2)
   auto const rhs      = fp_wrapper<RepType>{{100, 200, 300, 400, 500}, scale_type{-2}};
   auto const expected = fp_wrapper<RepType>{{210, 420, 630, 840, 1050}, scale_type{-2}};
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2183,7 +2213,78 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3)
   auto const rhs      = fp_wrapper<RepType>{{100, 200, 300, 400, 500}, scale_type{-2}};
   auto const expected = fp_wrapper<RepType>{{2100, 4200, 6300, 8400, 10500}, scale_type{-3}};
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(100, scale_type{-2});
+  auto const expected = fp_wrapper<RepType>{{210, 320, 430, 540, 650}, scale_type{-2}};
+
+  auto const type = cudf::binary_operation_fixed_point_output_type(
+    cudf::binary_operator::ADD, static_cast<cudf::column_view>(lhs).type(), rhs->type());
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = make_fixed_point_scalar<decimalXX>(100, scale_type{-2});
+  auto const rhs      = fp_wrapper<RepType>{{11, 22, 33, 44, 55}, scale_type{-1}};
+  auto const expected = fp_wrapper<RepType>{{210, 320, 430, 540, 650}, scale_type{-2}};
+
+  auto const type = cudf::binary_operation_fixed_point_output_type(
+    cudf::binary_operator::ADD, lhs->type(), static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col = fp_wrapper<RepType>{{3, 4, 5, 6, 7, 8}, scale_type{0}};
+
+  auto const expected1 = fp_wrapper<RepType>{{6, 8, 10, 12, 14, 16}, scale_type{0}};
+  auto const expected2 = fp_wrapper<RepType>{{0, 0, 1, 1, 1, 1}, scale_type{1}};
+  auto const type1     = cudf::data_type{cudf::type_to_id<decimalXX>(), 0};
+  auto const type2     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
+  auto const result1   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1);
+  auto const result2   = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointCast)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col      = fp_wrapper<RepType>{{6, 8, 10, 12, 14, 16}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{0, 0, 1, 1, 1, 1}, scale_type{1}};
+  auto const type     = cudf::data_type{cudf::type_to_id<decimalXX>(), 1};
+  auto const result   = cudf::cast(col, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2198,7 +2299,9 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar)
   auto const rhs      = make_fixed_point_scalar<decimalXX>(100, scale_type{-1});
   auto const expected = fp_wrapper<RepType>{{1100, 2200, 3300, 4400, 5500}, scale_type{-2}};
 
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, {});
+  auto const type = cudf::binary_operation_fixed_point_output_type(
+    cudf::binary_operator::MUL, static_cast<cudf::column_view>(lhs).type(), rhs->type());
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2213,7 +2316,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus)
   auto const rhs      = fp_wrapper<RepType>{{2250, 1005}, scale_type{-3}};
   auto const expected = fp_wrapper<RepType>{{3750, 3005}, scale_type{-3}};
 
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(lhs).type(),
+                                                   static_cast<cudf::column_view>(rhs).type());
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2229,7 +2336,56 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple)
   auto const col2     = fp_wrapper<RepType>{{100, 200, 300, 400}, scale_type{-2}};
   auto const expected = wrapper<bool>(trues.begin(), trues.end());
 
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::EQUAL, {});
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col      = fp_wrapper<RepType>{{1, 2, 3, 4}, scale_type{0}};
+  auto const expected = wrapper<bool>(trues.begin(), trues.end());
+
+  auto const result =
+    cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{0}};
+  auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
+  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
+
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
+  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
+
+  auto const result =
+    cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2251,7 +2407,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater)
   auto const iota_3  = fp_wrapper<RepType>(vec1.begin(), vec1.end(), scale_type{-3});
   auto const zeros_3 = fp_wrapper<RepType>(vec2.begin(), vec2.end(), scale_type{-1});
 
-  auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(iota_3).type(),
+                                                   static_cast<cudf::column_view>(zeros_3).type());
+  auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
 
@@ -2260,16 +2420,17 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater)
   auto const trues    = std::vector<bool>(sz, true);
   auto const true_col = wrapper<bool>(trues.begin(), trues.end());
 
+  auto const btype = cudf::data_type{type_id::BOOL8};
   auto const equal_result =
-    cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, {});
+    cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
 
   auto const less_result =
-    cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, {});
+    cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
 
   auto const greater_result =
-    cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, {});
+    cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
 }
 
@@ -2284,7 +2445,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple)
   auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 1, 1, 0, 0}, scale_type{-2}};
   auto const expected = fp_wrapper<RepType>{{40, 20, 30, 10, 0}, {1, 1, 1, 1, 0}, scale_type{-2}};
 
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
+                                                   static_cast<cudf::column_view>(col1).type(),
+                                                   static_cast<cudf::column_view>(col2).type());
+  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2300,7 +2465,11 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple)
   auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 0, 1, 1, 0}, scale_type{-1}};
   auto const expected = fp_wrapper<RepType>{{10, 30, 20, 40, 0}, {1, 1, 1, 1, 0}, scale_type{-1}};
 
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, {});
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
+                                                   static_cast<cudf::column_view>(col1).type(),
+                                                   static_cast<cudf::column_view>(col2).type());
+  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
@@ -2316,60 +2485,61 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple)
   auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
   auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
 
-  auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_EQUALS, {});
+  auto const result = cudf::binary_operation(
+    col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
+  auto const lhs      = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
   auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv2)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
+  auto const lhs      = fp_wrapper<RepType>{{100000, 300000, 500000, 700000}, scale_type{-3}};
   auto const rhs      = fp_wrapper<RepType>{{20, 20, 20, 20}, scale_type{-1}};
   auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv3)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
-  auto const rhs      = fp_wrapper<RepType>{{300, 900, 300, 300}, scale_type{-2}};
+  auto const lhs      = fp_wrapper<RepType>{{10000, 30000, 50000, 70000}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{3, 9, 3, 3}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv4)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
@@ -2377,131 +2547,127 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv4)
 
   auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
   auto const rhs      = make_fixed_point_scalar<decimalXX>(3, scale_type{0});
-  auto const expected = fp_wrapper<RepType>{{3333, 10000, 16666, 23333}, scale_type{-2}};
+  auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
 
-  auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const type   = data_type{type_to_id<decimalXX>(), 1};
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv5)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div6)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(30000, scale_type{-4});
-  auto const expected = fp_wrapper<RepType>{{3333, 10000, 16666, 23333}, scale_type{-2}};
+  auto const lhs = make_fixed_point_scalar<decimalXX>(3000, scale_type{-3});
+  auto const rhs = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
+
+  auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), -2};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv6)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  for (auto const i : {0, 1, 2, 3, 4, 5, 6, 7}) {
-    auto const val = 3 * numeric::detail::ipow<int32_t, Radix::BASE_10>(i);
-    auto const lhs = make_fixed_point_scalar<decimalXX>(val, scale_type{-i});
-    auto const rhs = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
-
-    auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
-
-    auto const type   = data_type{type_to_id<decimalXX>(), -2};
-    auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::TRUE_DIV, type);
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
-  }
-}
-
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv7)
-{
-  using namespace numeric;
-  using decimalXX = TypeParam;
-  using RepType   = device_storage_type_t<decimalXX>;
-
-  auto const lhs = make_fixed_point_scalar<decimalXX>(12000, scale_type{-1});
+  auto const lhs = make_fixed_point_scalar<decimalXX>(1200, scale_type{0});
   auto const rhs = fp_wrapper<RepType>{{100, 200, 300, 500, 600, 800, 1200, 1300}, scale_type{-2}};
 
   auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv8)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
   auto const lhs      = fp_wrapper<RepType>{{4000, 6000, 80000}, scale_type{-1}};
-  auto const rhs      = make_fixed_point_scalar<decimalXX>(500, scale_type{-2});
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(5000, scale_type{-3});
   auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 2};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv9)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{100000, 200000, 300000}, scale_type{-2}};
+  auto const lhs      = fp_wrapper<RepType>{{10, 20, 30}, scale_type{2}};
   auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{1});
   auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv10)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{100000, 200000, 300000}, scale_type{-2}};
+  auto const lhs      = fp_wrapper<RepType>{{100, 200, 300}, scale_type{1}};
   auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{0});
   auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
-TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpTrueDiv11)
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
   using RepType   = device_storage_type_t<decimalXX>;
 
-  auto const lhs      = fp_wrapper<RepType>{{1000000, 2000000, 3000000}, scale_type{-2}};
+  auto const lhs      = fp_wrapper<RepType>{{1000, 2000, 3000}, scale_type{1}};
   auto const rhs      = fp_wrapper<RepType>{{7, 7, 7}, scale_type{0}};
   auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
 
   auto const type   = data_type{type_to_id<decimalXX>(), 1};
-  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::TRUE_DIV, type);
+  auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpThrows)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
+  auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
+  auto const float_type    = data_type{type_id::FLOAT32};
+  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::MUL, float_type),
+               cudf::logic_error);
+}
+
 }  // namespace binop
 }  // namespace test
 }  // namespace cudf
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index 5f969098b48..5f74e459bb1 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -596,8 +596,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32)
   auto const expected1 = fp_wrapper{{150000000}, scale_type{6}};
   auto const expected2 = fp_wrapper{{50000000}, scale_type{6}};
 
-  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
-  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+  auto const type    = cudf::data_type{cudf::type_id::DECIMAL32, 6};
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -618,8 +619,9 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64)
   auto const expected1 = fp_wrapper{{150000000}, scale_type{100}};
   auto const expected2 = fp_wrapper{{50000000}, scale_type{100}};
 
-  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
-  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+  auto const type    = cudf::data_type{cudf::type_id::DECIMAL64, 100};
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type);
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
@@ -630,6 +632,7 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
   // This is testing fixed_point values with an extremely large negative scale. The fixed_point
   // implementation should be able to handle any scale representable by an int32_t
 
+  using decimalXX  = fixed_point<TypeParam, Radix::BASE_10>;
   using fp_wrapper = cudf::test::fixed_point_column_wrapper<TypeParam>;
 
   auto const a = fp_wrapper{{10}, scale_type{-201}};
@@ -639,8 +642,11 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale)
   auto const expected1 = fp_wrapper{{150}, scale_type{-202}};
   auto const expected2 = fp_wrapper{{5}, scale_type{-201}};
 
-  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, {});
-  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, {});
+  auto const type1   = cudf::data_type{cudf::type_to_id<decimalXX>(), -202};
+  auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1);
+
+  auto const type2   = cudf::data_type{cudf::type_to_id<decimalXX>(), -201};
+  auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view());
diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 5228c34af67..59a6b876961 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -13,7 +13,7 @@ from cudf._lib.replace import replace_nulls
 from cudf._lib.scalar import as_device_scalar
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.types import np_to_cudf_types
-from cudf._lib.types cimport underlying_type_t_type_id
+from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.scalar.scalar cimport scalar
@@ -174,15 +174,8 @@ def binaryop(lhs, rhs, op, dtype):
     cdef binary_operator c_op = <binary_operator> (
         <underlying_type_t_binary_operator> op
     )
-    cdef type_id tid = (
-        <type_id> (
-            <underlying_type_t_type_id> (
-                np_to_cudf_types[np.dtype(dtype)]
-            )
-        )
-    )
 
-    cdef data_type c_dtype = data_type(tid)
+    cdef data_type c_dtype = dtype_to_data_type(dtype)
 
     if is_scalar(lhs) or lhs is None:
         is_string_col = is_string_dtype(rhs.dtype)
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index f9b5c859ff2..ffc3fdfd70a 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -32,7 +32,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer
 from cudf._lib.types import np_to_cudf_types, cudf_to_np_types
 from cudf._lib.types cimport (
     underlying_type_t_type_id,
-    dtype_from_column_view
+    dtype_from_column_view,
+    dtype_to_data_type
 )
 from cudf._lib.null_mask import bitmask_allocation_size_bytes
 
@@ -378,29 +379,12 @@ cdef class Column:
     cdef column_view _view(self, libcudf_types.size_type null_count) except *:
         if is_categorical_dtype(self.dtype):
             col = self.base_children[0]
+            data_dtype = col.dtype
         else:
             col = self
+            data_dtype = self.dtype
 
-        data_dtype = col.dtype
-        cdef libcudf_types.type_id tid
-
-        if is_list_dtype(self.dtype):
-            tid = libcudf_types.type_id.LIST
-        elif is_struct_dtype(self.dtype):
-            tid = libcudf_types.type_id.STRUCT
-        elif is_decimal_dtype(self.dtype):
-            tid = libcudf_types.type_id.DECIMAL64
-        else:
-            tid = <libcudf_types.type_id> (
-                <underlying_type_t_type_id> (
-                    np_to_cudf_types[np.dtype(data_dtype)]
-                )
-            )
-        cdef libcudf_types.data_type dtype = (
-            libcudf_types.data_type(tid, -self.dtype.scale)
-            if tid == libcudf_types.type_id.DECIMAL64
-            else libcudf_types.data_type(tid)
-        )
+        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
         cdef void* data
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index 9b35ca2e80c..383b3665bd9 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -4,7 +4,7 @@ from libc.stdint cimport int32_t
 from libcpp cimport bool
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-
+cimport cudf._lib.cpp.types as libcudf_types
 
 ctypedef bool underlying_type_t_order
 ctypedef bool underlying_type_t_null_order
@@ -14,3 +14,5 @@ ctypedef int32_t underlying_type_t_type_id
 ctypedef bool underlying_type_t_null_policy
 
 cdef dtype_from_column_view(column_view cv)
+
+cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 370d083d7ac..e9ed4f21ddd 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -15,6 +15,7 @@ from cudf._lib.types cimport (
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
 from cudf.core.dtypes import ListDtype, StructDtype, Decimal64Dtype
+from cudf.utils.dtypes import is_decimal_dtype, is_list_dtype, is_struct_dtype
 
 cimport cudf._lib.cpp.types as libcudf_types
 
@@ -192,8 +193,7 @@ cdef dtype_from_structs_column_view(column_view cv):
 
 cdef dtype_from_decimal_column_view(column_view cv):
     scale = -cv.type().scale()
-    precision = 18  # max of 64 bit integer
-    return Decimal64Dtype(precision=precision, scale=scale)
+    return Decimal64Dtype(precision=Decimal64Dtype.MAX_PRECISION, scale=scale)
 
 cdef dtype_from_column_view(column_view cv):
     cdef libcudf_types.type_id tid = cv.type().id()
@@ -208,3 +208,20 @@ cdef dtype_from_column_view(column_view cv):
                                   "Use decimal64 instead")
     else:
         return cudf_to_np_types[<underlying_type_t_type_id>(tid)]
+
+cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
+    if is_list_dtype(dtype):
+        tid = libcudf_types.type_id.LIST
+    elif is_struct_dtype(dtype):
+        tid = libcudf_types.type_id.STRUCT
+    elif is_decimal_dtype(dtype):
+        tid = libcudf_types.type_id.DECIMAL64
+    else:
+        tid = <libcudf_types.type_id> (
+            <underlying_type_t_type_id> (
+                np_to_cudf_types[np.dtype(dtype)]))
+
+    if tid == libcudf_types.type_id.DECIMAL64:
+        return libcudf_types.data_type(tid, -dtype.scale)
+    else:
+        return libcudf_types.data_type(tid)
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 4766426892a..7fbe602f07a 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -64,7 +64,11 @@ def to_arrow(self):
     def binary_operator(self, op, other, reflect=False):
         if reflect:
             self, other = other, self
-        result = libcudf.binaryop.binaryop(self, other, op, "int32")
+        scale = _binop_scale(self.dtype, other.dtype, op)
+        output_type = Decimal64Dtype(
+            scale=scale, precision=Decimal64Dtype.MAX_PRECISION
+        )  # precision will be ignored, libcudf has no notion of precision
+        result = libcudf.binaryop.binaryop(self, other, op, output_type)
         result.dtype.precision = _binop_precision(self.dtype, other.dtype, op)
         return result
 
@@ -99,6 +103,18 @@ def as_string_column(
             )
 
 
+def _binop_scale(l_dtype, r_dtype, op):
+    # This should at some point be hooked up to libcudf's
+    # binary_operation_fixed_point_scale
+    s1, s2 = l_dtype.scale, r_dtype.scale
+    if op in ("add", "sub"):
+        return max(s1, s2)
+    elif op == "mul":
+        return s1 + s2
+    else:
+        raise NotImplementedError()
+
+
 def _binop_precision(l_dtype, r_dtype, op):
     """
     Returns the result precision when performing the
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 8b7d54b6715..a18aad3872b 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -235,7 +235,7 @@ class Decimal64Dtype(ExtensionDtype):
 
     name = "decimal"
     _metadata = ("precision", "scale")
-    _MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))
+    MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max))
 
     def __init__(self, precision, scale=0):
         """
@@ -303,10 +303,10 @@ def __hash__(self):
 
     @classmethod
     def _validate(cls, precision, scale=0):
-        if precision > Decimal64Dtype._MAX_PRECISION:
+        if precision > Decimal64Dtype.MAX_PRECISION:
             raise ValueError(
                 f"Cannot construct a {cls.__name__}"
-                f" with precision > {cls._MAX_PRECISION}"
+                f" with precision > {cls.MAX_PRECISION}"
             )
         if abs(scale) > precision:
             raise ValueError(f"scale={scale} exceeds precision={precision}")
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 20c86b2a4b7..1438421bb12 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -13,7 +13,7 @@
 from pandas.core.dtypes.dtypes import CategoricalDtype, CategoricalDtypeType
 
 import cudf
-from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
+from cudf._lib.scalar import DeviceScalar
 from cudf.core._compat import PANDAS_GE_120
 
 _NA_REP = "<NA>"
@@ -331,7 +331,10 @@ def to_cudf_compatible_scalar(val, dtype=None):
 
     If `val` is None, returns None.
     """
-    if _is_null_host_scalar(val) or isinstance(val, cudf.Scalar):
+
+    if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
+        val, cudf.Scalar
+    ):
         return val
 
     if not is_scalar(val):

From 396f7415865c6b3d933bd07f24444bef56925649 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 12 Mar 2021 20:59:46 -0500
Subject: [PATCH 25/33] CMAKE_CUDA_ARCHITECTURES doesn't change when
 build-system invokes cmake (#7579)

Consider the following:
```
cmake -DCMAKE_CUDA_ARCHITECTURES="" . #build for detected
touch <cudf_dir>/cpp/CMakeLists.txt
ninja #should build for detected

cmake -DCMAKE_CUDA_ARCHITECTURES= . #build for all
touch <cudf_dir>/cpp/CMakeLists.txt
ninja #should build for all

cmake -DCMAKE_CUDA_ARCHITECTURES="" . #build for detected
touch <cudf_dir>/cpp/CMakeLists.txt
ninja #should build for detected
```

Before these changes, the build would always go back to building
for all architectures whenever `ninja` was invoked. The issue
is that once a CMake cache variable exists it can't be removed
via `-DCMAKE_CUDA_ARCHITECTURES=` and therefore becomes sticky.

To resolve the issue, you can now pass `-DCMAKE_CUDA_ARCHITECTURES=ALL`
to consistently get all archs, and a re-run of CMake triggered by the
build system should no longer change which CUDA archs you are building for.

Authors:
  - Robert Maynard (@robertmaynard)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7579
---
 build.sh                               | 2 +-
 cpp/CMakeLists.txt                     | 3 +--
 cpp/cmake/thirdparty/CUDF_GetRMM.cmake | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/build.sh b/build.sh
index d75053f8849..5eb404d02a8 100755
--- a/build.sh
+++ b/build.sh
@@ -135,7 +135,7 @@ if hasArg clean; then
 fi
 
 if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
-    CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES="
+    CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=ALL"
     echo "Building for the architecture of the GPU in the system..."
 else
     CUDF_CMAKE_CUDA_ARCHITECTURES=""
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2e0c12d683a..2a51ad5e55a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -22,10 +22,9 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 # This needs to be run before enabling the CUDA language due to the default initialization behavior
 # of `CMAKE_CUDA_ARCHITECTURES`, https://gitlab.kitware.com/cmake/cmake/-/issues/21302
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL")
   set(CUDF_BUILD_FOR_ALL_ARCHS TRUE)
 elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "")
-  unset(CMAKE_CUDA_ARCHITECTURES CACHE)
   set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE)
 endif()
 
diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
index ccefaf2ff33..16c8a2b39f4 100644
--- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
@@ -43,7 +43,6 @@ function(find_and_configure_rmm VERSION)
         OPTIONS         "BUILD_TESTS OFF"
                         "BUILD_BENCHMARKS OFF"
                         "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}"
-                        "CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}"
                         "DISABLE_DEPRECATION_WARNING ${DISABLE_DEPRECATION_WARNING}"
     )
     cudf_restore_if_enabled(BUILD_TESTS)

From 6a1868fb7435a49399bd1379d538b88d94e5020b Mon Sep 17 00:00:00 2001
From: Mike Wendt <1915404+mike-wendt@users.noreply.github.com>
Date: Sun, 14 Mar 2021 14:53:09 -0400
Subject: [PATCH 26/33] ENH Fix stale GHA and prevent duplicates (#7594)

- Updates the stale GHA to enable more operations per run to account for the large number of issues in this repo
- Prevents `inactive-30d` labels from being applied to issues/PRs that have an `inactive-90d` label

Authors:
  - Mike Wendt (@mike-wendt)

Approvers:
  - Ray Douglass (@raydouglass)

URL: https://github.com/rapidsai/cudf/pull/7594
---
 .github/workflows/stale.yaml | 44 ++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
index 741e159fbd8..9ffe2b1ec8c 100644
--- a/.github/workflows/stale.yaml
+++ b/.github/workflows/stale.yaml
@@ -5,53 +5,53 @@ on:
     - cron: "0 * * * *"
 
 jobs:
-  mark-inactive-30d:
+  mark-inactive-90d:
     runs-on: ubuntu-latest
     steps:
-      - name: Mark 30 day inactive issues
+      - name: Mark 90 day inactive issues
         uses: actions/stale@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: >
-            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
+            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
             Please close this issue if no further response or action is needed.
             Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-issue-label: "inactive-30d"
+          stale-issue-label: "inactive-90d"
           exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 30
+          days-before-issue-stale: 90
           days-before-issue-close: -1
           stale-pr-message: >
-            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
+            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
             Please close this PR if it is no longer required.
             Otherwise, please respond with a comment indicating any updates.
-            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-pr-label: "inactive-30d"
+          stale-pr-label: "inactive-90d"
           exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 30
+          days-before-pr-stale: 90
           days-before-pr-close: -1
-          operations-per-run: 50
-  mark-inactive-90d:
+          operations-per-run: 1000
+  mark-inactive-30d:
     runs-on: ubuntu-latest
     steps:
-      - name: Mark 90 day inactive issues
+      - name: Mark 30 day inactive issues
         uses: actions/stale@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: >
-            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
+            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
             Please close this issue if no further response or action is needed.
             Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-          stale-issue-label: "inactive-90d"
-          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 90
+            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-issue-label: "inactive-30d"
+          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue,inactive-90d"
+          days-before-issue-stale: 30
           days-before-issue-close: -1
           stale-pr-message: >
-            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
+            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
             Please close this PR if it is no longer required.
             Otherwise, please respond with a comment indicating any updates.
-          stale-pr-label: "inactive-90d"
-          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 90
+            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-pr-label: "inactive-30d"
+          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue,inactive-90d"
+          days-before-pr-stale: 30
           days-before-pr-close: -1
-          operations-per-run: 50
+          operations-per-run: 1000

From 5fea6ade9daf6d8f00760ab7019f03ef0c1cdd3c Mon Sep 17 00:00:00 2001
From: Mike Wendt <1915404+mike-wendt@users.noreply.github.com>
Date: Sun, 14 Mar 2021 15:44:53 -0400
Subject: [PATCH 27/33] Revert "ENH Fix stale GHA and prevent duplicates
 (#7594)" (#7595)

Reverts rapidsai/cudf#7594

The changes made to the number of operations resulted in using all available GH API calls across the org, which prevented other GHAs from running in other repos. This reverts the change until a better solution can be determined on how to proceed.

Authors:
  - Mike Wendt (@mike-wendt)

Approvers:
  - Ray Douglass (@raydouglass)

URL: https://github.com/rapidsai/cudf/pull/7595
---
 .github/workflows/stale.yaml | 44 ++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
index 9ffe2b1ec8c..741e159fbd8 100644
--- a/.github/workflows/stale.yaml
+++ b/.github/workflows/stale.yaml
@@ -5,53 +5,53 @@ on:
     - cron: "0 * * * *"
 
 jobs:
-  mark-inactive-90d:
+  mark-inactive-30d:
     runs-on: ubuntu-latest
     steps:
-      - name: Mark 90 day inactive issues
+      - name: Mark 30 day inactive issues
         uses: actions/stale@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: >
-            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
+            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
             Please close this issue if no further response or action is needed.
             Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-          stale-issue-label: "inactive-90d"
+            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-issue-label: "inactive-30d"
           exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 90
+          days-before-issue-stale: 30
           days-before-issue-close: -1
           stale-pr-message: >
-            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
+            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
             Please close this PR if it is no longer required.
             Otherwise, please respond with a comment indicating any updates.
-          stale-pr-label: "inactive-90d"
+            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-pr-label: "inactive-30d"
           exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 90
+          days-before-pr-stale: 30
           days-before-pr-close: -1
-          operations-per-run: 1000
-  mark-inactive-30d:
+          operations-per-run: 50
+  mark-inactive-90d:
     runs-on: ubuntu-latest
     steps:
-      - name: Mark 30 day inactive issues
+      - name: Mark 90 day inactive issues
         uses: actions/stale@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           stale-issue-message: >
-            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
+            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
             Please close this issue if no further response or action is needed.
             Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-issue-label: "inactive-30d"
-          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue,inactive-90d"
-          days-before-issue-stale: 30
+          stale-issue-label: "inactive-90d"
+          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
+          days-before-issue-stale: 90
           days-before-issue-close: -1
           stale-pr-message: >
-            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
+            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
             Please close this PR if it is no longer required.
             Otherwise, please respond with a comment indicating any updates.
-            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-pr-label: "inactive-30d"
-          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue,inactive-90d"
-          days-before-pr-stale: 30
+          stale-pr-label: "inactive-90d"
+          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
+          days-before-pr-stale: 90
           days-before-pr-close: -1
-          operations-per-run: 1000
+          operations-per-run: 50

From 325d5b800b17fd8ab853904c1b51e0de0d7581ef Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Mon, 15 Mar 2021 17:08:04 +0530
Subject: [PATCH 28/33] Use device_uvector, device_span in sort groupby (#7523)

- Replace device_vector with device_uvector
- Replace device_vector const& with device_span<const>

Ref. https://github.com/rapidsai/cudf/pull/7387#discussion_r587718013

Authors:
  - Karthikeyan (@karthikeyann)

Approvers:
  - Mike Wilson (@hyperbolic2346)
  - David (@davidwendt)

URL: https://github.com/rapidsai/cudf/pull/7523
---
 .../cudf/detail/groupby/sort_helper.hpp       |  6 +-
 cpp/src/groupby/groupby.cu                    | 19 ++++---
 cpp/src/groupby/sort/group_argmax.cu          |  3 +-
 cpp/src/groupby/sort/group_argmin.cu          |  3 +-
 cpp/src/groupby/sort/group_collect.cu         |  5 +-
 cpp/src/groupby/sort/group_count.cu           |  5 +-
 cpp/src/groupby/sort/group_max.cu             |  2 +-
 cpp/src/groupby/sort/group_min.cu             |  2 +-
 cpp/src/groupby/sort/group_nth_element.cu     |  5 +-
 cpp/src/groupby/sort/group_nunique.cu         | 21 +++----
 cpp/src/groupby/sort/group_quantiles.cu       |  9 +--
 cpp/src/groupby/sort/group_reductions.hpp     | 30 +++++-----
 .../sort/group_single_pass_reduction_util.cuh |  8 +--
 cpp/src/groupby/sort/group_std.cu             | 12 ++--
 cpp/src/groupby/sort/group_sum.cu             |  3 +-
 cpp/src/groupby/sort/sort_helper.cu           | 17 ++++--
 cpp/src/rolling/grouped_rolling.cu            | 56 +++++++++----------
 17 files changed, 110 insertions(+), 96 deletions(-)

diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index 294978bf128..cadcb1265c4 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -22,7 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -40,8 +40,8 @@ namespace sort {
  *   value column
  */
 struct sort_groupby_helper {
-  using index_vector       = rmm::device_vector<size_type>;
-  using bitmask_vector     = rmm::device_vector<bitmask_type>;
+  using index_vector       = rmm::device_uvector<size_type>;
+  using bitmask_vector     = rmm::device_uvector<bitmask_type>;
   using column_ptr         = std::unique_ptr<column>;
   using index_vector_ptr   = std::unique_ptr<index_vector>;
   using bitmask_vector_ptr = std::unique_ptr<bitmask_vector>;
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 3166b2be4d4..487aed4b411 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -164,18 +164,19 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re
   CUDF_FUNC_RANGE();
   auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr);
 
-  auto group_offsets = helper().group_offsets(0);
+  auto const& group_offsets = helper().group_offsets(rmm::cuda_stream_default);
   std::vector<size_type> group_offsets_vector(group_offsets.size());
-  thrust::copy(group_offsets.begin(), group_offsets.end(), group_offsets_vector.begin());
+  thrust::copy(thrust::device_pointer_cast(group_offsets.begin()),
+               thrust::device_pointer_cast(group_offsets.end()),
+               group_offsets_vector.begin());
 
-  std::unique_ptr<table> grouped_values{nullptr};
   if (values.num_columns()) {
-    grouped_values = cudf::detail::gather(values,
-                                          helper().key_sort_order(),
-                                          cudf::out_of_bounds_policy::DONT_CHECK,
-                                          cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                          rmm::cuda_stream_default,
-                                          mr);
+    auto grouped_values = cudf::detail::gather(values,
+                                               helper().key_sort_order(),
+                                               cudf::out_of_bounds_policy::DONT_CHECK,
+                                               cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                               rmm::cuda_stream_default,
+                                               mr);
     return groupby::groups{
       std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)};
   } else {
diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu
index 94b5d3817a7..bed64c5147a 100644
--- a/cpp/src/groupby/sort/group_argmax.cu
+++ b/cpp/src/groupby/sort/group_argmax.cu
@@ -17,6 +17,7 @@
 #include <groupby/sort/group_single_pass_reduction_util.cuh>
 
 #include <cudf/detail/gather.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -27,7 +28,7 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_argmax(column_view const& values,
                                      size_type num_groups,
-                                     rmm::device_vector<size_type> const& group_labels,
+                                     cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu
index 11a350ae1c4..ec97a609390 100644
--- a/cpp/src/groupby/sort/group_argmin.cu
+++ b/cpp/src/groupby/sort/group_argmin.cu
@@ -17,6 +17,7 @@
 #include <groupby/sort/group_single_pass_reduction_util.cuh>
 
 #include <cudf/detail/gather.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -27,7 +28,7 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_argmin(column_view const& values,
                                      size_type num_groups,
-                                     rmm::device_vector<size_type> const& group_labels,
+                                     cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu
index 9c8ab92cc50..b7bcd05a72a 100644
--- a/cpp/src/groupby/sort/group_collect.cu
+++ b/cpp/src/groupby/sort/group_collect.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -26,13 +27,13 @@ namespace cudf {
 namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_collect(column_view const &values,
-                                      rmm::device_vector<size_type> const &group_offsets,
+                                      cudf::device_span<size_type const> group_offsets,
                                       size_type num_groups,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource *mr)
 {
   rmm::device_buffer offsets_data(
-    group_offsets.data().get(), group_offsets.size() * sizeof(cudf::size_type), stream, mr);
+    group_offsets.data(), group_offsets.size() * sizeof(cudf::size_type), stream, mr);
 
   auto offsets = std::make_unique<cudf::column>(
     cudf::data_type(cudf::type_to_id<cudf::size_type>()), num_groups + 1, std::move(offsets_data));
diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu
index 352fc841d11..60e0ce31db1 100644
--- a/cpp/src/groupby/sort/group_count.cu
+++ b/cpp/src/groupby/sort/group_count.cu
@@ -18,6 +18,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -29,7 +30,7 @@ namespace cudf {
 namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_count_valid(column_view const& values,
-                                          rmm::device_vector<size_type> const& group_labels,
+                                          cudf::device_span<size_type const> group_labels,
                                           size_type num_groups,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
@@ -70,7 +71,7 @@ std::unique_ptr<column> group_count_valid(column_view const& values,
   return result;
 }
 
-std::unique_ptr<column> group_count_all(rmm::device_vector<size_type> const& group_offsets,
+std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group_offsets,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu
index 06aa172d125..bd4e676b83d 100644
--- a/cpp/src/groupby/sort/group_max.cu
+++ b/cpp/src/groupby/sort/group_max.cu
@@ -23,7 +23,7 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_max(column_view const& values,
                                   size_type num_groups,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu
index 72bc3e6ba3d..a7c84ac639e 100644
--- a/cpp/src/groupby/sort/group_min.cu
+++ b/cpp/src/groupby/sort/group_min.cu
@@ -23,7 +23,7 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_min(column_view const& values,
                                   size_type num_groups,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu
index 95375f44605..5c8e8b790d4 100644
--- a/cpp/src/groupby/sort/group_nth_element.cu
+++ b/cpp/src/groupby/sort/group_nth_element.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -31,8 +32,8 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_nth_element(column_view const &values,
                                           column_view const &group_sizes,
-                                          rmm::device_vector<size_type> const &group_labels,
-                                          rmm::device_vector<size_type> const &group_offsets,
+                                          cudf::device_span<size_type const> group_labels,
+                                          cudf::device_span<size_type const> group_offsets,
                                           size_type num_groups,
                                           size_type n,
                                           null_policy null_handling,
diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu
index b14beb42435..09bbf13e8d4 100644
--- a/cpp/src/groupby/sort/group_nunique.cu
+++ b/cpp/src/groupby/sort/group_nunique.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -34,9 +35,9 @@ struct nunique_functor {
   template <typename T>
   typename std::enable_if_t<cudf::is_equality_comparable<T, T>(), std::unique_ptr<column>>
   operator()(column_view const& values,
-             rmm::device_vector<size_type> const& group_labels,
+             cudf::device_span<size_type const> group_labels,
              size_type const num_groups,
-             rmm::device_vector<size_type> const& group_offsets,
+             cudf::device_span<size_type const> group_offsets,
              null_policy null_handling,
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr)
@@ -54,8 +55,8 @@ struct nunique_functor {
         [v = *values_view,
          equal,
          null_handling,
-         group_offsets = group_offsets.data().get(),
-         group_labels  = group_labels.data().get()] __device__(auto i) -> size_type {
+         group_offsets = group_offsets.data(),
+         group_labels  = group_labels.data()] __device__(auto i) -> size_type {
           bool is_input_countable =
             (null_handling == null_policy::INCLUDE || v.is_valid_nocheck(i));
           bool is_unique = is_input_countable &&
@@ -76,8 +77,8 @@ struct nunique_functor {
         thrust::make_counting_iterator<size_type>(0),
         [v = *values_view,
          equal,
-         group_offsets = group_offsets.data().get(),
-         group_labels  = group_labels.data().get()] __device__(auto i) -> size_type {
+         group_offsets = group_offsets.data(),
+         group_labels  = group_labels.data()] __device__(auto i) -> size_type {
           bool is_unique = group_offsets[group_labels[i]] == i ||  // first element or
                            (not equal.operator()<T>(i, i - 1));    // new unique value in sorted
           return static_cast<size_type>(is_unique);
@@ -95,9 +96,9 @@ struct nunique_functor {
   template <typename T>
   typename std::enable_if_t<!cudf::is_equality_comparable<T, T>(), std::unique_ptr<column>>
   operator()(column_view const& values,
-             rmm::device_vector<size_type> const& group_labels,
+             cudf::device_span<size_type const> group_labels,
              size_type const num_groups,
-             rmm::device_vector<size_type> const& group_offsets,
+             cudf::device_span<size_type const> group_offsets,
              null_policy null_handling,
              rmm::cuda_stream_view stream,
              rmm::mr::device_memory_resource* mr)
@@ -107,9 +108,9 @@ struct nunique_functor {
 };
 }  // namespace
 std::unique_ptr<column> group_nunique(column_view const& values,
-                                      rmm::device_vector<size_type> const& group_labels,
+                                      cudf::device_span<size_type const> group_labels,
                                       size_type const num_groups,
-                                      rmm::device_vector<size_type> const& group_offsets,
+                                      cudf::device_span<size_type const> group_offsets,
                                       null_policy null_handling,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu
index d5af2de0f7a..fcadb2e71fb 100644
--- a/cpp/src/groupby/sort/group_quantiles.cu
+++ b/cpp/src/groupby/sort/group_quantiles.cu
@@ -20,6 +20,7 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <groupby/sort/group_reductions.hpp>
 #include <quantiles/quantiles_util.hpp>
@@ -77,7 +78,7 @@ struct quantiles_functor {
   std::enable_if_t<std::is_arithmetic<T>::value, std::unique_ptr<column>> operator()(
     column_view const& values,
     column_view const& group_sizes,
-    rmm::device_vector<size_type> const& group_offsets,
+    cudf::device_span<size_type const> group_offsets,
     size_type const num_groups,
     rmm::device_vector<double> const& quantile,
     interpolation interpolation,
@@ -110,7 +111,7 @@ struct quantiles_functor {
                            values_iter,
                            *group_size_view,
                            *result_view,
-                           group_offsets.data().get(),
+                           group_offsets.data(),
                            quantile.data().get(),
                            static_cast<size_type>(quantile.size()),
                            interpolation});
@@ -123,7 +124,7 @@ struct quantiles_functor {
                            values_iter,
                            *group_size_view,
                            *result_view,
-                           group_offsets.data().get(),
+                           group_offsets.data(),
                            quantile.data().get(),
                            static_cast<size_type>(quantile.size()),
                            interpolation});
@@ -145,7 +146,7 @@ struct quantiles_functor {
 // TODO: add optional check for is_sorted. Use context.flag_sorted
 std::unique_ptr<column> group_quantiles(column_view const& values,
                                         column_view const& group_sizes,
-                                        rmm::device_vector<size_type> const& group_offsets,
+                                        cudf::device_span<size_type const> group_offsets,
                                         size_type const num_groups,
                                         std::vector<double> const& quantiles,
                                         interpolation interp,
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index f5c21f1289e..b69fe6a0291 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -18,9 +18,9 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
 
 #include <memory>
 
@@ -38,7 +38,7 @@ namespace detail {
  */
 std::unique_ptr<column> group_sum(column_view const& values,
                                   size_type num_groups,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
 
@@ -53,7 +53,7 @@ std::unique_ptr<column> group_sum(column_view const& values,
  */
 std::unique_ptr<column> group_min(column_view const& values,
                                   size_type num_groups,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
 
@@ -68,7 +68,7 @@ std::unique_ptr<column> group_min(column_view const& values,
  */
 std::unique_ptr<column> group_max(column_view const& values,
                                   size_type num_groups,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
 
@@ -84,7 +84,7 @@ std::unique_ptr<column> group_max(column_view const& values,
  */
 std::unique_ptr<column> group_argmax(column_view const& values,
                                      size_type num_groups,
-                                     rmm::device_vector<size_type> const& group_labels,
+                                     cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr);
@@ -101,7 +101,7 @@ std::unique_ptr<column> group_argmax(column_view const& values,
  */
 std::unique_ptr<column> group_argmin(column_view const& values,
                                      size_type num_groups,
-                                     rmm::device_vector<size_type> const& group_labels,
+                                     cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr);
@@ -117,7 +117,7 @@ std::unique_ptr<column> group_argmin(column_view const& values,
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> group_count_valid(column_view const& values,
-                                          rmm::device_vector<size_type> const& group_labels,
+                                          cudf::device_span<size_type const> group_labels,
                                           size_type num_groups,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr);
@@ -130,7 +130,7 @@ std::unique_ptr<column> group_count_valid(column_view const& values,
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-std::unique_ptr<column> group_count_all(rmm::device_vector<size_type> const& group_offsets,
+std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group_offsets,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr);
@@ -150,7 +150,7 @@ std::unique_ptr<column> group_count_all(rmm::device_vector<size_type> const& gro
 std::unique_ptr<column> group_var(column_view const& values,
                                   column_view const& group_means,
                                   column_view const& group_sizes,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   size_type ddof,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
@@ -168,7 +168,7 @@ std::unique_ptr<column> group_var(column_view const& values,
  */
 std::unique_ptr<column> group_quantiles(column_view const& values,
                                         column_view const& group_sizes,
-                                        rmm::device_vector<size_type> const& group_offsets,
+                                        cudf::device_span<size_type const> group_offsets,
                                         size_type const num_groups,
                                         std::vector<double> const& quantiles,
                                         interpolation interp,
@@ -190,9 +190,9 @@ std::unique_ptr<column> group_quantiles(column_view const& values,
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> group_nunique(column_view const& values,
-                                      rmm::device_vector<size_type> const& group_labels,
+                                      cudf::device_span<size_type const> group_labels,
                                       size_type const num_groups,
-                                      rmm::device_vector<size_type> const& group_offsets,
+                                      cudf::device_span<size_type const> group_offsets,
                                       null_policy null_handling,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr);
@@ -213,8 +213,8 @@ std::unique_ptr<column> group_nunique(column_view const& values,
  */
 std::unique_ptr<column> group_nth_element(column_view const& values,
                                           column_view const& group_sizes,
-                                          rmm::device_vector<size_type> const& group_labels,
-                                          rmm::device_vector<size_type> const& group_offsets,
+                                          cudf::device_span<size_type const> group_labels,
+                                          cudf::device_span<size_type const> group_offsets,
                                           size_type num_groups,
                                           size_type n,
                                           null_policy null_handling,
@@ -230,7 +230,7 @@ std::unique_ptr<column> group_nth_element(column_view const& values,
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> group_collect(column_view const& values,
-                                      rmm::device_vector<size_type> const& group_offsets,
+                                      cudf::device_span<size_type const> group_offsets,
                                       size_type num_groups,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr);
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 5b26b7bf108..63a68974d6b 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -25,9 +25,9 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
@@ -54,7 +54,7 @@ struct reduce_functor {
   std::enable_if_t<is_supported<T>(), std::unique_ptr<column>> operator()(
     column_view const& values,
     size_type num_groups,
-    rmm::device_vector<cudf::size_type> const& group_labels,
+    cudf::device_span<size_type const> group_labels,
     rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr)
   {
@@ -87,7 +87,7 @@ struct reduce_functor {
                          values.size(),
                          [d_values     = *valuesview,
                           d_result     = *resultview,
-                          dest_indices = group_labels.data().get()] __device__(auto i) {
+                          dest_indices = group_labels.data()] __device__(auto i) {
                            cudf::detail::update_target_element<DeviceType, K, true, true>{}(
                              d_result, dest_indices[i], d_values, i);
                          });
@@ -97,7 +97,7 @@ struct reduce_functor {
                          values.size(),
                          [d_values     = *valuesview,
                           d_result     = *resultview,
-                          dest_indices = group_labels.data().get()] __device__(auto i) {
+                          dest_indices = group_labels.data()] __device__(auto i) {
                            cudf::detail::update_target_element<dictionary32, K, true, true>{}(
                              d_result, dest_indices[i], d_values, i);
                          });
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu
index eb85932d8eb..e49d5ebb4aa 100644
--- a/cpp/src/groupby/sort/group_std.cu
+++ b/cpp/src/groupby/sort/group_std.cu
@@ -22,10 +22,10 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
@@ -65,7 +65,7 @@ struct var_transform {
 template <typename ResultType, typename Iterator>
 void reduce_by_key_fn(column_device_view const& values,
                       Iterator values_iter,
-                      rmm::device_vector<size_type> const& group_labels,
+                      cudf::device_span<size_type const> group_labels,
                       ResultType const* d_means,
                       size_type const* d_group_sizes,
                       size_type ddof,
@@ -75,7 +75,7 @@ void reduce_by_key_fn(column_device_view const& values,
   auto var_iter = thrust::make_transform_iterator(
     thrust::make_counting_iterator(0),
     var_transform<ResultType, decltype(values_iter)>{
-      values, values_iter, d_means, d_group_sizes, group_labels.data().get(), ddof});
+      values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof});
 
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels.begin(),
@@ -91,7 +91,7 @@ struct var_functor {
     column_view const& values,
     column_view const& group_means,
     column_view const& group_sizes,
-    rmm::device_vector<size_type> const& group_labels,
+    cudf::device_span<size_type const> group_labels,
     size_type ddof,
     rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr)
@@ -110,7 +110,7 @@ struct var_functor {
     auto values_view = column_device_view::create(values, stream);
     auto d_values    = *values_view;
 
-    auto d_group_labels = group_labels.data().get();
+    auto d_group_labels = group_labels.data();
     auto d_means        = group_means.data<ResultType>();
     auto d_group_sizes  = group_sizes.data<size_type>();
     auto d_result       = result->mutable_view().data<ResultType>();
@@ -157,7 +157,7 @@ struct var_functor {
 std::unique_ptr<column> group_var(column_view const& values,
                                   column_view const& group_means,
                                   column_view const& group_sizes,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   size_type ddof,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu
index f0b50e910c4..e9e6e985c54 100644
--- a/cpp/src/groupby/sort/group_sum.cu
+++ b/cpp/src/groupby/sort/group_sum.cu
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/span.hpp>
 #include <groupby/sort/group_single_pass_reduction_util.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -24,7 +25,7 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> group_sum(column_view const& values,
                                   size_type num_groups,
-                                  rmm::device_vector<size_type> const& group_labels,
+                                  cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
index 8f3070c3497..6a9da36e21b 100644
--- a/cpp/src/groupby/sort/sort_helper.cu
+++ b/cpp/src/groupby/sort/sort_helper.cu
@@ -35,6 +35,7 @@
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/scan.h>
 #include <thrust/sequence.h>
+#include <thrust/uninitialized_fill.h>
 #include <thrust/unique.h>
 
 #include <algorithm>
@@ -160,7 +161,7 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
 {
   if (_group_offsets) return *_group_offsets;
 
-  _group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1);
+  _group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
 
   auto device_input_table = table_device_view::create(_keys, stream);
   auto sorted_order       = key_sort_order().data<size_type>();
@@ -182,9 +183,9 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
       permuted_row_equality_comparator<false>(*device_input_table, sorted_order));
   }
 
-  size_type num_groups          = thrust::distance(_group_offsets->begin(), result_end);
-  (*_group_offsets)[num_groups] = num_keys(stream);
-  _group_offsets->resize(num_groups + 1);
+  size_type num_groups = thrust::distance(_group_offsets->begin(), result_end);
+  _group_offsets->set_element(num_groups, num_keys(stream), stream);
+  _group_offsets->resize(num_groups + 1, stream);
 
   return *_group_offsets;
 }
@@ -195,12 +196,16 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(
   if (_group_labels) return *_group_labels;
 
   // Get group labels for future use in segmented sorting
-  _group_labels = std::make_unique<index_vector>(num_keys(stream));
+  _group_labels = std::make_unique<index_vector>(num_keys(stream), stream);
 
   auto& group_labels = *_group_labels;
 
   if (num_keys(stream) == 0) return group_labels;
 
+  thrust::uninitialized_fill(rmm::exec_policy(stream),
+                             group_labels.begin(),
+                             group_labels.end(),
+                             index_vector::value_type{0});
   thrust::scatter(rmm::exec_policy(stream),
                   thrust::make_constant_iterator(1, decltype(num_groups())(1)),
                   thrust::make_constant_iterator(1, num_groups()),
@@ -221,7 +226,7 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre
     data_type(type_to_id<size_type>()), _keys.num_rows(), mask_state::ALL_NULL, stream);
 
   auto group_labels_view = cudf::column_view(
-    data_type(type_to_id<size_type>()), group_labels().size(), group_labels().data().get());
+    data_type(type_to_id<size_type>()), group_labels().size(), group_labels().data());
 
   auto scatter_map = key_sort_order();
 
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 2dca4f608fd..135df6bdfe2 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -110,7 +110,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
   using sort_groupby_helper = cudf::groupby::detail::sort::sort_groupby_helper;
 
   sort_groupby_helper helper{group_keys, cudf::null_policy::INCLUDE, cudf::sorted::YES};
-  auto group_offsets{helper.group_offsets()};
+  auto const& group_offsets{helper.group_offsets()};
   auto const& group_labels{helper.group_labels()};
 
   // `group_offsets` are interpreted in adjacent pairs, each pair representing the offsets
@@ -131,8 +131,8 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
          group_offsets[group_offsets.size() - 1] == input.size() &&
          "Must have at least one group.");
 
-  auto preceding_calculator = [d_group_offsets = group_offsets.data().get(),
-                               d_group_labels  = group_labels.data().get(),
+  auto preceding_calculator = [d_group_offsets = group_offsets.data(),
+                               d_group_labels  = group_labels.data(),
                                preceding_window] __device__(size_type idx) {
     auto group_label = d_group_labels[idx];
     auto group_start = d_group_offsets[group_label];
@@ -140,8 +140,8 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                         idx - group_start + 1);  // Preceding includes current row.
   };
 
-  auto following_calculator = [d_group_offsets = group_offsets.data().get(),
-                               d_group_labels  = group_labels.data().get(),
+  auto following_calculator = [d_group_offsets = group_offsets.data(),
+                               d_group_labels  = group_labels.data(),
                                following_window] __device__(size_type idx) {
     auto group_label = d_group_labels[idx];
     auto group_end =
@@ -152,10 +152,10 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
 
   if (aggr->kind == aggregation::CUDA || aggr->kind == aggregation::PTX) {
     cudf::detail::preceding_window_wrapper grouped_preceding_window{
-      group_offsets.data().get(), group_labels.data().get(), preceding_window};
+      group_offsets.data(), group_labels.data(), preceding_window};
 
     cudf::detail::following_window_wrapper grouped_following_window{
-      group_offsets.data().get(), group_labels.data().get(), following_window};
+      group_offsets.data(), group_labels.data(), following_window};
 
     return cudf::detail::rolling_window_udf(input,
                                             grouped_preceding_window,
@@ -371,7 +371,7 @@ std::unique_ptr<column> time_range_window_ASC(column_view const& input,
 /// If there are no nulls for any given group, (nulls_begin, nulls_end) == (0,0).
 std::tuple<rmm::device_vector<size_type>, rmm::device_vector<size_type>>
 get_null_bounds_for_timestamp_column(column_view const& timestamp_column,
-                                     rmm::device_vector<size_type> const& group_offsets)
+                                     rmm::device_uvector<size_type> const& group_offsets)
 {
   // For each group, the null values are themselves clustered
   // at the beginning or the end of the group.
@@ -392,7 +392,7 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column,
       thrust::make_counting_iterator(static_cast<size_type>(0)),
       thrust::make_counting_iterator(static_cast<size_type>(num_groups)),
       [d_timestamps    = *p_timestamps_device_view,
-       d_group_offsets = group_offsets.data().get(),
+       d_group_offsets = group_offsets.data(),
        d_null_start    = null_start.data(),
        d_null_end      = null_end.data()] __device__(auto group_label) {
         auto group_start           = d_group_offsets[group_label];
@@ -434,8 +434,8 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column,
 std::unique_ptr<column> time_range_window_ASC(
   column_view const& input,
   column_view const& timestamp_column,
-  rmm::device_vector<cudf::size_type> const& group_offsets,
-  rmm::device_vector<cudf::size_type> const& group_labels,
+  rmm::device_uvector<cudf::size_type> const& group_offsets,
+  rmm::device_uvector<cudf::size_type> const& group_labels,
   TimeT preceding_window,
   bool preceding_window_is_unbounded,
   TimeT following_window,
@@ -450,8 +450,8 @@ std::unique_ptr<column> time_range_window_ASC(
     get_null_bounds_for_timestamp_column(timestamp_column, group_offsets);
 
   auto preceding_calculator =
-    [d_group_offsets = group_offsets.data().get(),
-     d_group_labels  = group_labels.data().get(),
+    [d_group_offsets = group_offsets.data(),
+     d_group_labels  = group_labels.data(),
      d_timestamps    = timestamp_column.data<TimeT>(),
      d_nulls_begin   = null_start.data().get(),
      d_nulls_end     = null_end.data().get(),
@@ -490,8 +490,8 @@ std::unique_ptr<column> time_range_window_ASC(
   auto preceding_column = expand_to_column(preceding_calculator, input.size(), stream, mr);
 
   auto following_calculator =
-    [d_group_offsets = group_offsets.data().get(),
-     d_group_labels  = group_labels.data().get(),
+    [d_group_offsets = group_offsets.data(),
+     d_group_labels  = group_labels.data(),
      d_timestamps    = timestamp_column.data<TimeT>(),
      d_nulls_begin   = null_start.data().get(),
      d_nulls_end     = null_end.data().get(),
@@ -633,8 +633,8 @@ std::unique_ptr<column> time_range_window_DESC(column_view const& input,
 std::unique_ptr<column> time_range_window_DESC(
   column_view const& input,
   column_view const& timestamp_column,
-  rmm::device_vector<cudf::size_type> const& group_offsets,
-  rmm::device_vector<cudf::size_type> const& group_labels,
+  rmm::device_uvector<cudf::size_type> const& group_offsets,
+  rmm::device_uvector<cudf::size_type> const& group_labels,
   TimeT preceding_window,
   bool preceding_window_is_unbounded,
   TimeT following_window,
@@ -649,8 +649,8 @@ std::unique_ptr<column> time_range_window_DESC(
     get_null_bounds_for_timestamp_column(timestamp_column, group_offsets);
 
   auto preceding_calculator =
-    [d_group_offsets = group_offsets.data().get(),
-     d_group_labels  = group_labels.data().get(),
+    [d_group_offsets = group_offsets.data(),
+     d_group_labels  = group_labels.data(),
      d_timestamps    = timestamp_column.data<TimeT>(),
      d_nulls_begin   = null_start.data().get(),
      d_nulls_end     = null_end.data().get(),
@@ -691,8 +691,8 @@ std::unique_ptr<column> time_range_window_DESC(
   auto preceding_column = expand_to_column(preceding_calculator, input.size(), stream, mr);
 
   auto following_calculator =
-    [d_group_offsets = group_offsets.data().get(),
-     d_group_labels  = group_labels.data().get(),
+    [d_group_offsets = group_offsets.data(),
+     d_group_labels  = group_labels.data(),
      d_timestamps    = timestamp_column.data<TimeT>(),
      d_nulls_begin   = null_start.data().get(),
      d_nulls_end     = null_end.data().get(),
@@ -745,8 +745,8 @@ std::unique_ptr<column> grouped_time_range_rolling_window_impl(
   column_view const& input,
   column_view const& timestamp_column,
   cudf::order const& timestamp_ordering,
-  rmm::device_vector<cudf::size_type> const& group_offsets,
-  rmm::device_vector<cudf::size_type> const& group_labels,
+  rmm::device_uvector<cudf::size_type> const& group_offsets,
+  rmm::device_uvector<cudf::size_type> const& group_labels,
   window_bounds preceding_window_in_days,  // TODO: Consider taking offset-type as type_id. Assumes
                                            // days for now.
   window_bounds following_window_in_days,
@@ -758,7 +758,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window_impl(
   TimeT mult_factor{static_cast<TimeT>(multiplication_factor(timestamp_column.type()))};
 
   if (timestamp_ordering == cudf::order::ASCENDING) {
-    return group_offsets.empty()
+    return group_offsets.is_empty()
              ? time_range_window_ASC(input,
                                      timestamp_column,
                                      preceding_window_in_days.value * mult_factor,
@@ -782,7 +782,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window_impl(
                                      stream,
                                      mr);
   } else {
-    return group_offsets.empty()
+    return group_offsets.is_empty()
              ? time_range_window_DESC(input,
                                       timestamp_column,
                                       preceding_window_in_days.value * mult_factor,
@@ -835,11 +835,11 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
   using sort_groupby_helper = cudf::groupby::detail::sort::sort_groupby_helper;
   using index_vector        = sort_groupby_helper::index_vector;
 
-  index_vector group_offsets, group_labels;
+  index_vector group_offsets(0, stream), group_labels(0, stream);
   if (group_keys.num_columns() > 0) {
     sort_groupby_helper helper{group_keys, cudf::null_policy::INCLUDE, cudf::sorted::YES};
-    group_offsets = helper.group_offsets();
-    group_labels  = helper.group_labels();
+    group_offsets = index_vector(helper.group_offsets(), stream);
+    group_labels  = index_vector(helper.group_labels(), stream);
   }
 
   // Assumes that `timestamp_column` is actually of a timestamp type.

From 36f18c8974c178eae2f58b936c83738a35168521 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 15 Mar 2021 11:09:39 -0700
Subject: [PATCH 29/33] Fix ORC issue with incorrect timestamp nanosecond
 values (#7581)

Closes #7355

Use 64-bit variables/buffers to handle nanosecond values, since the encoded nanosecond value can overflow 32 bits in some cases.
Removed the overloaded `intrle_minmax` function, using templated `numeric_limits` functions instead (the alternative was to add another overload).

Performance impact evaluation pending, but this fix seems unavoidable regardless of the impact.

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - GALI PREM SAGAR (@galipremsagar)
  - Devavret Makkar (@devavret)
  - Kumar Aatish (@kaatish)

URL: https://github.com/rapidsai/cudf/pull/7581
---
 cpp/src/io/orc/stripe_data.cu      | 34 ++++++++++++++++++++--------
 cpp/src/io/orc/stripe_enc.cu       | 36 +++++++++---------------------
 python/cudf/cudf/tests/test_orc.py | 14 ++++++++++++
 3 files changed, 49 insertions(+), 35 deletions(-)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 4bca725a16b..1ff752034ad 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1455,8 +1455,9 @@ __global__ void __launch_bounds__(block_size)
     __syncthreads();
     // Decode data streams
     {
-      uint32_t numvals      = s->top.data.max_vals, secondary_val;
-      uint32_t vals_skipped = 0;
+      uint32_t numvals       = s->top.data.max_vals;
+      uint64_t secondary_val = 0;
+      uint32_t vals_skipped  = 0;
       if (s->is_string || s->chunk.type_kind == TIMESTAMP) {
         // For these data types, we have a secondary unsigned 32-bit data stream
         orc_bytestream_s *bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2;
@@ -1471,9 +1472,15 @@ __global__ void __launch_bounds__(block_size)
         }
         if (numvals > ofs) {
           if (is_rlev1(s->chunk.encoding_kind)) {
-            numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t);
+            if (s->chunk.type_kind == TIMESTAMP)
+              numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u64[ofs], numvals - ofs, t);
+            else
+              numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t);
           } else {
-            numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t);
+            if (s->chunk.type_kind == TIMESTAMP)
+              numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t);
+            else
+              numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t);
           }
           __syncthreads();
           if (numvals <= ofs && t >= ofs && t < s->top.data.max_vals) { s->vals.u32[t] = 0; }
@@ -1487,15 +1494,24 @@ __global__ void __launch_bounds__(block_size)
             __syncthreads();
             if (t == 0) { s->top.data.index.run_pos[cid] = 0; }
             numvals -= vals_skipped;
-            if (t < numvals) { secondary_val = s->vals.u32[vals_skipped + t]; }
+            if (t < numvals) {
+              secondary_val = (s->chunk.type_kind == TIMESTAMP) ? s->vals.u64[vals_skipped + t]
+                                                                : s->vals.u32[vals_skipped + t];
+            }
             __syncthreads();
-            if (t < numvals) { s->vals.u32[t] = secondary_val; }
+            if (t < numvals) {
+              if (s->chunk.type_kind == TIMESTAMP)
+                s->vals.u64[t] = secondary_val;
+              else
+                s->vals.u32[t] = secondary_val;
+            }
           }
         }
         __syncthreads();
         // For strings with direct encoding, we need to convert the lengths into an offset
         if (!is_dictionary(s->chunk.encoding_kind)) {
-          secondary_val = (t < numvals) ? s->vals.u32[t] : 0;
+          if (t < numvals)
+            secondary_val = (s->chunk.type_kind == TIMESTAMP) ? s->vals.u64[t] : s->vals.u32[t];
           if (s->chunk.type_kind != TIMESTAMP) {
             lengths_to_positions(s->vals.u32, numvals, t);
             __syncthreads();
@@ -1693,7 +1709,7 @@ __global__ void __launch_bounds__(block_size)
             }
             case TIMESTAMP: {
               int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch;
-              uint32_t nanos  = secondary_val;
+              uint64_t nanos  = secondary_val;
               nanos           = (nanos >> 3) * kTimestampNanoScale[nanos & 7];
               if (!tz_table.ttimes.empty()) {
                 seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds);
@@ -1716,7 +1732,7 @@ __global__ void __launch_bounds__(block_size)
       if (s->chunk.type_kind == TIMESTAMP) {
         int buffer_pos = s->top.data.max_vals;
         if (t >= buffer_pos && t < buffer_pos + s->top.data.buffered_count) {
-          s->vals.u32[t - buffer_pos] = secondary_val;
+          s->vals.u64[t - buffer_pos] = secondary_val;
         }
       } else if (s->chunk.type_kind == BOOLEAN && t < s->top.data.buffered_count) {
         s->vals.u8[t] = secondary_val;
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 88cad005817..aef32efaf6e 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -92,6 +92,7 @@ struct orcenc_state_s {
   union {
     uint8_t u8[2048];
     uint32_t u32[1024];
+    uint64_t u64[1024];
   } lengths;
 };
 
@@ -101,6 +102,7 @@ static inline __device__ uint32_t zigzag(int32_t v)
   int32_t s = (v >> 31);
   return ((v ^ s) * 2) - s;
 }
+static inline __device__ uint64_t zigzag(uint64_t v) { return v; }
 static inline __device__ uint64_t zigzag(int64_t v)
 {
   int64_t s = (v < 0) ? 1 : 0;
@@ -286,24 +288,6 @@ static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v)
   return bytecnt;
 }
 
-static inline __device__ void intrle_minmax(int64_t &vmin, int64_t &vmax)
-{
-  vmin = INT64_MIN;
-  vmax = INT64_MAX;
-}
-// static inline __device__ void intrle_minmax(uint64_t &vmin, uint64_t &vmax) { vmin = UINT64_C(0);
-// vmax = UINT64_MAX; }
-static inline __device__ void intrle_minmax(int32_t &vmin, int32_t &vmax)
-{
-  vmin = INT32_MIN;
-  vmax = INT32_MAX;
-}
-static inline __device__ void intrle_minmax(uint32_t &vmin, uint32_t &vmax)
-{
-  vmin = UINT32_C(0);
-  vmax = UINT32_MAX;
-}
-
 template <class T>
 static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, uint32_t w)
 {
@@ -412,13 +396,9 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s,
     // Find minimum and maximum values
     if (literal_run > 0) {
       // Find min & max
-      T vmin, vmax;
+      T vmin = (t < literal_run) ? v0 : std::numeric_limits<T>::max();
+      T vmax = (t < literal_run) ? v0 : std::numeric_limits<T>::min();
       uint32_t literal_mode, literal_w;
-      if (t < literal_run) {
-        vmin = vmax = v0;
-      } else {
-        intrle_minmax(vmax, vmin);
-      }
       vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min());
       __syncthreads();
       vmax = block_reduce(temp_storage).Reduce(vmax, cub::Max());
@@ -652,6 +632,7 @@ __global__ void __launch_bounds__(block_size)
     typename cub::BlockReduce<int32_t, block_size>::TempStorage i32;
     typename cub::BlockReduce<int64_t, block_size>::TempStorage i64;
     typename cub::BlockReduce<uint32_t, block_size>::TempStorage u32;
+    typename cub::BlockReduce<uint64_t, block_size>::TempStorage u64;
   } temp_storage;
 
   orcenc_state_s *const s = &state_g;
@@ -763,7 +744,7 @@ __global__ void __launch_bounds__(block_size)
             int64_t ts       = static_cast<const int64_t *>(base)[row];
             int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)];
             int64_t seconds  = ts / ts_scale;
-            int32_t nanos    = (ts - seconds * ts_scale);
+            int64_t nanos    = (ts - seconds * ts_scale);
             // There is a bug in the ORC spec such that for negative timestamps, it is understood
             // between the writer and reader that nanos will be adjusted to their positive component
             // but the negative seconds will be left alone. This means that -2.6 is encoded as
@@ -786,7 +767,7 @@ __global__ void __launch_bounds__(block_size)
               }
               nanos = (nanos << 3) + zeroes;
             }
-            s->lengths.u32[nz_idx] = nanos;
+            s->lengths.u64[nz_idx] = nanos;
             break;
           }
           case STRING:
@@ -897,6 +878,9 @@ __global__ void __launch_bounds__(block_size)
         uint32_t flush = (s->cur_row == s->chunk.num_rows) ? 1 : 0, n;
         switch (s->chunk.type_kind) {
           case TIMESTAMP:
+            n = IntegerRLE<CI_DATA2, uint64_t, false, 0x3ff, block_size>(
+              s, s->lengths.u64, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u64);
+            break;
           case STRING:
             n = IntegerRLE<CI_DATA2, uint32_t, false, 0x3ff, block_size>(
               s, s->lengths.u32, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u32);
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index ed91e909f25..ca8aa00f80c 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -724,3 +724,17 @@ def test_orc_bool_encode_fail():
     # Also validate data
     pdf = pa.orc.ORCFile(buffer).read().to_pandas()
     assert_eq(okay_df, pdf)
+
+
+def test_nanoseconds_overflow():
+    buffer = BytesIO()
+    # Use nanosecond values that take more than 32 bits to encode
+    s = cudf.Series([710424008, -1338482640], dtype="datetime64[ns]")
+    expected = cudf.DataFrame({"s": s})
+    expected.to_orc(buffer)
+
+    cudf_got = cudf.read_orc(buffer)
+    assert_eq(expected, cudf_got)
+
+    pyarrow_got = pa.orc.ORCFile(buffer).read()
+    assert_eq(expected.to_pandas(), pyarrow_got.to_pandas())

From 05bb2f06ad05b4db1cde08e947797729e4a4b9dd Mon Sep 17 00:00:00 2001
From: Paul Taylor <paul.e.taylor@me.com>
Date: Mon, 15 Mar 2021 18:56:30 -0500
Subject: [PATCH 30/33] Fix auto-detecting GPU architectures (#7593)

Fixes regression from https://github.com/rapidsai/cudf/pull/7579 in auto-detecting GPU architectures when `-DCMAKE_CUDA_ARCHITECTURES=` is passed on the CLI.

Now that the cached `CMAKE_CUDA_ARCHITECTURES` isn't unset before calling `enable_language(CUDA)`, this call throws an error and configuration fails. This change ensures we call `enable_language(CUDA)` after any potential rewrites of `CMAKE_CUDA_ARCHITECTURES`.

This PR also aligns with RMM's `EvalGPUArchs.cmake` logic and prints `SUPPORTED_CUDA_ARCHITECTURES` instead of `"ALL"` in the case the current machine is a CPU-only node.

Related: https://github.com/rapidsai/rmm/pull/727

Authors:
  - Paul Taylor (@trxcllnt)
  - Robert Maynard (@robertmaynard)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7593
---
 build.sh                              |  2 +-
 cpp/cmake/Modules/ConfigureCUDA.cmake | 22 ++++---------
 cpp/cmake/Modules/EvalGPUArchs.cmake  | 37 ++++++++++++---------
 cpp/cmake/Modules/SetGPUArchs.cmake   | 46 +++++++++++++++------------
 4 files changed, 55 insertions(+), 52 deletions(-)

diff --git a/build.sh b/build.sh
index 5eb404d02a8..d75053f8849 100755
--- a/build.sh
+++ b/build.sh
@@ -135,7 +135,7 @@ if hasArg clean; then
 fi
 
 if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
-    CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=ALL"
+    CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES="
     echo "Building for the architecture of the GPU in the system..."
 else
     CUDF_CMAKE_CUDA_ARCHITECTURES=""
diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake
index 44699a13206..d4be6e65021 100644
--- a/cpp/cmake/Modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/Modules/ConfigureCUDA.cmake
@@ -17,26 +17,16 @@
 # Find the CUDAToolkit
 find_package(CUDAToolkit REQUIRED)
 
-# Must come after find_package(CUDAToolkit) because we symlink
-# ccache as a compiler front-end for nvcc in gpuCI CPU builds.
-enable_language(CUDA)
-
-if(CMAKE_CUDA_COMPILER_VERSION)
-  # Compute the version. from  CMAKE_CUDA_COMPILER_VERSION
-  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${CMAKE_CUDA_COMPILER_VERSION})
-  string(REGEX REPLACE "([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${CMAKE_CUDA_COMPILER_VERSION})
-  set(CUDA_VERSION "${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}")
-endif()
-
-message(VERBOSE "CUDF: CUDA_VERSION_MAJOR: ${CUDA_VERSION_MAJOR}")
-message(VERBOSE "CUDF: CUDA_VERSION_MINOR: ${CUDA_VERSION_MINOR}")
-message(STATUS "CUDF: CUDA_VERSION: ${CUDA_VERSION}")
-
 # Auto-detect available GPU compute architectures
-
 include(${CUDF_SOURCE_DIR}/cmake/Modules/SetGPUArchs.cmake)
 message(STATUS "CUDF: Building CUDF for GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
+# Must come after find_package(CUDAToolkit) because we symlink
+# ccache as a compiler front-end for nvcc in gpuCI CPU builds.
+# Must also come after we detect and potentially rewrite
+# CMAKE_CUDA_ARCHITECTURES
+enable_language(CUDA)
+
 if(CMAKE_COMPILER_IS_GNUCXX)
     list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
     if(CUDF_BUILD_TESTS OR CUDF_BUILD_BENCHMARKS)
diff --git a/cpp/cmake/Modules/EvalGPUArchs.cmake b/cpp/cmake/Modules/EvalGPUArchs.cmake
index 6c747a0b867..09e42c6cc7a 100644
--- a/cpp/cmake/Modules/EvalGPUArchs.cmake
+++ b/cpp/cmake/Modules/EvalGPUArchs.cmake
@@ -14,12 +14,21 @@
 # limitations under the License.
 #=============================================================================
 
+# Unset this first in case it's set to <empty_string>
+set(CMAKE_CUDA_ARCHITECTURES OFF)
+
+# Enable CUDA so we can invoke nvcc
+enable_language(CUDA)
+
+# Function uses the CUDA runtime API to query the compute capability of the device, so if a user
+# doesn't pass any architecture options to CMake we only build the current architecture
 function(evaluate_gpu_archs gpu_archs)
   set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu)
   set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs)
   set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log)
-  file(WRITE ${eval_file}
-[=[
+  file(
+    WRITE ${eval_file}
+    "
 #include <cstdio>
 #include <set>
 #include <string>
@@ -32,32 +41,30 @@ int main(int argc, char** argv) {
       char buff[32];
       cudaDeviceProp prop;
       if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
-      sprintf(buff, "%d%d", prop.major, prop.minor);
+      sprintf(buff, \"%d%d\", prop.major, prop.minor);
       archs.insert(buff);
     }
   }
   if(archs.empty()) {
-    printf("ALL");
+    printf(\"${SUPPORTED_CUDA_ARCHITECTURES}\");
   } else {
     bool first = true;
     for(const auto& arch : archs) {
-      printf(first? "%s" : ";%s", arch.c_str());
+      printf(first? \"%s\" : \";%s\", arch.c_str());
       first = false;
     }
   }
-  printf("\n");
+  printf(\"\\n\");
   return 0;
 }
-]=])
+")
   execute_process(
-    COMMAND ${CMAKE_CUDA_COMPILER}
-      -std=c++11
-      -o ${eval_exe}
-      --run
-      ${eval_file}
+    COMMAND ${CMAKE_CUDA_COMPILER} -std=c++11 -o ${eval_exe} --run ${eval_file}
     OUTPUT_VARIABLE __gpu_archs
     OUTPUT_STRIP_TRAILING_WHITESPACE
     ERROR_FILE ${error_file})
-  message(VERBOSE "CUDF: Auto detection of gpu-archs: ${__gpu_archs}")
-  set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE)
-endfunction()
+  message(STATUS "CUDF: Auto detection of gpu-archs: ${__gpu_archs}")
+  set(${gpu_archs}
+      ${__gpu_archs}
+      PARENT_SCOPE)
+endfunction(evaluate_gpu_archs)
diff --git a/cpp/cmake/Modules/SetGPUArchs.cmake b/cpp/cmake/Modules/SetGPUArchs.cmake
index 396023ee9a9..61e4e6bc198 100644
--- a/cpp/cmake/Modules/SetGPUArchs.cmake
+++ b/cpp/cmake/Modules/SetGPUArchs.cmake
@@ -25,35 +25,41 @@ else()
   list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "62" "72")
 endif()
 
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11)
+# CMake < 3.20 has a bug in FindCUDAToolkit where it won't properly detect the CUDAToolkit version
+# when find_package(CUDAToolkit) occurs before enable_language(CUDA)
+if(NOT DEFINED CUDAToolkit_VERSION AND CMAKE_CUDA_COMPILER)
+  execute_process(COMMAND ${CMAKE_CUDA_COMPILER} "--version" OUTPUT_VARIABLE NVCC_OUT)
+  if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=])
+    set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}")
+    set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}")
+    set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}")
+    set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}")
+  endif()
+  unset(NVCC_OUT)
+endif()
+
+if(CUDAToolkit_VERSION_MAJOR LESS 11)
   list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "80")
 endif()
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 10)
+if(CUDAToolkit_VERSION_MAJOR LESS 10)
   list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "75")
 endif()
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9)
+if(CUDAToolkit_VERSION_MAJOR LESS 9)
   list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "70")
 endif()
 
-if(CUDF_BUILD_FOR_DETECTED_ARCHS)
-  include(${CUDF_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake)
-  evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES)
-  if(CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL")
-    unset(CMAKE_CUDA_ARCHITECTURES CACHE)
-    set(CUDF_BUILD_FOR_ALL_ARCHS TRUE)
-  else()
-    set(CUDF_BUILD_FOR_ALL_ARCHS FALSE)
-    list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real")
-  endif()
-endif()
-
-if(CUDF_BUILD_FOR_ALL_ARCHS)
+if(${PROJECT_NAME}_BUILD_FOR_ALL_ARCHS)
   set(CMAKE_CUDA_ARCHITECTURES ${SUPPORTED_CUDA_ARCHITECTURES})
-  # CMake architecture list entry of "80" means to build compute and sm.
-  # What we want is for the newest arch only to build that way
-  # while the rest built only for sm.
-  list(SORT CMAKE_CUDA_ARCHITECTURES ORDER ASCENDING)
+
+  # CMake architecture list entry of "80" means to build compute and sm. What we want is for the
+  # newest arch only to build that way while the rest built only for sm.
   list(POP_BACK CMAKE_CUDA_ARCHITECTURES latest_arch)
   list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real")
   list(APPEND CMAKE_CUDA_ARCHITECTURES ${latest_arch})
+
+elseif(${PROJECT_NAME}_BUILD_FOR_DETECTED_ARCHS)
+  include(${PROJECT_SOURCE_DIR}/cmake/Modules/EvalGPUArchs.cmake)
+  evaluate_gpu_archs(CMAKE_CUDA_ARCHITECTURES)
+
+  list(TRANSFORM CMAKE_CUDA_ARCHITECTURES APPEND "-real")
 endif()

From 561f68a387578cf491da27c475ee7439ecd8855f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 15 Mar 2021 19:23:32 -0500
Subject: [PATCH 31/33] Match Pandas logic for comparing two objects with nulls
 (#7490)

Fixes https://github.com/rapidsai/cudf/issues/7066

Authors:
  - @brandon-b-miller

Approvers:
  - Ashwin Srinath (@shwina)
  - Christopher Harris (@cwharris)
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7490
---
 python/cudf/cudf/_lib/binaryop.pyx          |  20 +--
 python/cudf/cudf/_lib/cpp/binaryop.pxd      |   1 +
 python/cudf/cudf/_lib/reduce.pyx            |   2 +
 python/cudf/cudf/core/column/categorical.py |   6 +-
 python/cudf/cudf/core/column/column.py      |   6 +-
 python/cudf/cudf/core/column/datetime.py    |   2 +-
 python/cudf/cudf/core/column/numerical.py   |  13 +-
 python/cudf/cudf/core/column/string.py      |   3 +-
 python/cudf/cudf/core/column/timedelta.py   |   2 +-
 python/cudf/cudf/core/dataframe.py          |  14 +-
 python/cudf/cudf/core/frame.py              |   5 +-
 python/cudf/cudf/core/series.py             |   6 +-
 python/cudf/cudf/tests/test_binops.py       | 144 ++++++++++++++++----
 python/cudf/cudf/tests/test_dataframe.py    |  28 ++--
 python/cudf/cudf/tests/test_indexing.py     |  11 --
 python/cudf/cudf/tests/test_setitem.py      |   7 +-
 16 files changed, 182 insertions(+), 88 deletions(-)

diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 59a6b876961..5eaec640b15 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -93,6 +93,9 @@ class BinaryOperation(IntEnum):
     GENERIC_BINARY = (
         <underlying_type_t_binary_operator> binary_operator.GENERIC_BINARY
     )
+    NULL_EQUALS = (
+        <underlying_type_t_binary_operator> binary_operator.NULL_EQUALS
+    )
 
 
 cdef binaryop_v_v(Column lhs, Column rhs,
@@ -154,17 +157,6 @@ cdef binaryop_s_v(DeviceScalar lhs, Column rhs,
     return Column.from_unique_ptr(move(c_result))
 
 
-def handle_null_for_string_column(Column input_col, op):
-    if op in ('eq', 'lt', 'le', 'gt', 'ge'):
-        return replace_nulls(input_col, DeviceScalar(False, 'bool'))
-
-    elif op == 'ne':
-        return replace_nulls(input_col, DeviceScalar(True, 'bool'))
-
-    # Nothing needs to be done
-    return input_col
-
-
 def binaryop(lhs, rhs, op, dtype):
     """
     Dispatches a binary op call to the appropriate libcudf function:
@@ -205,11 +197,7 @@ def binaryop(lhs, rhs, op, dtype):
             c_op,
             c_dtype
         )
-
-    if is_string_col is True:
-        return handle_null_for_string_column(result, op.name.lower())
-    else:
-        return result
+    return result
 
 
 def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd
index fb36fdfd639..2e36070a164 100644
--- a/python/cudf/cudf/_lib/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd
@@ -27,6 +27,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         GREATER "cudf::binary_operator::GREATER"
         LESS_EQUAL "cudf::binary_operator::LESS_EQUAL"
         GREATER_EQUAL "cudf::binary_operator::GREATER_EQUAL"
+        NULL_EQUALS "cudf::binary_operator::NULL_EQUALS"
         BITWISE_AND "cudf::binary_operator::BITWISE_AND"
         BITWISE_OR "cudf::binary_operator::BITWISE_OR"
         BITWISE_XOR "cudf::binary_operator::BITWISE_XOR"
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 7b455dd574b..2185cb089a7 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -57,6 +57,8 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
             return incol.dtype.type(0)
         if reduction_op == 'product':
             return incol.dtype.type(1)
+        if reduction_op == "any":
+            return False
 
         return cudf.utils.dtypes._get_nan_for_dtype(col_dtype)
 
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index c41a458f02b..39c278d2abf 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1014,7 +1014,11 @@ def slice(
     def binary_operator(
         self, op: str, rhs, reflect: bool = False
     ) -> ColumnBase:
-        if not (self.ordered and rhs.ordered) and op not in ("eq", "ne"):
+        if not (self.ordered and rhs.ordered) and op not in (
+            "eq",
+            "ne",
+            "NULL_EQUALS",
+        ):
             if op in ("lt", "gt", "le", "ge"):
                 raise TypeError(
                     "Unordered Categoricals can only compare equality or not"
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 2bb35c97d7c..b2b2874eeb4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -179,7 +179,11 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool:
         if check_dtypes:
             if self.dtype != other.dtype:
                 return False
-        return (self == other).min()
+        null_equals = self._null_equals(other)
+        return null_equals.all()
+
+    def _null_equals(self, other: ColumnBase) -> ColumnBase:
+        return self.binary_operator("NULL_EQUALS", other)
 
     def all(self) -> bool:
         return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_))
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 7c5385b9bbf..a563248f4ab 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -274,7 +274,7 @@ def binary_operator(
         if isinstance(rhs, cudf.DateOffset):
             return binop_offset(self, rhs, op)
         lhs, rhs = self, rhs
-        if op in ("eq", "ne", "lt", "gt", "le", "ge"):
+        if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"):
             out_dtype = np.dtype(np.bool_)  # type: Dtype
         elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype):
             out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype(
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 6fae8c644e3..7ad6eed65a8 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -700,16 +700,21 @@ def _numeric_column_binop(
     if reflect:
         lhs, rhs = rhs, lhs
 
-    is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
+    is_op_comparison = op in [
+        "lt",
+        "gt",
+        "le",
+        "ge",
+        "eq",
+        "ne",
+        "NULL_EQUALS",
+    ]
 
     if is_op_comparison:
         out_dtype = "bool"
 
     out = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
-    if is_op_comparison:
-        out = out.fillna(op == "ne")
-
     return out
 
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 81abdd3f66a..ea01aa07b91 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -434,7 +434,6 @@ def cat(self, others=None, sep=None, na_rep=None):
         3    dD
         dtype: object
         """
-
         if sep is None:
             sep = ""
 
@@ -5109,7 +5108,7 @@ def binary_operator(
         if isinstance(rhs, (StringColumn, str, cudf.Scalar)):
             if op == "add":
                 return cast("column.ColumnBase", lhs.str().cat(others=rhs))
-            elif op in ("eq", "ne", "gt", "lt", "ge", "le"):
+            elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"):
                 return _string_column_binop(self, rhs, op=op, out_dtype="bool")
 
         raise TypeError(
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index ac63192b692..e22b511db01 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -223,7 +223,7 @@ def binary_operator(
 
         if op in ("eq", "ne"):
             out_dtype = self._binary_op_eq_ne(rhs)
-        elif op in ("lt", "gt", "le", "ge"):
+        elif op in ("lt", "gt", "le", "ge", "NULL_EQUALS"):
             out_dtype = self._binary_op_lt_gt_le_ge(rhs)
         elif op == "mul":
             out_dtype = self._binary_op_mul(rhs)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index ecdce9443a1..25f57748765 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6031,7 +6031,6 @@ def isin(self, values):
         falcon      True       True
         dog        False      False
         """
-
         if isinstance(values, dict):
 
             result_df = DataFrame()
@@ -6051,14 +6050,15 @@ def isin(self, values):
             values = values.reindex(self.index)
 
             result = DataFrame()
-
+            # TODO: propagate nulls through isin
+            # https://github.com/rapidsai/cudf/issues/7556
             for col in self._data.names:
                 if isinstance(
                     self[col]._column, cudf.core.column.CategoricalColumn
                 ) and isinstance(
                     values._column, cudf.core.column.CategoricalColumn
                 ):
-                    res = self._data[col] == values._column
+                    res = (self._data[col] == values._column).fillna(False)
                     result[col] = res
                 elif (
                     isinstance(
@@ -6073,7 +6073,9 @@ def isin(self, values):
                 ):
                     result[col] = utils.scalar_broadcast_to(False, len(self))
                 else:
-                    result[col] = self._data[col] == values._column
+                    result[col] = (self._data[col] == values._column).fillna(
+                        False
+                    )
 
             result.index = self.index
             return result
@@ -6083,7 +6085,9 @@ def isin(self, values):
             result = DataFrame()
             for col in self._data.names:
                 if col in values.columns:
-                    result[col] = self._data[col] == values[col]._column
+                    result[col] = (
+                        self._data[col] == values[col]._column
+                    ).fillna(False)
                 else:
                     result[col] = utils.scalar_broadcast_to(False, len(self))
             result.index = self.index
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 275d085ef5d..fab5936f94d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1566,10 +1566,7 @@ def _apply_boolean_mask(self, boolean_mask):
         rows corresponding to `False` is dropped
         """
         boolean_mask = as_column(boolean_mask)
-        if boolean_mask.has_nulls:
-            raise ValueError(
-                "cannot mask with boolean_mask containing null values"
-            )
+
         result = self.__class__._from_table(
             libcudf.stream_compaction.apply_boolean_mask(
                 self, as_column(boolean_mask)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 11e32e2285d..5e7121c0488 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3120,8 +3120,10 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
                 "bool_only parameter is not implemented yet"
             )
 
-        if self.empty:
-            return False
+        skipna = False if skipna is None else skipna
+
+        if skipna is False and self.has_nulls:
+            return True
 
         if skipna:
             result_series = self.nans_to_nulls()
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index a0b65743180..18f2d7e474b 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -206,12 +206,45 @@ def test_series_compare(cmpop, obj_class, dtype):
     np.testing.assert_equal(result3.to_array(), cmpop(arr1, arr2))
 
 
+def _series_compare_nulls_typegen():
+    tests = []
+    tests += list(product(DATETIME_TYPES, DATETIME_TYPES))
+    tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES))
+    tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES))
+    tests += list(product(STRING_TYPES, STRING_TYPES))
+
+    return tests
+
+
+@pytest.mark.parametrize("cmpop", _cmpops)
+@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen())
+def test_series_compare_nulls(cmpop, dtypes):
+    ltype, rtype = dtypes
+
+    ldata = [1, 2, None, None, 5]
+    rdata = [2, 1, None, 4, None]
+
+    lser = Series(ldata, dtype=ltype)
+    rser = Series(rdata, dtype=rtype)
+
+    lmask = ~lser.isnull()
+    rmask = ~rser.isnull()
+
+    expect_mask = np.logical_and(lmask, rmask)
+    expect = cudf.Series([None] * 5, dtype="bool")
+    expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask])
+
+    got = cmpop(lser, rser)
+    utils.assert_eq(expect, got)
+
+
 @pytest.mark.parametrize(
-    "obj", [pd.Series(["a", "b", None, "d", "e", None]), "a"]
+    "obj", [pd.Series(["a", "b", None, "d", "e", None], dtype="string"), "a"]
 )
 @pytest.mark.parametrize("cmpop", _cmpops)
 @pytest.mark.parametrize(
-    "cmp_obj", [pd.Series(["b", "a", None, "d", "f", None]), "a"]
+    "cmp_obj",
+    [pd.Series(["b", "a", None, "d", "f", None], dtype="string"), "a"],
 )
 def test_string_series_compare(obj, cmpop, cmp_obj):
 
@@ -221,10 +254,12 @@ def test_string_series_compare(obj, cmpop, cmp_obj):
     g_cmp_obj = cmp_obj
     if isinstance(g_cmp_obj, pd.Series):
         g_cmp_obj = Series.from_pandas(g_cmp_obj)
-
     got = cmpop(g_obj, g_cmp_obj)
     expected = cmpop(obj, cmp_obj)
 
+    if isinstance(expected, pd.Series):
+        expected = cudf.from_pandas(expected)
+
     utils.assert_eq(expected, got)
 
 
@@ -694,10 +729,12 @@ def test_operator_func_series_and_scalar(
 def test_operator_func_between_series_logical(
     dtype, func, scalar_a, scalar_b, fill_value
 ):
-    gdf_series_a = Series([scalar_a]).astype(dtype)
-    gdf_series_b = Series([scalar_b]).astype(dtype)
-    pdf_series_a = gdf_series_a.to_pandas()
-    pdf_series_b = gdf_series_b.to_pandas()
+
+    gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype)
+    gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype)
+
+    pdf_series_a = gdf_series_a.to_pandas(nullable=True)
+    pdf_series_b = gdf_series_b.to_pandas(nullable=True)
 
     gdf_series_result = getattr(gdf_series_a, func)(
         gdf_series_b, fill_value=fill_value
@@ -705,16 +742,22 @@ def test_operator_func_between_series_logical(
     pdf_series_result = getattr(pdf_series_a, func)(
         pdf_series_b, fill_value=fill_value
     )
-
-    if scalar_a in [None, np.nan] and scalar_b in [None, np.nan]:
-        # cudf binary operations will return `None` when both left- and right-
-        # side values are `None`. It will return `np.nan` when either side is
-        # `np.nan`. As a consequence, when we convert our gdf => pdf during
-        # assert_eq, we get a pdf with dtype='object' (all inputs are none).
-        # to account for this, we use fillna.
-        gdf_series_result.fillna(func == "ne", inplace=True)
-
-    utils.assert_eq(pdf_series_result, gdf_series_result)
+    expect = pdf_series_result
+    got = gdf_series_result.to_pandas(nullable=True)
+
+    # If fill_value is np.nan, things break down a bit,
+    # because setting a NaN into a pandas nullable float
+    # array still gets transformed to <NA>. As such,
+    # pd_series_with_nulls.fillna(np.nan) has no effect.
+    if (
+        (pdf_series_a.isnull().sum() != pdf_series_b.isnull().sum())
+        and np.isscalar(fill_value)
+        and np.isnan(fill_value)
+    ):
+        with pytest.raises(AssertionError):
+            utils.assert_eq(expect, got)
+        return
+    utils.assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("dtype", ["float32", "float64"])
@@ -729,8 +772,7 @@ def test_operator_func_series_and_scalar_logical(
     gdf_series = utils.gen_rand_series(
         dtype, 1000, has_nulls=has_nulls, stride=10000
     )
-    pdf_series = gdf_series.to_pandas()
-
+    pdf_series = gdf_series.to_pandas(nullable=True)
     gdf_series_result = getattr(gdf_series, func)(
         cudf.Scalar(scalar) if use_cudf_scalar else scalar,
         fill_value=fill_value,
@@ -739,7 +781,10 @@ def test_operator_func_series_and_scalar_logical(
         scalar, fill_value=fill_value
     )
 
-    utils.assert_eq(pdf_series_result, gdf_series_result)
+    expect = pdf_series_result
+    got = gdf_series_result.to_pandas(nullable=True)
+
+    utils.assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("func", _operators_arithmetic)
@@ -1738,10 +1783,61 @@ def test_equality_ops_index_mismatch(fn):
         index=["aa", "b", "c", "d", "e", "f", "y", "z"],
     )
 
-    pa = a.to_pandas()
-    pb = b.to_pandas()
-
+    pa = a.to_pandas(nullable=True)
+    pb = b.to_pandas(nullable=True)
     expected = getattr(pa, fn)(pb)
-    actual = getattr(a, fn)(b)
+    actual = getattr(a, fn)(b).to_pandas(nullable=True)
 
     utils.assert_eq(expected, actual)
+
+
+def generate_test_null_equals_columnops_data():
+    # Generate tuples of:
+    # (left_data, right_data, compare_bool, case)
+    # where compare_bool is the correct answer to
+    # whether the columns should compare as null equals
+
+    def set_null_cases(column_l, column_r, case):
+        if case == "neither":
+            return column_l, column_r
+        elif case == "left":
+            column_l[1] = None
+        elif case == "right":
+            column_r[1] = None
+        elif case == "both":
+            column_l[1] = None
+            column_r[1] = None
+        else:
+            raise ValueError("Unknown null case")
+        return column_l, column_r
+
+    null_cases = ["neither", "left", "right", "both"]
+    data = [1, 2, 3]
+
+    results = []
+    # TODO: Numeric types can be cross compared as null equal
+    for dtype in (
+        list(NUMERIC_TYPES)
+        + list(DATETIME_TYPES)
+        + list(TIMEDELTA_TYPES)
+        + list(STRING_TYPES)
+        + ["category"]
+    ):
+        for case in null_cases:
+            left = cudf.Series(data, dtype=dtype)
+            right = cudf.Series(data, dtype=dtype)
+            if case in {"left", "right"}:
+                answer = False
+            else:
+                answer = True
+            left, right = set_null_cases(left, right, case)
+            results.append((left._column, right._column, answer, case))
+
+    return results
+
+
+@pytest.mark.parametrize(
+    "lcol,rcol,ans,case", generate_test_null_equals_columnops_data()
+)
+def test_null_equals_columnops(lcol, rcol, ans, case):
+    assert lcol._null_equals(rcol).all() == ans
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index ffd66e18314..77548b95277 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5017,12 +5017,14 @@ def test_cov_nans():
         operator.truediv,
         operator.mod,
         operator.pow,
-        operator.eq,
-        operator.lt,
-        operator.le,
-        operator.gt,
-        operator.ge,
-        operator.ne,
+        # comparison ops will temporarily XFAIL
+        # see PR  https://github.com/rapidsai/cudf/pull/7491
+        pytest.param(operator.eq, marks=pytest.mark.xfail()),
+        pytest.param(operator.lt, marks=pytest.mark.xfail()),
+        pytest.param(operator.le, marks=pytest.mark.xfail()),
+        pytest.param(operator.gt, marks=pytest.mark.xfail()),
+        pytest.param(operator.ge, marks=pytest.mark.xfail()),
+        pytest.param(operator.ne, marks=pytest.mark.xfail()),
     ],
 )
 def test_df_sr_binop(gsr, colnames, op):
@@ -5052,12 +5054,14 @@ def test_df_sr_binop(gsr, colnames, op):
         operator.truediv,
         operator.mod,
         operator.pow,
-        operator.eq,
-        operator.lt,
-        operator.le,
-        operator.gt,
-        operator.ge,
-        operator.ne,
+        # comparison ops will temporarily XFAIL
+        # see PR  https://github.com/rapidsai/cudf/pull/7491
+        pytest.param(operator.eq, marks=pytest.mark.xfail()),
+        pytest.param(operator.lt, marks=pytest.mark.xfail()),
+        pytest.param(operator.le, marks=pytest.mark.xfail()),
+        pytest.param(operator.gt, marks=pytest.mark.xfail()),
+        pytest.param(operator.ge, marks=pytest.mark.xfail()),
+        pytest.param(operator.ne, marks=pytest.mark.xfail()),
     ],
 )
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 558700f1f89..cec2623027f 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -755,17 +755,6 @@ def do_slice(x):
     assert_eq(expect, got, check_dtype=False)
 
 
-def test_dataframe_boolean_mask_with_None():
-    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
-    gdf = cudf.DataFrame.from_pandas(pdf)
-    pdf_masked = pdf[[True, False, True, False]]
-    gdf_masked = gdf[[True, False, True, False]]
-    assert_eq(pdf_masked, gdf_masked)
-
-    with pytest.raises(ValueError):
-        gdf[cudf.Series([True, False, None, False])]
-
-
 @pytest.mark.parametrize("dtype", [int, float, str])
 def test_empty_boolean_mask(dtype):
     gdf = cudf.datasets.randomdata(nrows=0, dtypes={"a": dtype})
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 4d2e2a4b33b..1005efec3ee 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -143,15 +143,14 @@ def test_setitem_dataframe_series_inplace(df):
 )
 def test_series_set_equal_length_object_by_mask(replace_data):
 
-    psr = pd.Series([1, 2, 3, 4, 5])
+    psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
     gsr = cudf.from_pandas(psr)
 
     # Lengths match in trivial case
-    pd_bool_col = pd.Series([True] * len(psr))
+    pd_bool_col = pd.Series([True] * len(psr), dtype="boolean")
     gd_bool_col = cudf.from_pandas(pd_bool_col)
-
     psr[pd_bool_col] = (
-        replace_data.to_pandas()
+        replace_data.to_pandas(nullable=True)
         if hasattr(replace_data, "to_pandas")
         else replace_data
     )

From 2f5901ffb49eed3216d82d793e5a366a5e021d72 Mon Sep 17 00:00:00 2001
From: Keith Kraus <kkraus@nvidia.com>
Date: Tue, 16 Mar 2021 00:29:31 -0400
Subject: [PATCH 32/33] Fix 0.18 --> 0.19 automerge (#7589)

Closes #7586

Brings the hotfix from #7568 into branch-0.19.

Authors:
  - Keith Kraus (@kkraus14)
  - Ray Douglass (@raydouglass)
  - MithunR (@mythrocks)

Approvers:
  - Nghia Truong (@ttnghia)

URL: https://github.com/rapidsai/cudf/pull/7589
---
 cpp/src/rolling/grouped_rolling.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 135df6bdfe2..c1ebc9f3f9f 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -384,7 +384,7 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column,
 
   if (timestamp_column.has_nulls()) {
     auto p_timestamps_device_view = column_device_view::create(timestamp_column);
-    auto num_groups               = group_offsets.size();
+    auto num_groups               = group_offsets.size() - 1;
 
     // Null timestamps exist. Find null bounds, per group.
     thrust::for_each(

From c1c60ba3daf36d0ee5553558f70669f454d9f0c8 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 16 Mar 2021 14:27:55 -0500
Subject: [PATCH 33/33] Fix specifying GPU architecture in JNI build (#7612)

After #7593 the variables for controlling the CUDA build for either all architectures or detected architectures changed to be based on the project name which broke the JNI build.  This updates the JNI CMakeList accordingly to fix the JNI build.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Rong Ou (@rongou)
  - Gera Shegalov (@gerashegalov)
  - Thomas Graves (@tgravescs)

URL: https://github.com/rapidsai/cudf/pull/7612
---
 java/src/main/native/CMakeLists.txt | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index c1239fe69ea..ceafc75f840 100755
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -17,10 +17,7 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 # Use GPU_ARCHS if it is defined
 if(DEFINED GPU_ARCHS)
-  unset(CMAKE_CUDA_ARCHITECTURES CACHE)
-  if(NOT "${GPU_ARCHS}" STREQUAL "ALL")
-    set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}")
-  endif()
+  set(CMAKE_CUDA_ARCHITECTURES "${GPU_ARCHS}")
 endif()
 
 # If `CMAKE_CUDA_ARCHITECTURES` is not defined, build for all supported architectures. If
@@ -29,11 +26,10 @@ endif()
 
 # This needs to be run before enabling the CUDA language due to the default initialization behavior
 # of `CMAKE_CUDA_ARCHITECTURES`, https://gitlab.kitware.com/cmake/cmake/-/issues/21302
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  set(CUDF_BUILD_FOR_ALL_ARCHS TRUE)
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "ALL")
+  set(CUDF_JNI_BUILD_FOR_ALL_ARCHS TRUE)
 elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "")
-  unset(CMAKE_CUDA_ARCHITECTURES CACHE)
-  set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE)
+  set(CUDF_JNI_BUILD_FOR_DETECTED_ARCHS TRUE)
 endif()
 
 project(CUDF_JNI VERSION 0.19 LANGUAGES C CXX)