From ee78a916bd2410abeda8fdbad496840ba56b0fdb Mon Sep 17 00:00:00 2001
From: jeanp413 <jeanp413@hotmail.com>
Date: Fri, 14 Feb 2020 10:00:30 -0500
Subject: [PATCH 01/79] Make fill/copy_range no-op on empty columns

---
 cpp/src/column/column_view.cpp         |  3 +--
 cpp/src/copying/copy_range.cu          | 28 +++++++++-----------------
 cpp/src/filling/fill.cu                | 10 ++++-----
 cpp/tests/copying/copy_range_tests.cpp | 17 ++++++++++++++--
 cpp/tests/filling/fill_tests.cu        | 18 +++++++++++++----
 5 files changed, 43 insertions(+), 33 deletions(-)
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index b7cad8b5013..3aff211dbe6 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -67,8 +67,7 @@ size_type column_view_base::null_count() const {
 }
 
 size_type column_view_base::null_count(size_type begin, size_type end) const {
-  CUDF_EXPECTS((begin <= end) && (begin >= 0) && (begin <  size()) &&
-                 (end <= size()),
+  CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end),
                "Range is out of bounds.");
   return (null_count() == 0) ?
     0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end);
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index b87ca06880a..d8f4ce6f79a 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -162,16 +162,11 @@ void copy_range(column_view const& source, mutable_column_view& target,
                 cudaStream_t stream) {
   CUDF_EXPECTS(cudf::is_fixed_width(target.type()) == true,
                "In-place copy_range does not support variable-sized types.");
-  CUDF_EXPECTS((source_begin <= source_end) &&
-                 (source_begin >= 0) &&
-                 (source_begin < source.size()) &&
-                 (source_end <= source.size()) &&
-                 (target_begin >= 0) &&
-                 (target_begin < target.size()) &&
-                 (target_begin + (source_end - source_begin) <=
-                   target.size()) &&
-                 // overflow
-                 (target_begin + (source_end - source_begin) >= target_begin),
+  CUDF_EXPECTS((source_begin >= 0) &&
+               (source_end <= source.size()) &&
+               (source_begin <= source_end) &&                 
+               (target_begin >= 0) &&
+               (target_begin <= target.size() - (source_end - source_begin)),
                "Range is out of bounds.");
   CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.");
   CUDF_EXPECTS((target.nullable() == true) || (source.has_nulls() == false),
@@ -192,15 +187,10 @@ std::unique_ptr<column> copy_range(column_view const& source,
                                    rmm::mr::device_memory_resource* mr,
                                    cudaStream_t stream) {
   CUDF_EXPECTS((source_begin >= 0) &&
-                 (source_begin <= source_end) &&
-                 (source_begin < source.size()) &&
-                 (source_end <= source.size()) &&
-                 (target_begin >= 0) &&
-                 (target_begin < target.size()) &&
-                 (target_begin + (source_end - source_begin) <=
-                   target.size()) &&
-                 // overflow
-                 (target_begin + (source_end - source_begin) >= target_begin),
+               (source_end <= source.size()) &&
+               (source_begin <= source_end) &&
+               (target_begin >= 0) &&
+               (target_begin <= target.size() - (source_end - source_begin)),
                "Range is out of bounds.");
   CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.");
 
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index bf4d428db11..03654058a42 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -131,9 +131,8 @@ void fill_in_place(mutable_column_view& destination,
   CUDF_EXPECTS(cudf::is_fixed_width(destination.type()) == true,
                "In-place fill does not support variable-sized types.");
   CUDF_EXPECTS((begin >= 0) &&
-               (begin <= end) &&
-               (begin < destination.size()) &&
-               (end <= destination.size()),
+               (end <= destination.size()) &&
+               (begin <= end),
                "Range is out of bounds.");
   CUDF_EXPECTS((destination.nullable() == true) || (value.is_valid() == true),
                "destination should be nullable or value should be non-null.");
@@ -156,9 +155,8 @@ std::unique_ptr<column> fill(column_view const& input,
                              rmm::mr::device_memory_resource* mr,
                              cudaStream_t stream) {
   CUDF_EXPECTS((begin >= 0) &&
-               (begin <= end) &&
-               (begin < input.size()) &&
-               (end <= input.size()),
+               (end <= input.size()) &&
+                (begin <= end),
                "Range is out of bounds.");
   CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
 
diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index b4e22ba00e8..53ca43ea8c1 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -422,6 +422,7 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange)
     thrust::make_counting_iterator(0) + size);
 
   cudf::mutable_column_view target_view{target};
+  cudf::column_view source_view{source};
 
   // empty_range == no-op, this is valid
   EXPECT_NO_THROW(cudf::experimental::copy_range(
@@ -447,10 +448,10 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange)
 
   // source_begin >= source.size()
   EXPECT_THROW(cudf::experimental::copy_range(
-                 source, target_view, 100, 100, 0),
+                 source, target_view, 101, 100, 0),
                cudf::logic_error);
   EXPECT_THROW(auto p_ret = cudf::experimental::copy_range(
-                 source, target, 100, 100, 0),
+                 source, target, 101, 100, 0),
                cudf::logic_error);
 
   // source_end > source.size()
@@ -484,6 +485,18 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange)
   EXPECT_THROW(auto p_ret = cudf::experimental::copy_range(
                  source, target, 50, 100, 80),
                cudf::logic_error);
+
+  // Empty column
+  target = cudf::test::fixed_width_column_wrapper<int32_t>{};
+  source = cudf::test::fixed_width_column_wrapper<int32_t>{};
+  target_view = target;
+  source_view = source;
+
+  // empty column == no-op, this is valid
+  EXPECT_NO_THROW(cudf::experimental::copy_range(
+                    source_view, target_view, 0, source_view.size(), 0));
+  EXPECT_NO_THROW(auto p_ret = cudf::experimental::copy_range(
+                    source_view, target, 0, source_view.size(), 0));
 }
 
 TEST_F(CopyRangeErrorTestFixture, DTypeMismatch)
diff --git a/cpp/tests/filling/fill_tests.cu b/cpp/tests/filling/fill_tests.cu
index 2f5a2621850..f059524e5ea 100644
--- a/cpp/tests/filling/fill_tests.cu
+++ b/cpp/tests/filling/fill_tests.cu
@@ -352,10 +352,10 @@ TEST_F(FillErrorTestFixture, InvalidRange)
                  *p_val),
                cudf::logic_error);
 
-  // out_begin >= destination.size()
-  EXPECT_THROW(cudf::experimental::fill_in_place(destination_view, 100, 100, *p_val),
-               cudf::logic_error);
-  EXPECT_THROW(auto p_ret = cudf::experimental::fill(destination, 100, 100,
+  // out_begin > destination.size()
+  EXPECT_THROW(cudf::experimental::fill_in_place(destination_view, 101, 100, *p_val),
+                  cudf::logic_error);
+  EXPECT_THROW(auto p_ret = cudf::experimental::fill(destination, 101, 100,
                  *p_val),
                cudf::logic_error);
 
@@ -365,6 +365,16 @@ TEST_F(FillErrorTestFixture, InvalidRange)
   EXPECT_THROW(auto p_ret = cudf::experimental::fill(destination, 99, 101,
                  *p_val),
                cudf::logic_error);
+
+  // Empty Column
+  destination = cudf::test::fixed_width_column_wrapper<int32_t>{};
+  destination_view = destination;
+
+  // empty column, this is valid
+  EXPECT_NO_THROW(cudf::experimental::fill_in_place(destination_view, 0,
+                    destination_view.size(), *p_val));
+  EXPECT_NO_THROW(auto p_ret = cudf::experimental::fill(destination, 0,
+                    destination_view.size(), *p_val));
 }
 
 TEST_F(FillErrorTestFixture, DTypeMismatch)

From 557e8cc5654cdb30a1d1e991b0514efd12be6ed8 Mon Sep 17 00:00:00 2001
From: jeanp413 <jeanp413@hotmail.com>
Date: Fri, 14 Feb 2020 10:05:34 -0500
Subject: [PATCH 02/79] :lipstick: Fix indentation of range check in fill()

---
 cpp/src/filling/fill.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index 03654058a42..8d99d063fe1 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -156,7 +156,7 @@ std::unique_ptr<column> fill(column_view const& input,
                              cudaStream_t stream) {
   CUDF_EXPECTS((begin >= 0) &&
                (end <= input.size()) &&
-                (begin <= end),
+               (begin <= end),
                "Range is out of bounds.");
   CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
 

From 1f48da7829079706eb95a4cb64906684f8b4305f Mon Sep 17 00:00:00 2001
From: jeanp413 <jeanp413@hotmail.com>
Date: Fri, 14 Feb 2020 10:08:55 -0500
Subject: [PATCH 03/79] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 340abb10610..121b4b8aef7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -101,6 +101,7 @@
 - PR #4125 Fix type enum to account for added Dictionary type in `types.hpp`
 - PR #4137 Update Java for mutating fill and rolling window changes
 - PR #4141 Fix NVStrings test_convert failure in 10.2 build
+- PR #4156 Make fill/copy_range no-op on empty columns
 
 
 # cuDF 0.12.0 (04 Feb 2020)

From bf18a0e2321b3317277b80a0a71c9850ba5e8569 Mon Sep 17 00:00:00 2001
From: Jean Pierre <jeanp413@hotmail.com>
Date: Sun, 16 Feb 2020 19:58:07 -0500
Subject: [PATCH 04/79] Fix copy_range_in_place InvalidRange unit test

---
 cpp/tests/copying/copy_range_tests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index 2c49dbc968c..225687296b3 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -448,7 +448,7 @@ TEST_F(CopyRangeErrorTestFixture, InvalidRange)
 
   // source_begin >= source.size()
   EXPECT_THROW(cudf::experimental::copy_range_in_place(
-                 source, target_view, 100, 100, 0),
+                 source, target_view, 101, 100, 0),
                cudf::logic_error);
   EXPECT_THROW(auto p_ret = cudf::experimental::copy_range(
                  source, target, 101, 100, 0),

From 55deaf8f3ec9784ec3868405c07dd63725a1a516 Mon Sep 17 00:00:00 2001
From: jeanp413 <jeanp413@hotmail.com>
Date: Mon, 2 Mar 2020 16:10:47 -0500
Subject: [PATCH 05/79] Update copy_range/fill docs for relaxed range checks

---
 cpp/include/cudf/copying.hpp | 10 ++++------
 cpp/include/cudf/filling.hpp |  6 ++----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index bcfb167f80b..2a71a3a1fb5 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -243,9 +243,8 @@ std::unique_ptr<table> empty_like(table_view const& input_table);
  * variable width types).
  * @throws `cudf::logic_error` for invalid range (if
  * @p source_begin > @p source_end, @p source_begin < 0,
- * @p source_begin >= @p source.size(), @p source_end > @p source.size(),
- * @p target_begin < 0, target_begin >= @p target.size(), or
- * @p target_begin + (@p source_end - @p source_begin) > @p target.size()).
+ * @p source_end > @p source.size(), @p target_begin < 0,
+ * or @p target_begin + (@p source_end - @p source_begin) > @p target.size()).
  * @throws `cudf::logic_error` if @p target and @p source have different types.
  * @throws `cudf::logic_error` if @p source has null values and @p target is not
  * nullable.
@@ -278,9 +277,8 @@ void copy_range_in_place(column_view const& source,
  *
  * @throws `cudf::logic_error` for invalid range (if
  * @p source_begin > @p source_end, @p source_begin < 0,
- * @p source_begin >= @p source.size(), @p source_end > @p source.size(),
- * @p target_begin < 0, target_begin >= @p target.size(), or
- * @p target_begin + (@p source_end - @p source_begin) > @p target.size()).
+ * @p source_end > @p source.size(), @p target_begin < 0,
+ * or @p target_begin + (@p source_end - @p source_begin) > @p target.size()).
  * @throws `cudf::logic_error` if @p target and @p source have different types.
  *
  * @param source The column to copy from inside the range.
diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp
index 0316a154835..f6d3c67dfb6 100644
--- a/cpp/include/cudf/filling.hpp
+++ b/cpp/include/cudf/filling.hpp
@@ -37,8 +37,7 @@ namespace experimental {
  * @throws `cudf::logic_error` if memory reallocation is required (e.g. for
  * variable width types).
  * @throws `cudf::logic_error` for invalid range (if @p begin < 0,
- * @p begin > @p end, @p begin >= @p destination.size(), or
- * @p end > @p destination.size()).
+ * @p begin > @p end, or @p end > @p destination.size()).
  * @throws `cudf::logic_error` if @p destination and @p value have different
  * types.
  * @throws `cudf::logic_error` if @p value is invalid but @p destination is not
@@ -62,8 +61,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin,
  * indicated by the indices [@p begin, @p end) were overwritten by @p value.
  *
  * @throws `cudf::logic_error` for invalid range (if @p begin < 0,
- * @p begin > @p end, @p begin >= @p destination.size(), or
- * @p end > @p destination.size()).
+ * @p begin > @p end, or @p end > @p destination.size()).
  * @throws `cudf::logic_error` if @p destination and @p value have different
  * types.
  *

From 8dbc1d51e81e2d8c94821bf957247817e99e0fb1 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 5 Mar 2020 00:57:07 +0530
Subject: [PATCH 06/79] Initial commit for binaryops cython port

---
 cpp/include/cudf/binaryop.hpp            |   2 +-
 python/cudf/cudf/_libxx/__init__.py      |   1 +
 python/cudf/cudf/_libxx/binaryop.pxd     |   6 ++
 python/cudf/cudf/_libxx/binaryop.pyx     | 117 +++++++++++++++++++++++
 python/cudf/cudf/_libxx/cpp/binaryop.pxd |  42 ++++++++
 5 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/_libxx/binaryop.pxd
 create mode 100644 python/cudf/cudf/_libxx/binaryop.pyx
 create mode 100644 python/cudf/cudf/_libxx/cpp/binaryop.pxd

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index f383b4f24ff..891e40f93a5 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -27,7 +27,7 @@ namespace experimental {
 /**
  * @brief Types of binary operations that can be performed on data.
  */
-enum class binary_operator {
+enum class binary_operator : int32_t {
   ADD,             ///< operator +
   SUB,             ///< operator -
   MUL,             ///< operator *
diff --git a/python/cudf/cudf/_libxx/__init__.py b/python/cudf/cudf/_libxx/__init__.py
index 0b3aa602333..cb16fc6d169 100644
--- a/python/cudf/cudf/_libxx/__init__.py
+++ b/python/cudf/cudf/_libxx/__init__.py
@@ -4,6 +4,7 @@
 
 from . import (
     avro,
+    binaryop,
     copying,
     dlpack,
     gpuarrow,
diff --git a/python/cudf/cudf/_libxx/binaryop.pxd b/python/cudf/cudf/_libxx/binaryop.pxd
new file mode 100644
index 00000000000..3fb36055465
--- /dev/null
+++ b/python/cudf/cudf/_libxx/binaryop.pxd
@@ -0,0 +1,6 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+from libc.stdint cimport int32_t
+
+
+ctypedef int32_t underlying_type_t_binary_operator
diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx
new file mode 100644
index 00000000000..8ba5ead41ad
--- /dev/null
+++ b/python/cudf/cudf/_libxx/binaryop.pyx
@@ -0,0 +1,117 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+import numpy as np
+from enum import IntEnum
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._libxx.binaryop cimport underlying_type_t_binary_operator
+from cudf._libxx.column cimport Column
+from cudf._libxx.move cimport move
+from cudf._libxx.types import np_to_cudf_types
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.types cimport (
+    data_type,
+    type_id,
+)
+
+cimport cudf._libxx.cpp.binaryop as cpp_binaryop
+
+
+class BinaryOperation(IntEnum):
+    ADD = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.ADD
+    )
+    SUB = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.SUB
+    )
+    MUL = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.MUL
+    )
+    DIV = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.DIV
+    )
+    TRUE_DIV = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.TRUE_DIV
+    )
+    FLOOR_DIV = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.FLOOR_DIV
+    )
+    MOD = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.MOD
+    )
+    PYMOD = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.PYMOD
+    )
+    POW = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.POW
+    )
+    EQUAL = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.EQUAL
+    )
+    NOT_EQUAL = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.NOT_EQUAL
+    )
+    LESS = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LESS
+    )
+    GREATER = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GREATER
+    )
+    LESS_EQUAL = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LESS_EQUAL
+    )
+    GREATER_EQUAL = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GREATER_EQUAL
+    )
+    BITWISE_AND = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_AND
+    )
+    BITWISE_OR = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_OR
+    )
+    BITWISE_XOR = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_XOR
+    )
+    LOGICAL_AND = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LOGICAL_AND
+    )
+    LOGICAL_OR = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LOGICAL_OR
+    )
+    COALESCE = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.COALESCE
+    )
+    GENERIC_BINARY = (
+        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GENERIC_BINARY
+    )
+
+
+def binaryop(Column lhs, Column rhs, object op, object dtype):
+    """
+    Dispatches a binary op call to the appropriate libcudf function:
+    """
+    cdef column_view c_lhs = lhs.view()
+    cdef column_view c_rhs = rhs.view()
+    cdef cpp_binaryop.binary_operator c_op = \
+        <cpp_binaryop.binary_operator>(
+            <underlying_type_t_binary_operator> op
+        )
+    cdef type_id tid = np_to_cudf_types[np.dtype(dtype)]
+    cdef data_type c_dtype = data_type(tid)
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_binaryop.binary_operation(
+                c_lhs,
+                c_rhs,
+                c_op,
+                c_dtype
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
new file mode 100644
index 00000000000..8eb3f65a313
--- /dev/null
+++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
@@ -0,0 +1,42 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.types cimport (
+    data_type
+)
+
+cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil:
+    ctypedef enum binary_operator:
+        ADD "cudf::experimental::binary_operator::ADD"
+        SUB "cudf::experimental::binary_operator::SUB"
+        MUL "cudf::experimental::binary_operator::MUL"
+        DIV "cudf::experimental::binary_operator::DIV"
+        TRUE_DIV "cudf::experimental::binary_operator::TRUE_DIV"
+        FLOOR_DIV "cudf::experimental::binary_operator::FLOOR_DIV"
+        MOD "cudf::experimental::binary_operator::MOD"
+        PYMOD "cudf::experimental::binary_operator::PYMOD"
+        POW "cudf::experimental::binary_operator::POW"
+        EQUAL "cudf::experimental::binary_operator::EQUAL"
+        NOT_EQUAL "cudf::experimental::binary_operator::NOT_EQUAL"
+        LESS "cudf::experimental::binary_operator::LESS"
+        GREATER "cudf::experimental::binary_operator::GREATER"
+        LESS_EQUAL "cudf::experimental::binary_operator::LESS_EQUAL"
+        GREATER_EQUAL "cudf::experimental::binary_operator::GREATER_EQUAL"
+        BITWISE_AND "cudf::experimental::binary_operator::BITWISE_AND"
+        BITWISE_OR "cudf::experimental::binary_operator::BITWISE_OR"
+        BITWISE_XOR "cudf::experimental::binary_operator::BITWISE_XOR"
+        LOGICAL_AND "cudf::experimental::binary_operator::LOGICAL_AND"
+        LOGICAL_OR "cudf::experimental::binary_operator::LOGICAL_OR"
+        COALESCE "cudf::experimental::binary_operator::COALESCE"
+        GENERIC_BINARY "cudf::experimental::binary_operator::GENERIC_BINARY"
+
+    cdef unique_ptr[column] binary_operation (
+        const column_view& lhs,
+        const column_view& rhs,
+        binary_operator op,
+        data_type output_type
+    ) except +
+    
\ No newline at end of file

From a0847eb3740aadf585ec41c575b2a6c5519e9af9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 6 Mar 2020 01:09:27 +0530
Subject: [PATCH 07/79] column-column binop with libxx working

---
 python/cudf/cudf/_libxx/binaryop.pyx      | 29 ++++++++++-------------
 python/cudf/cudf/core/column/numerical.py | 22 ++---------------
 2 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx
index 8ba5ead41ad..fe2ffeb2b23 100644
--- a/python/cudf/cudf/_libxx/binaryop.pyx
+++ b/python/cudf/cudf/_libxx/binaryop.pyx
@@ -33,52 +33,49 @@ class BinaryOperation(IntEnum):
     DIV = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.DIV
     )
-    TRUE_DIV = (
+    TRUEDIV = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.TRUE_DIV
     )
-    FLOOR_DIV = (
+    FLOORDIV = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.FLOOR_DIV
     )
     MOD = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.MOD
-    )
-    PYMOD = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.PYMOD
     )
     POW = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.POW
     )
-    EQUAL = (
+    EQ = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.EQUAL
     )
-    NOT_EQUAL = (
+    NE = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.NOT_EQUAL
     )
-    LESS = (
+    LT = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LESS
     )
-    GREATER = (
+    GT = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GREATER
     )
-    LESS_EQUAL = (
+    LE = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LESS_EQUAL
     )
-    GREATER_EQUAL = (
+    GE = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GREATER_EQUAL
     )
-    BITWISE_AND = (
+    AND = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_AND
     )
-    BITWISE_OR = (
+    OR = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_OR
     )
-    BITWISE_XOR = (
+    XOR = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_XOR
     )
-    LOGICAL_AND = (
+    L_AND = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LOGICAL_AND
     )
-    LOGICAL_OR = (
+    L_OR = (
         <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LOGICAL_OR
     )
     COALESCE = (
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b700200f67c..a1960a19d71 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -458,29 +458,11 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
     if reflect:
         lhs, rhs = rhs, lhs
     libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange")
-    # Allocate output
-    masked = False
-    if np.isscalar(lhs):
-        masked = rhs.nullable
-        row_count = len(rhs)
-    elif np.isscalar(rhs):
-        masked = lhs.nullable
-        row_count = len(lhs)
-    elif rhs is None:
-        masked = True
-        row_count = len(lhs)
-    elif lhs is None:
-        masked = True
-        row_count = len(rhs)
-    else:
-        masked = lhs.nullable or rhs.nullable
-        row_count = len(lhs)
 
     is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
 
-    out = column.column_empty(row_count, dtype=out_dtype, masked=masked)
-
-    _ = libcudf.binops.apply_op(lhs, rhs, out, op)
+    operator = libcudfxx.binaryop.BinaryOperation[op.upper()]
+    out = libcudfxx.binaryop.binaryop(lhs, rhs, operator, out_dtype)
 
     if is_op_comparison:
         out = out.fillna(op == "ne")

From 06b2609e7257c0a0c39eb0d74daeeb54cf1bbdd7 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 6 Mar 2020 04:19:53 +0530
Subject: [PATCH 08/79] Scalar binops now working with libxx

---
 python/cudf/cudf/_libxx/binaryop.pyx     | 129 +++++++++++++++++------
 python/cudf/cudf/_libxx/cpp/binaryop.pxd |  15 +++
 2 files changed, 114 insertions(+), 30 deletions(-)

diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx
index fe2ffeb2b23..4b9cc4d8afe 100644
--- a/python/cudf/cudf/_libxx/binaryop.pyx
+++ b/python/cudf/cudf/_libxx/binaryop.pyx
@@ -8,96 +8,110 @@ from libcpp.memory cimport unique_ptr
 from cudf._libxx.binaryop cimport underlying_type_t_binary_operator
 from cudf._libxx.column cimport Column
 from cudf._libxx.move cimport move
+from cudf._libxx.scalar cimport Scalar
 from cudf._libxx.types import np_to_cudf_types
 
 from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport scalar
 from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.types cimport (
     data_type,
     type_id,
 )
 
+from cudf._libxx.cpp.binaryop cimport binary_operator
 cimport cudf._libxx.cpp.binaryop as cpp_binaryop
 
 
 class BinaryOperation(IntEnum):
     ADD = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.ADD
+        <underlying_type_t_binary_operator> binary_operator.ADD
     )
     SUB = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.SUB
+        <underlying_type_t_binary_operator> binary_operator.SUB
     )
     MUL = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.MUL
+        <underlying_type_t_binary_operator> binary_operator.MUL
     )
     DIV = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.DIV
+        <underlying_type_t_binary_operator> binary_operator.DIV
     )
     TRUEDIV = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.TRUE_DIV
+        <underlying_type_t_binary_operator> binary_operator.TRUE_DIV
     )
     FLOORDIV = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.FLOOR_DIV
+        <underlying_type_t_binary_operator> binary_operator.FLOOR_DIV
     )
     MOD = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.PYMOD
+        <underlying_type_t_binary_operator> binary_operator.PYMOD
     )
     POW = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.POW
+        <underlying_type_t_binary_operator> binary_operator.POW
     )
     EQ = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.EQUAL
+        <underlying_type_t_binary_operator> binary_operator.EQUAL
     )
     NE = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.NOT_EQUAL
+        <underlying_type_t_binary_operator> binary_operator.NOT_EQUAL
     )
     LT = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LESS
+        <underlying_type_t_binary_operator> binary_operator.LESS
     )
     GT = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GREATER
+        <underlying_type_t_binary_operator> binary_operator.GREATER
     )
     LE = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LESS_EQUAL
+        <underlying_type_t_binary_operator> binary_operator.LESS_EQUAL
     )
     GE = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GREATER_EQUAL
+        <underlying_type_t_binary_operator> binary_operator.GREATER_EQUAL
     )
     AND = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_AND
+        <underlying_type_t_binary_operator> binary_operator.BITWISE_AND
     )
     OR = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_OR
+        <underlying_type_t_binary_operator> binary_operator.BITWISE_OR
     )
     XOR = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.BITWISE_XOR
+        <underlying_type_t_binary_operator> binary_operator.BITWISE_XOR
     )
     L_AND = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LOGICAL_AND
+        <underlying_type_t_binary_operator> binary_operator.LOGICAL_AND
     )
     L_OR = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.LOGICAL_OR
+        <underlying_type_t_binary_operator> binary_operator.LOGICAL_OR
     )
     COALESCE = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.COALESCE
+        <underlying_type_t_binary_operator> binary_operator.COALESCE
     )
     GENERIC_BINARY = (
-        <underlying_type_t_binary_operator> cpp_binaryop.binary_operator.GENERIC_BINARY
+        <underlying_type_t_binary_operator> binary_operator.GENERIC_BINARY
     )
 
 
-def binaryop(Column lhs, Column rhs, object op, object dtype):
-    """
-    Dispatches a binary op call to the appropriate libcudf function:
-    """
+cdef binaryop_v_v(Column lhs, Column rhs,
+                  binary_operator c_op, data_type c_dtype):
     cdef column_view c_lhs = lhs.view()
     cdef column_view c_rhs = rhs.view()
-    cdef cpp_binaryop.binary_operator c_op = \
-        <cpp_binaryop.binary_operator>(
-            <underlying_type_t_binary_operator> op
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_binaryop.binary_operation(
+                c_lhs,
+                c_rhs,
+                c_op,
+                c_dtype
+            )
         )
-    cdef type_id tid = np_to_cudf_types[np.dtype(dtype)]
-    cdef data_type c_dtype = data_type(tid)
+
+    return Column.from_unique_ptr(move(c_result))
+
+cdef binaryop_v_s(Column lhs, Scalar rhs,
+                  binary_operator c_op, data_type c_dtype):
+    cdef column_view c_lhs = lhs.view()
+    cdef scalar* c_rhs = rhs.c_value.get()
 
     cdef unique_ptr[column] c_result
 
@@ -105,6 +119,25 @@ def binaryop(Column lhs, Column rhs, object op, object dtype):
         c_result = move(
             cpp_binaryop.binary_operation(
                 c_lhs,
+                c_rhs[0],
+                c_op,
+                c_dtype
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+cdef binaryop_s_v(Scalar lhs, Column rhs,
+                  binary_operator c_op, data_type c_dtype):
+    cdef scalar* c_lhs = lhs.c_value.get()
+    cdef column_view c_rhs = rhs.view()
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_binaryop.binary_operation(
+                c_lhs[0],
                 c_rhs,
                 c_op,
                 c_dtype
@@ -112,3 +145,39 @@ def binaryop(Column lhs, Column rhs, object op, object dtype):
         )
 
     return Column.from_unique_ptr(move(c_result))
+
+def binaryop(lhs, rhs, op, dtype):
+    """
+    Dispatches a binary op call to the appropriate libcudf function:
+    """
+    cdef binary_operator c_op = <binary_operator> (
+        <underlying_type_t_binary_operator> op
+    )
+    cdef type_id tid = np_to_cudf_types[np.dtype(dtype)]
+    cdef data_type c_dtype = data_type(tid)
+
+    if np.isscalar(lhs) or lhs is None:
+        s_lhs = Scalar(lhs, dtype=rhs.dtype if lhs is None else None)
+        return binaryop_s_v(
+                s_lhs,
+                rhs,
+                c_op,
+                c_dtype
+            )
+        
+    elif np.isscalar(rhs) or rhs is None:
+        s_rhs = Scalar(rhs, dtype=lhs.dtype if rhs is None else None)
+        return binaryop_v_s(
+                lhs,
+                s_rhs,
+                c_op,
+                c_dtype
+            )
+        
+    else:
+        return binaryop_v_v(
+                lhs,
+                rhs,
+                c_op,
+                c_dtype
+            )
diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
index 8eb3f65a313..93e95bf6aba 100644
--- a/python/cudf/cudf/_libxx/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
@@ -3,6 +3,7 @@
 from libcpp.memory cimport unique_ptr
 
 from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport scalar
 from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.types cimport (
     data_type
@@ -33,6 +34,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil:
         COALESCE "cudf::experimental::binary_operator::COALESCE"
         GENERIC_BINARY "cudf::experimental::binary_operator::GENERIC_BINARY"
 
+    cdef unique_ptr[column] binary_operation (
+        const scalar& lhs,
+        const column_view& rhs,
+        binary_operator op,
+        data_type output_type
+    ) except +
+    
+    cdef unique_ptr[column] binary_operation (
+        const column_view& lhs,
+        const scalar& rhs,
+        binary_operator op,
+        data_type output_type
+    ) except +
+    
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const column_view& rhs,

From 79abc0c880d6ec6f3a45bc6d020ad8557ea24721 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 6 Mar 2020 05:03:12 +0530
Subject: [PATCH 09/79] UDF binaryop working with libxx

---
 python/cudf/cudf/_libxx/binaryop.pyx      | 30 +++++++++++++++++++++++
 python/cudf/cudf/_libxx/cpp/binaryop.pxd  | 12 +++++++--
 python/cudf/cudf/tests/test_udf_binops.py |  4 +--
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx
index 4b9cc4d8afe..9a7b8b9128d 100644
--- a/python/cudf/cudf/_libxx/binaryop.pyx
+++ b/python/cudf/cudf/_libxx/binaryop.pyx
@@ -4,6 +4,7 @@ import numpy as np
 from enum import IntEnum
 
 from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
 
 from cudf._libxx.binaryop cimport underlying_type_t_binary_operator
 from cudf._libxx.column cimport Column
@@ -181,3 +182,32 @@ def binaryop(lhs, rhs, op, dtype):
                 c_op,
                 c_dtype
             )
+
+def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
+    """
+    Apply a user-defined binary operator (a UDF) defined in `udf_ptx` on
+    the two input columns `lhs` and `rhs`. The output type of the UDF
+    has to be specified in `dtype`, a numpy data type.
+    Currently ONLY int32, int64, float32 and float64 are supported.
+    """
+    cdef column_view c_lhs = lhs.view()
+    cdef column_view c_rhs = rhs.view()
+
+    cdef type_id tid = np_to_cudf_types[np.dtype(dtype)]
+    cdef data_type c_dtype = data_type(tid)
+
+    cdef string cpp_str = udf_ptx.encode("UTF-8")
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_binaryop.binary_operation(
+                c_lhs,
+                c_rhs,
+                cpp_str,
+                c_dtype
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
index 93e95bf6aba..37111fbacb7 100644
--- a/python/cudf/cudf/_libxx/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
 
 from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.scalar.scalar cimport scalar
@@ -40,18 +41,25 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil:
         binary_operator op,
         data_type output_type
     ) except +
-    
+
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const scalar& rhs,
         binary_operator op,
         data_type output_type
     ) except +
-    
+
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const column_view& rhs,
         binary_operator op,
         data_type output_type
     ) except +
+
+    cdef unique_ptr[column] binary_operation (
+        const column_view& lhs,
+        const column_view& rhs,
+        const string& op,
+        data_type output_type
+    ) except +
     
\ No newline at end of file
diff --git a/python/cudf/cudf/tests/test_udf_binops.py b/python/cudf/cudf/tests/test_udf_binops.py
index 4693d61fbbd..d377ef332cd 100644
--- a/python/cudf/cudf/tests/test_udf_binops.py
+++ b/python/cudf/cudf/tests/test_udf_binops.py
@@ -7,7 +7,7 @@
 import pytest
 from packaging.version import Version
 
-import cudf._lib as libcudf
+import cudf._libxx as libcudfxx
 from cudf.core import Series
 
 supported_types = ["int16", "int32", "int64", "float32", "float64"]
@@ -41,7 +41,7 @@ def generic_function(a, b):
 
     output_type = numba.numpy_support.as_dtype(result.signature.return_type)
 
-    out_col = libcudf.binops.apply_op_udf(
+    out_col = libcudfxx.binaryop.binaryop_udf(
         lhs_col, rhs_col, ptx_code, output_type.type
     )
 

From 72191c2c0b42cc5dedb295aad745016b33102e47 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Sat, 7 Mar 2020 01:22:05 +0530
Subject: [PATCH 10/79] style fixes

---
 python/cudf/cudf/_libxx/binaryop.pyx     | 38 +++++++++++++-----------
 python/cudf/cudf/_libxx/cpp/binaryop.pxd |  1 -
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx
index 9a7b8b9128d..d99a13f4fd1 100644
--- a/python/cudf/cudf/_libxx/binaryop.pyx
+++ b/python/cudf/cudf/_libxx/binaryop.pyx
@@ -109,6 +109,7 @@ cdef binaryop_v_v(Column lhs, Column rhs,
 
     return Column.from_unique_ptr(move(c_result))
 
+
 cdef binaryop_v_s(Column lhs, Scalar rhs,
                   binary_operator c_op, data_type c_dtype):
     cdef column_view c_lhs = lhs.view()
@@ -128,6 +129,7 @@ cdef binaryop_v_s(Column lhs, Scalar rhs,
 
     return Column.from_unique_ptr(move(c_result))
 
+
 cdef binaryop_s_v(Scalar lhs, Column rhs,
                   binary_operator c_op, data_type c_dtype):
     cdef scalar* c_lhs = lhs.c_value.get()
@@ -147,6 +149,7 @@ cdef binaryop_s_v(Scalar lhs, Column rhs,
 
     return Column.from_unique_ptr(move(c_result))
 
+
 def binaryop(lhs, rhs, op, dtype):
     """
     Dispatches a binary op call to the appropriate libcudf function:
@@ -160,28 +163,29 @@ def binaryop(lhs, rhs, op, dtype):
     if np.isscalar(lhs) or lhs is None:
         s_lhs = Scalar(lhs, dtype=rhs.dtype if lhs is None else None)
         return binaryop_s_v(
-                s_lhs,
-                rhs,
-                c_op,
-                c_dtype
-            )
-        
+            s_lhs,
+            rhs,
+            c_op,
+            c_dtype
+        )
+
     elif np.isscalar(rhs) or rhs is None:
         s_rhs = Scalar(rhs, dtype=lhs.dtype if rhs is None else None)
         return binaryop_v_s(
-                lhs,
-                s_rhs,
-                c_op,
-                c_dtype
-            )
-        
+            lhs,
+            s_rhs,
+            c_op,
+            c_dtype
+        )
+
     else:
         return binaryop_v_v(
-                lhs,
-                rhs,
-                c_op,
-                c_dtype
-            )
+            lhs,
+            rhs,
+            c_op,
+            c_dtype
+        )
+
 
 def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
     """
diff --git a/python/cudf/cudf/_libxx/cpp/binaryop.pxd b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
index 37111fbacb7..07481ab2bca 100644
--- a/python/cudf/cudf/_libxx/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_libxx/cpp/binaryop.pxd
@@ -62,4 +62,3 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::experimental" nogil:
         const string& op,
         data_type output_type
     ) except +
-    
\ No newline at end of file

From e0c394273c355f3b706c9aaca6c0e554906ec379 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 10 Mar 2020 23:12:44 +0530
Subject: [PATCH 11/79] Fix issue with MIN/MAX on strings when values have nulls

---
 cpp/include/cudf/detail/replace.hpp      |  2 +-
 cpp/src/groupby/hash/groupby.cu          | 20 ++++++++++++++++----
 cpp/tests/groupby/sort/group_max_test.cu | 14 ++++++++++++++
 cpp/tests/groupby/sort/group_min_test.cu | 14 ++++++++++++++
 4 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp
index 1818526ff53..413eb1f90ef 100644
--- a/cpp/include/cudf/detail/replace.hpp
+++ b/cpp/include/cudf/detail/replace.hpp
@@ -57,7 +57,7 @@ std::unique_ptr<column> replace_nulls(column_view const& input,
   * @returns Copy of `input` with null values replaced by `replacement`.
   */
 std::unique_ptr<column> replace_nulls(column_view const& input,
-                                            scalar const* replacement,
+                                            scalar const& replacement,
                                             rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(),
                                             cudaStream_t stream = 0);
 
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index cf281bc76d7..06a206f79b7 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -23,10 +23,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/detail/groupby.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/gather.hpp>
+#include <cudf/detail/replace.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -166,10 +168,20 @@ void sparse_to_dense_results(
     [&col, to_dense_agg_result, mr, stream]
     (auto const& agg_kind) {
       auto tranformed_agg = std::make_unique<aggregation>(agg_kind);
-      auto argmax_result = to_dense_agg_result(tranformed_agg);
-      auto transformed_result = experimental::detail::gather(
-        table_view({col}), *argmax_result, false, false, false, mr, stream);
-      return std::move(transformed_result->release()[0]);
+      auto arg_result = to_dense_agg_result(tranformed_agg);
+      if (arg_result->has_nulls()) {
+        auto replacement = numeric_scalar<size_type>(-1, true, stream);
+        auto null_replaced_map = cudf::detail::replace_nulls(
+          *arg_result, replacement, rmm::mr::get_default_resource(), stream);
+        auto transformed_result = experimental::detail::gather(
+          table_view({col}), *null_replaced_map, false, true, false, mr, stream);
+        return std::move(transformed_result->release()[0]);
+      }
+      else {
+        auto transformed_result = experimental::detail::gather(
+          table_view({col}), *arg_result, false, false, false, mr, stream);
+        return std::move(transformed_result->release()[0]);
+      }
     };
 
     for (auto &&agg : agg_v) {
diff --git a/cpp/tests/groupby/sort/group_max_test.cu b/cpp/tests/groupby/sort/group_max_test.cu
index 77937050422..b540d00bdea 100644
--- a/cpp/tests/groupby/sort/group_max_test.cu
+++ b/cpp/tests/groupby/sort/group_max_test.cu
@@ -133,5 +133,19 @@ TEST_F(groupby_max_string_test, basic)
     test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
+TEST_F(groupby_max_string_test, zero_valid_values)
+{
+    using K = int32_t;
+
+    fixed_width_column_wrapper<K> keys        { 1, 1, 1};
+    strings_column_wrapper        vals      ( { "año", "bit", "₹1"}, all_null() );
+
+    fixed_width_column_wrapper<K> expect_keys { 1 };
+    strings_column_wrapper        expect_vals({ "" }, all_null());
+
+    auto agg = cudf::experimental::make_max_aggregation();
+    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
 } // namespace test
 } // namespace cudf
diff --git a/cpp/tests/groupby/sort/group_min_test.cu b/cpp/tests/groupby/sort/group_min_test.cu
index abe5f6f006f..c2ae89cb971 100644
--- a/cpp/tests/groupby/sort/group_min_test.cu
+++ b/cpp/tests/groupby/sort/group_min_test.cu
@@ -133,5 +133,19 @@ TEST_F(groupby_min_string_test, basic)
     test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }
 
+TEST_F(groupby_min_string_test, zero_valid_values)
+{
+    using K = int32_t;
+
+    fixed_width_column_wrapper<K> keys        { 1, 1, 1};
+    strings_column_wrapper        vals      ( { "año", "bit", "₹1"}, all_null() );
+
+    fixed_width_column_wrapper<K> expect_keys { 1 };
+    strings_column_wrapper        expect_vals({ "" }, all_null());
+
+    auto agg = cudf::experimental::make_min_aggregation();
+    test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
+}
+
 } // namespace test
 } // namespace cudf

From 28f92451e6d5e8a83360a7c742f14def492cae20 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Tue, 10 Mar 2020 23:23:39 +0530
Subject: [PATCH 12/79] changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c660d83500b..0eba669fe2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -221,7 +221,7 @@
 - PR #4358 Fix strings::concat where narep is an empty string
 - PR #4369 Fix race condition in gpuinflate
 - PR #4390 Disable ScatterValid and ScatterNull legacy tests
-
+- PR #4398 Fixes the bug in groupby MIN/MAX on strings when some groups are empty
 
 # cuDF 0.12.0 (04 Feb 2020)
 

From 39a7537fc97b31c5238f9debd2e65e7da7fa2cec Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Wed, 11 Mar 2020 16:47:37 -0400
Subject: [PATCH 13/79] Move BinaryOperation creation to Cython

---
 python/cudf/cudf/_libxx/binaryop.pyx      | 1 +
 python/cudf/cudf/core/column/numerical.py | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_libxx/binaryop.pyx b/python/cudf/cudf/_libxx/binaryop.pyx
index d99a13f4fd1..40f124ce9ad 100644
--- a/python/cudf/cudf/_libxx/binaryop.pyx
+++ b/python/cudf/cudf/_libxx/binaryop.pyx
@@ -154,6 +154,7 @@ def binaryop(lhs, rhs, op, dtype):
     """
     Dispatches a binary op call to the appropriate libcudf function:
     """
+    op = BinaryOperation[op.upper()]
     cdef binary_operator c_op = <binary_operator> (
         <underlying_type_t_binary_operator> op
     )
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index a1960a19d71..5bc3c106a12 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -461,8 +461,7 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
 
     is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
 
-    operator = libcudfxx.binaryop.BinaryOperation[op.upper()]
-    out = libcudfxx.binaryop.binaryop(lhs, rhs, operator, out_dtype)
+    out = libcudfxx.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     if is_op_comparison:
         out = out.fillna(op == "ne")

From e21d82106823809bb8d94846e4c4a13def4c4f1b Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Thu, 12 Mar 2020 04:29:45 +0530
Subject: [PATCH 14/79] Remove extra replace_nulls operation

---
 cpp/src/groupby/hash/groupby.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 06a206f79b7..a039000ef2a 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -169,12 +169,12 @@ void sparse_to_dense_results(
     (auto const& agg_kind) {
       auto tranformed_agg = std::make_unique<aggregation>(agg_kind);
       auto arg_result = to_dense_agg_result(tranformed_agg);
-      if (arg_result->has_nulls()) {
-        auto replacement = numeric_scalar<size_type>(-1, true, stream);
-        auto null_replaced_map = cudf::detail::replace_nulls(
-          *arg_result, replacement, rmm::mr::get_default_resource(), stream);
+      if (arg_result->nullable()) {
+        column_view null_removed_map(data_type(type_to_id<size_type>()),
+          arg_result->size(), 
+          static_cast<void const*>(arg_result->view().template data<size_type>()));
         auto transformed_result = experimental::detail::gather(
-          table_view({col}), *null_replaced_map, false, true, false, mr, stream);
+          table_view({col}), null_removed_map, false, true, false, mr, stream);
         return std::move(transformed_result->release()[0]);
       }
       else {

From 3e4a8d6587ad1beffdafbf7fdc56a4f8d2023cf5 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 12 Mar 2020 08:09:25 -0400
Subject: [PATCH 15/79] Remove Series._rbinaryop and Series._filled_binary_op

---
 python/cudf/cudf/core/series.py | 106 ++++++++++++++------------------
 1 file changed, 47 insertions(+), 59 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index e8306710a8d..ace410f665a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1,5 +1,4 @@
 # Copyright (c) 2018, NVIDIA CORPORATION.
-import operator
 import pickle
 import warnings
 from numbers import Number
@@ -597,7 +596,7 @@ def __repr__(self):
             lines.append(category_memory)
         return "\n".join(lines)
 
-    def _binaryop(self, other, fn, reflect=False):
+    def _binaryop(self, other, fn, fill_value=None, reflect=False):
         """
         Internal util to call a binary operator *fn* on operands *self*
         and *other*.  Return the output Series.  The output dtype is
@@ -619,48 +618,31 @@ def _binaryop(self, other, fn, reflect=False):
         else:
             lhs, rhs = self, other
         rhs = self._normalize_binop_value(rhs)
-        outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect)
-        result = lhs._copy_construct(data=outcol, name=result_name)
-        libcudf.nvtx.nvtx_range_pop()
-        return result
-
-    def _rbinaryop(self, other, fn):
-        """
-        Internal util to call a binary operator *fn* on operands *self*
-        and *other* for reflected operations.  Return the output Series.
-        The output dtype is determined by the input operands.
-        """
-        return self._binaryop(other, fn, reflect=True)
-
-    def _filled_binaryop(self, other, fn, fill_value=None, reflect=False):
-        def func(lhs, rhs):
-            return fn(rhs, lhs) if reflect else fn(lhs, rhs)
-
-        if isinstance(other, Series):
-            lhs, rhs = _align_indices([self, other], allow_non_unique=True)
-        else:
-            lhs, rhs = self, other
 
         if fill_value is not None:
-            if isinstance(rhs, Series):
+            if is_scalar(rhs):
+                lhs = lhs.fillna(fill_value)
+            else:
                 if lhs.nullable and rhs.nullable:
                     lmask = Series(data=lhs.nullmask)
                     rmask = Series(data=rhs.nullmask)
                     mask = (lmask | rmask).data
                     lhs = lhs.fillna(fill_value)
                     rhs = rhs.fillna(fill_value)
-                    result = func(lhs, rhs)
+                    result = lhs._binaryop(rhs, fn=fn, reflect=reflect)
                     data = column.build_column(
                         data=result.data, dtype=result.dtype, mask=mask
                     )
                     return lhs._copy_construct(data=data)
                 elif lhs.nullable:
-                    return func(lhs.fillna(fill_value), rhs)
+                    lhs = lhs.fillna(fill_value)
                 elif rhs.nullable:
-                    return func(lhs, rhs.fillna(fill_value))
-            elif is_scalar(rhs):
-                return func(lhs.fillna(fill_value), rhs)
-        return func(lhs, rhs)
+                    rhs = rhs.fillna(fill_value)
+
+        outcol = lhs._column.binary_operator(fn, rhs, reflect=reflect)
+        result = lhs._copy_construct(data=outcol, name=result_name)
+        libcudf.nvtx.nvtx_range_pop()
+        return result
 
     def add(self, other, fill_value=None, axis=0):
         """Addition of series and other, element-wise
@@ -675,7 +657,7 @@ def add(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.add, fill_value)
+        return self._binaryop(other, "add", fill_value)
 
     def __add__(self, other):
         return self._binaryop(other, "add")
@@ -693,10 +675,12 @@ def radd(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.add, fill_value, True)
+        return self._binaryop(
+            other, "add", fill_value=fill_value, reflect=True
+        )
 
     def __radd__(self, other):
-        return self._rbinaryop(other, "add")
+        return self._binaryop(other, "add", reflect=True)
 
     def sub(self, other, fill_value=None, axis=0):
         """Subtraction of series and other, element-wise
@@ -711,7 +695,7 @@ def sub(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.sub, fill_value)
+        return self._binaryop(other, "sub", fill_value)
 
     def __sub__(self, other):
         return self._binaryop(other, "sub")
@@ -729,10 +713,10 @@ def rsub(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.sub, fill_value, True)
+        return self._binaryop(other, "sub", fill_value, reflect=True)
 
     def __rsub__(self, other):
-        return self._rbinaryop(other, "sub")
+        return self._binaryop(other, "sub", reflect=True)
 
     def mul(self, other, fill_value=None, axis=0):
         """Multiplication of series and other, element-wise
@@ -747,7 +731,7 @@ def mul(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.mul, fill_value)
+        return self._binaryop(other, "mul", fill_value=fill_value)
 
     def __mul__(self, other):
         return self._binaryop(other, "mul")
@@ -765,10 +749,10 @@ def rmul(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.mul, fill_value, True)
+        return self._binaryop(other, "mul", fill_value, True)
 
     def __rmul__(self, other):
-        return self._rbinaryop(other, "mul")
+        return self._binaryop(other, "mul", reflect=True)
 
     def mod(self, other, fill_value=None, axis=0):
         """Modulo of series and other, element-wise
@@ -783,7 +767,7 @@ def mod(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.mod, fill_value)
+        return self._binaryop(other, "mod", fill_value)
 
     def __mod__(self, other):
         return self._binaryop(other, "mod")
@@ -801,10 +785,10 @@ def rmod(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.mod, fill_value, True)
+        return self._binaryop(other, "mod", fill_value, True)
 
     def __rmod__(self, other):
-        return self._rbinaryop(other, "mod")
+        return self._binaryop(other, "mod", reflect=True)
 
     def pow(self, other, fill_value=None, axis=0):
         """Exponential power of series and other, element-wise
@@ -819,7 +803,7 @@ def pow(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.pow, fill_value)
+        return self._binaryop(other, "pow", fill_value)
 
     def __pow__(self, other):
         return self._binaryop(other, "pow")
@@ -837,10 +821,10 @@ def rpow(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.pow, fill_value, True)
+        return self._binaryop(other, "pow", fill_value, True)
 
     def __rpow__(self, other):
-        return self._rbinaryop(other, "pow")
+        return self._binaryop(other, "pow", reflect=True)
 
     def floordiv(self, other, fill_value=None, axis=0):
         """Integer division of series and other, element-wise
@@ -855,7 +839,7 @@ def floordiv(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.floordiv, fill_value)
+        return self._binaryop(other, "floordiv", fill_value)
 
     def __floordiv__(self, other):
         return self._binaryop(other, "floordiv")
@@ -873,12 +857,12 @@ def rfloordiv(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(
-            other, operator.floordiv, fill_value, True
+        return self._binaryop(
+            other, "floordiv", fill_value, True
         )
 
     def __rfloordiv__(self, other):
-        return self._rbinaryop(other, "floordiv")
+        return self._binaryop(other, "floordiv", reflect=True)
 
     def truediv(self, other, fill_value=None, axis=0):
         """Floating division of series and other, element-wise
@@ -893,7 +877,7 @@ def truediv(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.truediv, fill_value)
+        return self._binaryop(other, "truediv", fill_value)
 
     def __truediv__(self, other):
         if self.dtype in list(truediv_int_dtype_corrections.keys()):
@@ -915,14 +899,18 @@ def rtruediv(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.truediv, fill_value, True)
+        return self._binaryop(other, "truediv", fill_value, True)
 
     def __rtruediv__(self, other):
         if self.dtype in list(truediv_int_dtype_corrections.keys()):
             truediv_type = truediv_int_dtype_corrections[str(self.dtype)]
-            return self.astype(truediv_type)._rbinaryop(other, "truediv")
+            return self.astype(truediv_type)._binaryop(
+                other,
+                "truediv",
+                reflect=True
+            )
         else:
-            return self._rbinaryop(other, "truediv")
+            return self._binaryop(other, "truediv", reflect=True)
 
     __div__ = __truediv__
 
@@ -1021,7 +1009,7 @@ def eq(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.eq, fill_value)
+        return self._binaryop(other, "eq", fill_value)
 
     def __eq__(self, other):
         return self._unordered_compare(other, "eq")
@@ -1046,7 +1034,7 @@ def ne(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.ne, fill_value)
+        return self._binaryop(other, "ne", fill_value)
 
     def __ne__(self, other):
         return self._unordered_compare(other, "ne")
@@ -1064,7 +1052,7 @@ def lt(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.lt, fill_value)
+        return self._binaryop(other, "lt", fill_value)
 
     def __lt__(self, other):
         return self._ordered_compare(other, "lt")
@@ -1082,7 +1070,7 @@ def le(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.le, fill_value)
+        return self._binaryop(other, "le", fill_value)
 
     def __le__(self, other):
         return self._ordered_compare(other, "le")
@@ -1100,7 +1088,7 @@ def gt(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.gt, fill_value)
+        return self._binaryop(other, "gt", fill_value)
 
     def __gt__(self, other):
         return self._ordered_compare(other, "gt")
@@ -1118,7 +1106,7 @@ def ge(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._filled_binaryop(other, operator.ge, fill_value)
+        return self._binaryop(other, "ge", fill_value)
 
     def __ge__(self, other):
         return self._ordered_compare(other, "ge")

From ea3fb8928d7f63ffac7c65fc5f5473f42ec61ad4 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 12 Mar 2020 08:09:47 -0400
Subject: [PATCH 16/79] Fix to_cudf_compatible_scalar

---
 python/cudf/cudf/utils/dtypes.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 2150646e434..8741f3e7046 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -143,6 +143,8 @@ def to_cudf_compatible_scalar(val, dtype=None):
     if val is None:
         return val
 
+    dtype = "str" if is_string_dtype(dtype) else dtype
+
     if not is_scalar(val):
         raise ValueError(
             f"Cannot convert value of type {type(val).__name__} "

From e739c076aae284505f350e22d5d424c86995e9a5 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 12 Mar 2020 11:36:24 -0400
Subject: [PATCH 17/79] Remove (un)ordered_compare methods across all column
 types

---
 python/cudf/cudf/core/column/categorical.py | 31 +++++------
 python/cudf/cudf/core/column/column.py      |  2 +-
 python/cudf/cudf/core/column/datetime.py    | 12 ++---
 python/cudf/cudf/core/column/numerical.py   | 18 +++----
 python/cudf/cudf/core/column/string.py      | 11 ++--
 python/cudf/cudf/core/series.py             | 57 ++++++---------------
 python/cudf/cudf/tests/test_categorical.py  |  6 +--
 7 files changed, 48 insertions(+), 89 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 69e89ab52cd..7778ecde5a9 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -140,7 +140,7 @@ def _categories_equal(self, new_categories, **kwargs):
 
             cur_categories = Series(cur_categories).sort_values()
             new_categories = Series(new_categories).sort_values()
-        return cur_categories.equals(new_categories)
+        return cur_categories._column.equals(new_categories._column)
 
     def _set_categories(self, new_categories, **kwargs):
         """Returns a new CategoricalColumn with the categories set to the
@@ -362,13 +362,6 @@ def ordered(self, value):
     def cat(self, parent=None):
         return CategoricalAccessor(self, parent=parent)
 
-    def binary_operator(self, binop, rhs, reflect=False):
-        msg = (
-            "Series of dtype `category` cannot perform the operation: "
-            "{}".format(binop)
-        )
-        raise TypeError(msg)
-
     def unary_operator(self, unaryop):
         msg = (
             "Series of dtype `category` cannot perform the operation: "
@@ -376,18 +369,20 @@ def unary_operator(self, unaryop):
         )
         raise TypeError(msg)
 
-    def unordered_compare(self, cmpop, rhs):
-        if self.dtype != rhs.dtype:
-            raise TypeError("Categoricals can only compare with the same type")
-        return self.as_numerical.unordered_compare(cmpop, rhs.as_numerical)
+    def binary_operator(self, op, rhs, reflect=False):
 
-    def ordered_compare(self, cmpop, rhs):
-        if not (self.ordered and rhs.ordered):
-            msg = "Unordered Categoricals can only compare equality or not"
-            raise TypeError(msg)
+        # Only comparisons are valid; ordered columns must not do arithmetic.
+        if op not in ("eq", "ne", "lt", "gt", "le", "ge"):
+            raise TypeError(
+                f"Series of dtype `{self.dtype}` cannot perform the "
+                f"operation: {op}"
+            )
+        if op not in ("eq", "ne") and not (self.ordered and rhs.ordered):
+            msg = "Unordered Categoricals can only compare equality or not"
+            raise TypeError(msg)
         if self.dtype != rhs.dtype:
             raise TypeError("Categoricals can only compare with the same type")
-        return self.as_numerical.ordered_compare(cmpop, rhs.as_numerical)
+        return self.as_numerical.binary_operator(op, rhs.as_numerical)
 
     def normalize_binop_value(self, other):
         ary = utils.scalar_broadcast_to(
@@ -631,7 +626,7 @@ def pandas_categorical_as_column(categorical, codes=None):
     codes = categorical.codes if codes is None else codes
     codes = column.as_column(codes)
 
-    valid_codes = codes.unordered_compare("ne", codes.dtype.type(-1))
+    valid_codes = codes.binary_operator("ne", codes.dtype.type(-1))
 
     mask = None
     if not valid_codes.all():
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 9bc9bab6087..57fd880c51b 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -140,7 +140,7 @@ def equals(self, other):
             if isinstance(val, np.ndarray):
                 return val.all()
             return bool(val)
-        return self.unordered_compare("eq", other).min()
+        return self.binary_operator("eq", other).min()
 
     def __sizeof__(self):
         n = self.data.size
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 029bb8dbee7..1ac5a90335f 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -177,14 +177,6 @@ def as_string_column(self, dtype, **kwargs):
         else:
             return column.column_empty(0, dtype="object", masked=False)
 
-    def unordered_compare(self, cmpop, rhs):
-        lhs, rhs = self, rhs
-        return binop(lhs, rhs, op=cmpop, out_dtype=np.bool)
-
-    def ordered_compare(self, cmpop, rhs):
-        lhs, rhs = self, rhs
-        return binop(lhs, rhs, op=cmpop, out_dtype=np.bool)
-
     def to_pandas(self, index=None):
         return pd.Series(
             self.to_array(fillna="pandas").astype(self.dtype), index=index
@@ -214,6 +206,10 @@ def default_na_value(self):
                 "datetime column of {} has no NaN value".format(self.dtype)
             )
 
+    def binary_operator(self, op, rhs, reflect=False):
+        lhs, rhs = self, rhs
+        return binop(lhs, rhs, op=op, out_dtype=np.bool)
+
     def fillna(self, fill_value):
         if is_scalar(fill_value):
             fill_value = np.datetime64(fill_value, self.time_unit)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 5bc3c106a12..1595a54c92d 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -59,6 +59,9 @@ def __contains__(self, item):
             self, column.as_column([item], dtype=self.dtype)
         ).any()
 
+    def unary_operator(self, unaryop):
+        return _numeric_column_unaryop(self, op=unaryop)
+
     def binary_operator(self, binop, rhs, reflect=False):
         int_dtypes = [
             np.dtype("int8"),
@@ -89,12 +92,6 @@ def binary_operator(self, binop, rhs, reflect=False):
     def unary_operator(self, unaryop):
         return _numeric_column_unaryop(self, op=unaryop)
 
-    def unordered_compare(self, cmpop, rhs):
-        return _numeric_column_compare(self, rhs, op=cmpop)
-
-    def ordered_compare(self, cmpop, rhs):
-        return _numeric_column_compare(self, rhs, op=cmpop)
-
     def _apply_scan_op(self, op):
         return libcudfxx.reduce.scan(op, self, True)
 
@@ -461,6 +458,9 @@ def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
 
     is_op_comparison = op in ["lt", "gt", "le", "ge", "eq", "ne"]
 
+    if is_op_comparison:
+        out_dtype = "bool"
+
     out = libcudfxx.binaryop.binaryop(lhs, rhs, op, out_dtype)
 
     if is_op_comparison:
@@ -478,10 +478,6 @@ def _numeric_column_unaryop(operand, op):
     return libcudfxx.unary.unary_operation(operand, op)
 
 
-def _numeric_column_compare(lhs, rhs, op):
-    return _numeric_column_binop(lhs, rhs, op, out_dtype=np.bool_)
-
-
 def _safe_cast_to_int(col, dtype):
     """
     Cast given NumericalColumn to given integer dtype safely.
@@ -492,7 +488,7 @@ def _safe_cast_to_int(col, dtype):
         return col
 
     new_col = col.astype(dtype)
-    if new_col.unordered_compare("eq", col).all():
+    if new_col.binary_operator("eq", col).all():
         return new_col
     else:
         raise TypeError(
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 048e2dadb07..e280c86187a 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1023,9 +1023,6 @@ def sort_by_values(self, ascending=True, na_position="last"):
     def copy(self, deep=True):
         return column.as_column(self.nvstrings.copy())
 
-    def unordered_compare(self, cmpop, rhs):
-        return _string_column_binop(self, rhs, op=cmpop)
-
     def find_and_replace(self, to_replace, replacement, all_nan):
         """
         Return col with *to_replace* replaced with *value*
@@ -1108,15 +1105,17 @@ def normalize_binop_value(self, other):
     def default_na_value(self):
         return None
 
-    def binary_operator(self, binop, rhs, reflect=False):
+    def binary_operator(self, op, rhs, reflect=False):
         lhs = self
         if reflect:
             lhs, rhs = rhs, lhs
-        if isinstance(rhs, StringColumn) and binop == "add":
+        if isinstance(rhs, StringColumn) and op == "add":
             return lhs.nvstrings.cat(others=rhs.nvstrings)
+        elif op in ("eq", "ne"):
+            return _string_column_binop(self, rhs, op=op)
         else:
             msg = "{!r} operator not supported between {} and {}"
-            raise TypeError(msg.format(binop, type(self), type(rhs)))
+            raise TypeError(msg.format(op, type(self), type(rhs)))
 
     def sum(self, dtype=None):
         # dtype is irrelevant it is needed to be in sync with
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ace410f665a..93f6edcd4d2 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -619,6 +619,11 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False):
             lhs, rhs = self, other
         rhs = self._normalize_binop_value(rhs)
 
+        if fn == "truediv":
+            if str(lhs.dtype) in truediv_int_dtype_corrections:
+                truediv_type = truediv_int_dtype_corrections[str(lhs.dtype)]
+                lhs = lhs.astype(truediv_type)
+
         if fill_value is not None:
             if is_scalar(rhs):
                 lhs = lhs.fillna(fill_value)
@@ -857,9 +862,7 @@ def rfloordiv(self, other, fill_value=None, axis=0):
         """
         if axis != 0:
             raise NotImplementedError("Only axis=0 supported at this time.")
-        return self._binaryop(
-            other, "floordiv", fill_value, True
-        )
+        return self._binaryop(other, "floordiv", fill_value, True)
 
     def __rfloordiv__(self, other):
         return self._binaryop(other, "floordiv", reflect=True)
@@ -880,11 +883,7 @@ def truediv(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "truediv", fill_value)
 
     def __truediv__(self, other):
-        if self.dtype in list(truediv_int_dtype_corrections.keys()):
-            truediv_type = truediv_int_dtype_corrections[str(self.dtype)]
-            return self.astype(truediv_type)._binaryop(other, "truediv")
-        else:
-            return self._binaryop(other, "truediv")
+        return self._binaryop(other, "truediv")
 
     def rtruediv(self, other, fill_value=None, axis=0):
         """Floating division of series and other, element-wise
@@ -902,15 +901,7 @@ def rtruediv(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "truediv", fill_value, True)
 
     def __rtruediv__(self, other):
-        if self.dtype in list(truediv_int_dtype_corrections.keys()):
-            truediv_type = truediv_int_dtype_corrections[str(self.dtype)]
-            return self.astype(truediv_type)._binaryop(
-                other,
-                "truediv",
-                reflect=True
-            )
-        else:
-            return self._binaryop(other, "truediv", reflect=True)
+        return self._binaryop(other, "truediv", reflect=True)
 
     __div__ = __truediv__
 
@@ -978,24 +969,6 @@ def _normalize_binop_value(self, other):
         else:
             return self._column.normalize_binop_value(other)
 
-    def _unordered_compare(self, other, cmpops):
-        libcudf.nvtx.nvtx_range_push("CUDF_UNORDERED_COMP", "orange")
-        result_name = utils.get_result_name(self, other)
-        other = self._normalize_binop_value(other)
-        outcol = self._column.unordered_compare(cmpops, other)
-        result = self._copy_construct(data=outcol, name=result_name)
-        libcudf.nvtx.nvtx_range_pop()
-        return result
-
-    def _ordered_compare(self, other, cmpops):
-        libcudf.nvtx.nvtx_range_push("CUDF_ORDERED_COMP", "orange")
-        result_name = utils.get_result_name(self, other)
-        other = self._normalize_binop_value(other)
-        outcol = self._column.ordered_compare(cmpops, other)
-        result = self._copy_construct(data=outcol, name=result_name)
-        libcudf.nvtx.nvtx_range_pop()
-        return result
-
     def eq(self, other, fill_value=None, axis=0):
         """Equal to of series and other, element-wise
         (binary operator eq).
@@ -1012,14 +985,14 @@ def eq(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "eq", fill_value)
 
     def __eq__(self, other):
-        return self._unordered_compare(other, "eq")
+        return self._binaryop(other, "eq")
 
     def equals(self, other):
         if self is other:
             return True
         if other is None or len(self) != len(other):
             return False
-        return self._unordered_compare(other, "eq").min()
+        return self._binaryop(other, "eq").min()
 
     def ne(self, other, fill_value=None, axis=0):
         """Not equal to of series and other, element-wise
@@ -1037,7 +1010,7 @@ def ne(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "ne", fill_value)
 
     def __ne__(self, other):
-        return self._unordered_compare(other, "ne")
+        return self._binaryop(other, "ne")
 
     def lt(self, other, fill_value=None, axis=0):
         """Less than of series and other, element-wise
@@ -1055,7 +1028,7 @@ def lt(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "lt", fill_value)
 
     def __lt__(self, other):
-        return self._ordered_compare(other, "lt")
+        return self._binaryop(other, "lt")
 
     def le(self, other, fill_value=None, axis=0):
         """Less than or equal to of series and other, element-wise
@@ -1073,7 +1046,7 @@ def le(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "le", fill_value)
 
     def __le__(self, other):
-        return self._ordered_compare(other, "le")
+        return self._binaryop(other, "le")
 
     def gt(self, other, fill_value=None, axis=0):
         """Greater than of series and other, element-wise
@@ -1091,7 +1064,7 @@ def gt(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "gt", fill_value)
 
     def __gt__(self, other):
-        return self._ordered_compare(other, "gt")
+        return self._binaryop(other, "gt")
 
     def ge(self, other, fill_value=None, axis=0):
         """Greater than or equal to of series and other, element-wise
@@ -1109,7 +1082,7 @@ def ge(self, other, fill_value=None, axis=0):
         return self._binaryop(other, "ge", fill_value)
 
     def __ge__(self, other):
-        return self._ordered_compare(other, "ge")
+        return self._binaryop(other, "ge")
 
     def __invert__(self):
         """Bitwise invert (~) for each element.
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 2f857e7023a..54a35f58130 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -150,7 +150,7 @@ def test_categorical_binary_add():
     with pytest.raises(TypeError) as raises:
         sr + sr
     raises.match(
-        "Series of dtype `category` cannot perform the operation: " "add"
+        "Series of dtype `category` cannot perform the operation: add"
     )
 
 
@@ -166,7 +166,7 @@ def test_categorical_unary_ceil():
     with pytest.raises(TypeError) as raises:
         sr.ceil()
     raises.match(
-        "Series of dtype `category` cannot perform the operation: " "ceil"
+        "Series of dtype `category` cannot perform the operation: ceil"
     )
 
 
@@ -243,7 +243,7 @@ def test_cat_series_binop_error():
     with pytest.raises(TypeError) as raises:
         dfa + dfb
     raises.match(
-        "Series of dtype `category` cannot perform the operation: " "add"
+        "Series of dtype `category` cannot perform the operation: add"
     )
     # if lhs is a numerical
     with pytest.raises(TypeError) as raises:

From 32b39d2c1041e1d314fe268ff86252fb74bcfaf5 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 12 Mar 2020 12:14:31 -0400
Subject: [PATCH 18/79] Port datetime/string ops to libcudfxx

---
 python/cudf/cudf/core/column/datetime.py |  4 +---
 python/cudf/cudf/core/column/string.py   | 12 ++++--------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 1ac5a90335f..70728889852 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -288,8 +288,6 @@ def is_monotonic_decreasing(self):
 
 def binop(lhs, rhs, op, out_dtype):
     libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange")
-    masked = lhs.nullable or rhs.nullable
-    out = column.column_empty_like(lhs, dtype=out_dtype, masked=masked)
-    _ = libcudf.binops.apply_op(lhs, rhs, out, op)
+    out = libcudfxx.binaryop.binaryop(lhs, rhs, op, out_dtype)
     libcudf.nvtx.nvtx_range_pop()
     return out
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index e280c86187a..ba2811fbe75 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1111,8 +1111,8 @@ def binary_operator(self, op, rhs, reflect=False):
             lhs, rhs = rhs, lhs
         if isinstance(rhs, StringColumn) and op == "add":
             return lhs.nvstrings.cat(others=rhs.nvstrings)
-        elif op in ("eq", "ne"):
-            return _string_column_binop(self, rhs, op=op)
+        elif op in ("eq", "ne", "gt", "lt", "ge", "le"):
+            return _string_column_binop(self, rhs, op=op, out_dtype="bool")
         else:
             msg = "{!r} operator not supported between {} and {}"
             raise TypeError(msg.format(op, type(self), type(rhs)))
@@ -1164,12 +1164,8 @@ def _mimic_inplace(self, other_col, inplace=False):
         return out
 
 
-def _string_column_binop(lhs, rhs, op):
+def _string_column_binop(lhs, rhs, op, out_dtype):
     nvtx_range_push("CUDF_BINARY_OP", "orange")
-    # Allocate output
-    masked = lhs.nullable or rhs.nullable
-    out = column.column_empty_like(lhs, dtype="bool", masked=masked)
-    # Call and fix null_count
-    _ = libcudf.binops.apply_op(lhs=lhs, rhs=rhs, out=out, op=op)
+    out = libcudfxx.binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype=out_dtype)
     nvtx_range_pop()
     return out

From acc60a55119f9a8a90fe6964f3a407c347acce4b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 12 Mar 2020 14:53:07 -0700
Subject: [PATCH 19/79] fix .str.rsplit to be similar to .str.split and enable
 tests

---
 python/cudf/cudf/core/column/string.py | 16 +++++++++++-----
 python/cudf/cudf/tests/test_string.py  | 21 ++++++++-------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 24984aadd4e..982777d526c 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -879,10 +879,11 @@ def split(self, pat=None, n=-1, expand=True, **kwargs):
         from cudf._libxx.scalar import Scalar
 
         result_table = cpp_split(self._column, Scalar(pat, "str"), n)
-
         if len(result_table._data) == 1:
             if result_table._data[0].null_count == len(self._parent):
                 result_table = []
+            elif self._parent.null_count == len(self._parent):
+                result_table = [self._column.copy()]
 
         return self._return_or_inplace(result_table, **kwargs,)
 
@@ -920,13 +921,18 @@ def rsplit(self, pat=None, n=-1, expand=True, **kwargs):
 
         kwargs.setdefault("expand", expand)
         if pat is None:
-            pat = " "
+            pat = ""
 
         from cudf._libxx.scalar import Scalar
 
-        return self._return_or_inplace(
-            cpp_rsplit(self._column, Scalar(pat), n), **kwargs
-        )
+        result_table = cpp_rsplit(self._column, Scalar(pat), n)
+        if len(result_table._data) == 1:
+            if result_table._data[0].null_count == len(self._parent):
+                result_table = []
+            elif self._parent.null_count == len(self._parent):
+                result_table = [self._column.copy()]
+
+        return self._return_or_inplace(result_table, **kwargs)
 
     def partition(self, sep=" ", expand=True, **kwargs):
         """
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 96273e0a483..9764a20f0cf 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -469,11 +469,8 @@ def test_string_extract(ps_gs, pat, expand, flags, flags_raise):
         ("a", False),
         ("a", True),
         ("f", False),
-        # TODO, PREM: Analyse and uncomment the
-        # two tests as they seem to pass when run
-        # as independent test but seem to fail as a group test.
-        # (r"[a-z]", True),
-        # (r"[A-Z]", True),
+        (r"[a-z]", True),
+        (r"[A-Z]", True),
         ("hello", False),
         ("FGHI", False),
     ],
@@ -537,8 +534,7 @@ def test_string_upper(ps_gs):
         ["a b", " c ", "   d", "e   ", "f"],
         ["a-b", "-c-", "---d", "e---", "f"],
         ["ab", "c", "d", "e", "f"],
-        # TODO, PREM: Uncomment in future PR
-        # [None, None, None, None, None],
+        [None, None, None, None, None],
     ],
 )
 @pytest.mark.parametrize("pat", [None, " ", "-"])
@@ -1226,11 +1222,11 @@ def test_strings_rsplit(data, n, expand):
     gs = Series(data)
     ps = pd.Series(data)
 
-    # TODO: Uncomment this test once
-    # this is fixed: https://github.com/rapidsai/cudf/issues/4357
-    # assert_eq(
-    #     ps.str.rsplit(n=n, expand=expand), gs.str.rsplit(n=n, expand=expand)
-    # )
+    pd.testing.assert_frame_equal(
+        ps.str.rsplit(n=n, expand=expand).reset_index(),
+        gs.str.rsplit(n=n, expand=expand).to_pandas().reset_index(),
+        check_index_type=False,
+    )
     assert_eq(
         ps.str.rsplit(",", n=n, expand=expand),
         gs.str.rsplit(",", n=n, expand=expand),
@@ -1591,7 +1587,6 @@ def test_string_starts_ends(data, pat):
     [
         # TODO, PREM: Uncomment after this issue is fixed
         # '',
-        # None,
         " ",
         "a",
         "abc",

From ade1f53fbfa4a39fe5b956b561b857497f56a88b Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 12 Mar 2020 16:55:39 -0500
Subject: [PATCH 20/79] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76b6c91caeb..69b6fcce685 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -249,6 +249,7 @@
 - PR #4434 Fix join_strings logic with all-null strings and non-null narep
 - PR #4464 Update Cmake to always link in libnvToolsExt
 - PR #4467 Fix dropna issue for a DataFrame having np.nan
+- PR #4482 Fix `.str.rsplit` & `.str.split` and enable related tests
 
 
 # cuDF 0.12.0 (04 Feb 2020)

From 7efdc49ab94a6e1dd92a9e70ada0e66dcd69ad24 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 13 Mar 2020 09:29:20 -0500
Subject: [PATCH 21/79] nvtext cython stubs

---
 .../cudf/cudf/_libxx/cpp/nvtext/__init__.pxd  |  0
 .../_libxx/cpp/nvtext/generate_ngrams.pxd     | 16 ++++
 .../_libxx/cpp/nvtext/ngrams_tokenize.pxd     | 17 ++++
 .../cudf/cudf/_libxx/cpp/nvtext/normalize.pxd | 12 +++
 .../cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd  | 29 +++++++
 python/cudf/cudf/_libxx/nvtext/__init__.py    |  0
 .../cudf/_libxx/nvtext/generate_ngrams.pyx    | 32 +++++++
 .../cudf/_libxx/nvtext/ngrams_tokenize.pyx    | 39 +++++++++
 python/cudf/cudf/_libxx/nvtext/normalize.pyx  | 22 +++++
 python/cudf/cudf/_libxx/nvtext/tokenize.pyx   | 87 +++++++++++++++++++
 10 files changed, 254 insertions(+)
 create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/__init__.pxd
 create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
 create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
 create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd
 create mode 100644 python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
 create mode 100644 python/cudf/cudf/_libxx/nvtext/__init__.py
 create mode 100644 python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
 create mode 100644 python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
 create mode 100644 python/cudf/cudf/_libxx/nvtext/normalize.pyx
 create mode 100644 python/cudf/cudf/_libxx/nvtext/tokenize.pyx

diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/__init__.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/__init__.pxd
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
new file mode 100644
index 00000000000..f20209956b1
--- /dev/null
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.types cimport size_type
+
+cdef extern from "cudf/nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
+
+    cdef unique_ptr[column] generate_ngrams(
+        const column_view &strings,
+        size_type ngrams,
+        const scalar & separator
+    ) except +
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
new file mode 100644
index 00000000000..0fc892d3b9d
--- /dev/null
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
@@ -0,0 +1,17 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.types cimport size_type
+
+cdef extern from "cudf/nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
+
+    cdef unique_ptr[column] ngrams_tokenize(
+        const column_view & strings,
+        size_type ngrams,
+        const scalar & delimiter,
+        const scalar & separator
+    ) except +
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd
new file mode 100644
index 00000000000..dc4b060d7f6
--- /dev/null
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+
+cdef extern from "cudf/nvtext/normalize.hpp" namespace "nvtext" nogil:
+
+    cdef unique_ptr[column] normalize_spaces(
+        const column_view & strings
+    ) except +
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
new file mode 100644
index 00000000000..0c653c7afbe
--- /dev/null
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
@@ -0,0 +1,29 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.scalar.scalar cimport scalar
+
+cdef extern from "cudf/nvtext/tokenize.hpp" namespace "nvtext" nogil:
+
+    cdef unique_ptr[column] tokenize(
+        const column_view & strings,
+        const scalar & delimiter
+    ) except +
+
+    cdef unique_ptr[column] tokenize(
+        const column_view & strings,
+        const column_view & delimiters
+    ) except +
+
+    cdef unique_ptr[column] count_tokens(
+        const column_view & strings,
+        const scalar & delimiter
+    ) except +
+
+    cdef unique_ptr[column] count_tokens(
+        const column_view & strings,
+        const column_view & delimiters
+    ) except +
diff --git a/python/cudf/cudf/_libxx/nvtext/__init__.py b/python/cudf/cudf/_libxx/nvtext/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
new file mode 100644
index 00000000000..51796e5411a
--- /dev/null
+++ b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
@@ -0,0 +1,32 @@
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.types cimport size_type
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.generate_ngrams cimport (
+    generate_ngrams as cpp_generate_ngrams
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
+
+
+def generate_ngrams(Column strings, int ngrams, Scalar separator):
+    """
+    Call nvtext::generate_ngrams on ``strings`` with ``ngrams`` and
+    ``separator``; the result is returned wrapped in a new Column.
+    """
+    cdef column_view c_strings = strings.view()
+    cdef size_type c_ngrams = ngrams
+    cdef scalar* c_separator = separator.c_value.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_generate_ngrams(c_strings, c_ngrams, c_separator[0])
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
new file mode 100644
index 00000000000..1f7afa2bf09
--- /dev/null
+++ b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
@@ -0,0 +1,39 @@
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.types cimport size_type
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport (
+    ngrams_tokenize as cpp_ngrams_tokenize
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
+
+
+def ngrams_tokenize(
+    Column strings,
+    int ngrams,
+    Scalar delimiter,
+    Scalar separator
+):
+    """
+    Call nvtext::ngrams_tokenize on ``strings``; returns a new Column.
+    """
+    cdef column_view c_strings = strings.view()
+    cdef size_type c_ngrams = ngrams
+    cdef scalar* c_separator = separator.c_value.get()
+    cdef scalar* c_delimiter = delimiter.c_value.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_ngrams_tokenize(
+                c_strings, c_ngrams, c_delimiter[0], c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
new file mode 100644
index 00000000000..7ed15b9dfc6
--- /dev/null
+++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
@@ -0,0 +1,22 @@
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.normalize cimport (
+    normalize as cpp_normalize
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
+
+
+def normalize_spaces(Column strings, int ngrams):
+    cdef column_view source_view = strings.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_normalize(c_strings))
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
new file mode 100644
index 00000000000..5383b8136b2
--- /dev/null
+++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
@@ -0,0 +1,87 @@
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.types cimport size_type
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.tokenize cimport (
+    tokenize as cpp_tokenize,
+    count_tokens as cpp_count_tokens
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
+
+
+def tokenize(Column strings, Scalar delimiter):
+    cdef column_view c_strings = strings.view()
+    cdef scalar* c_delimiter = delimiter.c_value.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_tokenize(
+                c_strings,
+                c_ngrams
+                c_delimiter[0]
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def tokenize(Column strings, Column delimiters):
+    cdef column_view c_strings = strings.view()
+    cdef column_view c_delimiter = delimiter.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_tokenize(
+                c_strings,
+                c_ngrams
+                c_delimiter[0]
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def count_tokens(Column strings, Scalar delimiter):
+    cdef column_view c_strings = strings.view()
+    cdef scalar* c_delimiter = delimiter.c_value.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_count_tokens(
+                c_strings,
+                c_ngrams
+                c_delimiter[0]
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def count_tokens(Column strings, Column delimiters):
+    cdef column_view c_strings = strings.view()
+    cdef column_view c_delimiter = delimiter.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_count_tokens(
+                c_strings,
+                c_ngrams
+                c_delimiter[0]
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))

From f33806c812abfe3b2e0365796ec4a9d8326e7ddf Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Fri, 13 Mar 2020 20:27:24 +0530
Subject: [PATCH 22/79] Changelog typo

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cdc3e7eb71d..2236e701ed9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -234,7 +234,7 @@
 - PR #4358 Fix strings::concat where narep is an empty string
 - PR #4369 Fix race condition in gpuinflate
 - PR #4390 Disable ScatterValid and ScatterNull legacy tests
-- PR #4398 Fixes the bug in groupby in MIN/MAX on strings when strings some groups are empty
+- PR #4398 Fixes the failure in groupby in MIN/MAX on strings when some groups are empty
 - PR #4406 Fix sorted merge issue with null values and ascending=False
 - PR #4423 Tighten up Dask serialization checks
 - PR #4434 Fix join_strings logic with all-null strings and non-null narep

From a77f47864acdcd278c60d5c72ed33b1da0bfb5e8 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 10:13:45 -0500
Subject: [PATCH 23/79] Update python/cudf/cudf/core/column/string.py

Co-Authored-By: Keith Kraus <keith.j.kraus@gmail.com>
---
 python/cudf/cudf/core/column/string.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 982777d526c..8196719f60f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -882,7 +882,7 @@ def split(self, pat=None, n=-1, expand=True, **kwargs):
         if len(result_table._data) == 1:
             if result_table._data[0].null_count == len(self._parent):
                 result_table = []
-            elif self._parent.null_count == len(self._parent):
+            elif self._column.null_count == len(self._column):
                 result_table = [self._column.copy()]
 
         return self._return_or_inplace(result_table, **kwargs,)

From cfe1786a2ddbdfcc0c7bb1b2be52dbaf797231ed Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 10:13:56 -0500
Subject: [PATCH 24/79] Update python/cudf/cudf/core/column/string.py

Co-Authored-By: Keith Kraus <keith.j.kraus@gmail.com>
---
 python/cudf/cudf/core/column/string.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 8196719f60f..ebe338a8014 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -880,7 +880,7 @@ def split(self, pat=None, n=-1, expand=True, **kwargs):
 
         result_table = cpp_split(self._column, Scalar(pat, "str"), n)
         if len(result_table._data) == 1:
-            if result_table._data[0].null_count == len(self._parent):
+            if result_table._data[0].null_count == len(self._column):
                 result_table = []
             elif self._column.null_count == len(self._column):
                 result_table = [self._column.copy()]

From f198709ac65add6b0229ce1a62bda3e2c267427b Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 08:37:00 -0700
Subject: [PATCH 25/79] enable replace_with_backrefs tests

---
 python/cudf/cudf/tests/test_string.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 9764a20f0cf..abb0dd45cdc 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1479,8 +1479,6 @@ def test_string_replace_multi():
     assert_eq(expect, got)
 
 
-# TODO, PREM: Uncomment this following tests after
-# this is fixed: https://github.com/rapidsai/cudf/issues/4380
 @pytest.mark.parametrize(
     "find",
     [
@@ -1488,22 +1486,15 @@ def test_string_replace_multi():
         "(\\d)(\\d)",
         "(\\d)(\\d)",
         "(\\d)(\\d)",
-        # "([a-z])-([a-z])",
+        "([a-z])-([a-z])",
         "([a-z])-([a-zé])",
         "([a-z])-([a-z])",
-        # "([a-z])-([a-zé])",
+        "([a-z])-([a-zé])",
     ],
 )
 @pytest.mark.parametrize(
     "replace",
-    [
-        "\\1-\\2",
-        "V\\2-\\1",
-        "\\1 \\2",
-        "\\2 \\1",
-        # "X\\1+\\2Z",
-        #  "X\\1+\\2Z"
-    ],
+    ["\\1-\\2", "V\\2-\\1", "\\1 \\2", "\\2 \\1", "X\\1+\\2Z", "X\\1+\\2Z"],
 )
 def test_string_replace_with_backrefs(find, replace):
     s = [
@@ -1513,7 +1504,7 @@ def test_string_replace_with_backrefs(find, replace):
         None,
         "tést-string",
         "two-thréé four-fivé",
-        # "abcd-éfgh",
+        "abcd-éfgh",
         "tést-string-again",
     ]
     ps = pd.Series(s)

From b3a4a2bd78c506a23285f4f0067c4f709c9c0c42 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Fri, 13 Mar 2020 10:57:43 -0500
Subject: [PATCH 26/79] code changes

---
 python/cudf/cudf/_libxx/copying.pyx | 43 +++++++++++++++++++++++------
 python/cudf/cudf/core/dataframe.py  |  2 +-
 python/cudf/cudf/core/frame.py      | 28 +++++++++++++++----
 3 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/_libxx/copying.pyx b/python/cudf/cudf/_libxx/copying.pyx
index 4c692e7b210..03d3a017137 100644
--- a/python/cudf/cudf/_libxx/copying.pyx
+++ b/python/cudf/cudf/_libxx/copying.pyx
@@ -313,9 +313,14 @@ def column_allocate_like(Column input_column, size=None):
     return Column.from_unique_ptr(move(c_result))
 
 
-def table_empty_like(Table input_table):
+def table_empty_like(Table input_table, bool keep_index=True):
+
+    cdef table_view input_table_view
+    if keep_index is True:
+        input_table_view = input_table.view()
+    else:
+        input_table_view = input_table.data_view()
 
-    cdef table_view input_table_view = input_table.view()
     cdef unique_ptr[table] c_result
 
     with nogil:
@@ -324,7 +329,9 @@ def table_empty_like(Table input_table):
     return Table.from_unique_ptr(
         move(c_result),
         column_names=input_table._column_names,
-        index_names=input_table._index._column_names
+        index_names=(
+            input_table._index._column_names if keep_index is True else None
+        )
     )
 
 
@@ -357,9 +364,14 @@ def column_slice(Column input_column, object indices):
     return result
 
 
-def table_slice(Table input_table, object indices):
+def table_slice(Table input_table, object indices, bool keep_index=True):
+
+    cdef table_view input_table_view
+    if keep_index is True:
+        input_table_view = input_table.view()
+    else:
+        input_table_view = input_table.data_view()
 
-    cdef table_view input_table_view = input_table.view()
     cdef vector[size_type] c_indices
     c_indices.reserve(len(indices))
 
@@ -382,7 +394,11 @@ def table_slice(Table input_table, object indices):
             c_result[i],
             input_table,
             column_names=input_table._column_names,
-            index_names=input_table._index._column_names
+            index_names=(
+                input_table._index._column_names if (
+                    keep_index is True)
+                else None
+            )
         ) for i in range(num_of_result_cols)]
 
     return result
@@ -419,9 +435,14 @@ def column_split(Column input_column, object splits):
     return result
 
 
-def table_split(Table input_table, object splits):
+def table_split(Table input_table, object splits, keep_index=True):
+
+    cdef table_view input_table_view
+    if keep_index is True:
+        input_table_view = input_table.view()
+    else:
+        input_table_view = input_table.data_view()
 
-    cdef table_view input_table_view = input_table.view()
     cdef vector[size_type] c_splits
     c_splits.reserve(len(splits))
 
@@ -444,7 +465,11 @@ def table_split(Table input_table, object splits):
             c_result[i],
             input_table,
             column_names=input_table._column_names,
-            index_names=input_table._index._column_names
+            index_names=(
+                input_table._index._column_names if (
+                    keep_index is True)
+                else None
+            )
         ) for i in range(num_of_result_cols)]
 
     return result
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4c31547cb1b..2e18dcedc5f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3964,7 +3964,7 @@ def scatter_by_map(self, map_index, map_size=None, keep_index=True):
             # Append empty dataframes if map_size > len(tables)
 
             for i in range(map_size - len(tables)):
-                tables.append(self.take([]))
+                tables.append(self._empty_like(keep_index))
         return tables
 
     def stack(self, level=-1, dropna=True):
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 72bf2af4cfc..72b5a4eacca 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -106,8 +106,10 @@ def _scatter(self, key, value):
         result._copy_categories(self)
         return result
 
-    def _empty_like(self):
-        result = self._from_table(libcudfxx.copying.table_empty_like(self))
+    def _empty_like(self, keep_index=True):
+        result = self._from_table(
+            libcudfxx.copying.table_empty_like(self, keep_index)
+        )
 
         result._copy_categories(self)
         return result
@@ -121,6 +123,8 @@ def _slice(self, arg):
        arg : should always be of type slice and doesn't handle step
 
        """
+        from cudf.core.index import RangeIndex
+
         num_rows = len(self)
         if num_rows == 0:
             return self
@@ -131,22 +135,34 @@ def _slice(self, arg):
                 """Step size is not supported other than None and 1"""
             )
 
+        # This is just to handle RangeIndex type, stop
+        # it from materializing unnecessarily
+        keep_index = True
+        if isinstance(self.index, RangeIndex):
+            keep_index = False
+
         if start < 0:
             start = start + num_rows
         if stop < 0:
             stop = stop + num_rows
 
         if start > stop:
-            return self._empty_like()
+            return self._empty_like(keep_index)
         else:
             start = len(self) if start > num_rows else start
             stop = len(self) if stop > num_rows else stop
 
             result = self._from_table(
-                libcudfxx.copying.table_slice(self, [start, stop])[0]
+                libcudfxx.copying.table_slice(self, [start, stop], keep_index)[
+                    0
+                ]
             )
 
-            result._copy_categories(self)
+            result._copy_categories(self, keep_index)
+            # Adding index of type RangeIndex back to
+            # result
+            if keep_index is False:
+                result.index = RangeIndex(start, stop)
             return result
 
     def _normalize_scalars(self, other):
@@ -366,7 +382,7 @@ def _scatter_to_tables(self, scatter_map, keep_index=True):
             self, scatter_map, keep_index
         )
         result = [self._from_table(tbl) for tbl in result]
-        [frame._copy_categories(self) for frame in result]
+        [frame._copy_categories(self, keep_index) for frame in result]
 
         return result
 

From 39e1103b819bc644edf9bf77c2f65b371e43e905 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Fri, 13 Mar 2020 11:03:59 -0500
Subject: [PATCH 27/79] CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54329da76a7..33918e3e8c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -165,6 +165,7 @@
 - PR #4316 Add Java and JNI bindings for substring expression
 - PR #4314 Add Java and JNI bindings for string contains
 - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython
+- PR #4499 Adding changes to handle include `keep_index` and `RangeIndex`
 
 ## Bug Fixes
 

From 5b1d24ce4cbd51d50d7cae7868bf8024e298ee27 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 09:17:04 -0700
Subject: [PATCH 28/79] clean up .str.join stale code

---
 python/cudf/cudf/core/column/string.py | 38 --------------------------
 1 file changed, 38 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ebe338a8014..9296ebc9e70 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -295,44 +295,6 @@ def cat(self, others=None, sep=None, na_rep=None, **kwargs):
             out = out[0]
         return out
 
-    # TODO, PREM: Uncomment in future PR
-    # def join(self, sep, na_rep="", **kwargs):
-    #     """
-    #     Join lists contained as elements in the Series/Index with passed
-    #     delimiter.
-
-    #     Parameters
-    #     ----------
-    #         sep : str
-    #             Delimiter to use between list entries.
-
-    #         na_rep : str
-    #             This character will take the place of any null strings
-    #             (not empty strings) in either list.
-
-    #     Returns
-    #     -------
-    #     Series/Index of str dtype
-    #         The list entries concatenated by intervening
-    #         occurrences of the delimiter.
-
-    #     """
-    #     from cudf._libxx.scalar import Scalar
-    #     from cudf.core.series import Series
-    #     # import pdb; pdb.set_trace()
-
-    #     data = cpp_join(self._column, Scalar(sep), Scalar(na_rep))
-    #     if len(data) != len(self._parent):
-    #         data = column.as_column(
-    #             utils.scalar_broadcast_to(data[0],
-    # len(self._parent), dtype='str')
-    #         )
-    #     return Series(
-    #         data=data,
-    #         index=self._parent.index,
-    #         dtype='str'
-    #     )
-
     def join(self, sep):
         """
         Join lists contained as elements in the Series/Index with passed

From 56b0859311036af30193db45ec0748ac54648336 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Fri, 13 Mar 2020 11:56:50 -0500
Subject: [PATCH 29/79] Review changes

---
 python/cudf/cudf/core/dataframe.py |  2 +-
 python/cudf/cudf/core/frame.py     | 20 +++++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2e18dcedc5f..ea57b9b92b7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1490,7 +1490,6 @@ def reset_index(self, drop=False, inplace=False):
             result = self
         else:
             result = self.copy()
-        index_columns = self.index._data.columns
         if all(name is None for name in self.index.names):
             if isinstance(self.index, cudf.MultiIndex):
                 names = tuple(
@@ -1502,6 +1501,7 @@ def reset_index(self, drop=False, inplace=False):
             names = self.index.names
 
         if not drop:
+            index_columns = self.index._data.columns
             for name, index_column in zip(
                 reversed(names), reversed(index_columns)
             ):
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 72b5a4eacca..7f86c55cf6c 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -84,7 +84,7 @@ def _hash_partition(
             self, columns_to_hash, num_partitions, keep_index
         )
         output = self.__class__._from_table(output)
-        output._copy_categories(self)
+        output._copy_categories(self, include_index=keep_index)
         return output, offsets
 
     def _as_column(self):
@@ -111,7 +111,7 @@ def _empty_like(self, keep_index=True):
             libcudfxx.copying.table_empty_like(self, keep_index)
         )
 
-        result._copy_categories(self)
+        result._copy_categories(self, include_index=keep_index)
         return result
 
     def _slice(self, arg):
@@ -124,6 +124,7 @@ def _slice(self, arg):
 
        """
         from cudf.core.index import RangeIndex
+        from cudf import DataFrame, Series
 
         num_rows = len(self)
         if num_rows == 0:
@@ -138,7 +139,9 @@ def _slice(self, arg):
         # This is just to handle RangeIndex type, stop
         # it from materializing unnecessarily
         keep_index = True
-        if isinstance(self.index, RangeIndex):
+        if isinstance(self, (DataFrame, Series)) and isinstance(
+            self.index, RangeIndex
+        ):
             keep_index = False
 
         if start < 0:
@@ -158,11 +161,11 @@ def _slice(self, arg):
                 ]
             )
 
-            result._copy_categories(self, keep_index)
+            result._copy_categories(self, include_index=keep_index)
             # Adding index of type RangeIndex back to
             # result
-            if keep_index is False:
-                result.index = RangeIndex(start, stop)
+            if keep_index is False and isinstance(self, (DataFrame, Series)):
+                result.index = self.index[start:stop]
             return result
 
     def _normalize_scalars(self, other):
@@ -382,7 +385,10 @@ def _scatter_to_tables(self, scatter_map, keep_index=True):
             self, scatter_map, keep_index
         )
         result = [self._from_table(tbl) for tbl in result]
-        [frame._copy_categories(self, keep_index) for frame in result]
+        [
+            frame._copy_categories(self, include_index=keep_index)
+            for frame in result
+        ]
 
         return result
 

From 2b3e24da0dfa11202cc3bc393991fb96796a60c8 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 10:14:22 -0700
Subject: [PATCH 30/79] special case handling for .str.startswith and
 .str.endswith

---
 python/cudf/cudf/core/column/string.py | 30 ++++++++++++++++++++------
 python/cudf/cudf/tests/test_string.py  | 13 +----------
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 9296ebc9e70..d94df8265b5 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1435,9 +1435,18 @@ def endswith(self, pat, **kwargs):
 
         from cudf._libxx.scalar import Scalar
 
-        return self._return_or_inplace(
-            cpp_endswith(self._column, Scalar(pat, "str")), **kwargs
-        )
+        # TODO: Cleanup if/else blocks after this issue is fixed:
+        # https://github.com/rapidsai/cudf/issues/4500
+        if pat == "":
+            result_col = column.as_column(
+                True, dtype="bool", length=len(self._column)
+            ).set_mask(self._column.mask)
+        elif pat is None:
+            result_col = column.as_column(np.nan, length=len(self._column))
+        else:
+            result_col = cpp_endswith(self._column, Scalar(pat, "str"))
+
+        return self._return_or_inplace(result_col, **kwargs)
 
     def startswith(self, pat, **kwargs):
         """
@@ -1463,9 +1472,18 @@ def startswith(self, pat, **kwargs):
 
         from cudf._libxx.scalar import Scalar
 
-        return self._return_or_inplace(
-            cpp_startswith(self._column, Scalar(pat, "str")), **kwargs
-        )
+        # TODO: Cleanup if/else blocks after this issue is fixed:
+        # https://github.com/rapidsai/cudf/issues/4500
+        if pat == "":
+            result_col = column.as_column(
+                True, dtype="bool", length=len(self._column)
+            ).set_mask(self._column.mask)
+        elif pat is None:
+            result_col = column.as_column(np.nan, length=len(self._column))
+        else:
+            result_col = cpp_startswith(self._column, Scalar(pat, "str"))
+
+        return self._return_or_inplace(result_col, **kwargs)
 
     def find(self, sub, start=0, end=None, **kwargs):
         """
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index abb0dd45cdc..2db7587f8ad 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1539,18 +1539,7 @@ def test_string_table_view_creation():
     ],
 )
 @pytest.mark.parametrize(
-    "pat",
-    [
-        # TODO, PREM: Uncomment after this issue is fixed
-        # '',
-        # None,
-        " ",
-        "a",
-        "abc",
-        "cat",
-        "$",
-        "\n",
-    ],
+    "pat", ["", None, " ", "a", "abc", "cat", "$", "\n"],
 )
 def test_string_starts_ends(data, pat):
     ps = pd.Series(data)

From a8ef01d6ba21b8e562f7eb2c282179960df80cae Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 13 Mar 2020 13:11:29 -0500
Subject: [PATCH 31/79] nvtext: fix various syntax and import errors.

---
 CHANGELOG.md                                  |   2 +
 .../_libxx/cpp/nvtext/generate_ngrams.pxd     |   2 +-
 .../_libxx/cpp/nvtext/ngrams_tokenize.pxd     |   2 +-
 .../cudf/cudf/_libxx/cpp/nvtext/normalize.pxd |   2 +-
 .../cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd  |   2 +-
 python/cudf/cudf/_libxx/nvtext/__init__.pxd   |   0
 .../cudf/_libxx/nvtext/generate_ngrams.pyx    |  52 +++---
 .../cudf/_libxx/nvtext/ngrams_tokenize.pyx    |  66 +++----
 python/cudf/cudf/_libxx/nvtext/normalize.pyx  |   6 +-
 python/cudf/cudf/_libxx/nvtext/tokenize.pyx   | 166 +++++++++---------
 10 files changed, 147 insertions(+), 153 deletions(-)
 create mode 100644 python/cudf/cudf/_libxx/nvtext/__init__.pxd

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54329da76a7..0de164a1fa1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -145,6 +145,7 @@
 - PR #4244 Port nvstrings Substring Gather/Scatter functions to cuDF Python/Cython
 - PR #4280 Port nvstrings Numeric Handling functions to cuDF Python/Cython
 - PR #4278 Port filling.pyx to libcudf++ API
+- PR #4278 Port filling.pyx to libcudf++ API
 - PR #4328 Add memory threshold callbacks for Java RMM event handler
 - PR #4336 Move a bunch of internal nvstrings code to use native StringColumns
 - PR #4166 Port `is_sorted.pyx` to use libcudf++ APIs
@@ -165,6 +166,7 @@
 - PR #4316 Add Java and JNI bindings for substring expression
 - PR #4314 Add Java and JNI bindings for string contains
 - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython
+- PR #4495 Port nvtext to cuDF Python/Cython
 
 ## Bug Fixes
 
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
index f20209956b1..d75acb92c71 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
@@ -7,7 +7,7 @@ from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.scalar.scalar cimport scalar
 from cudf._libxx.cpp.types cimport size_type
 
-cdef extern from "cudf/nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
+cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] generate_ngrams(
         const column_view &strings,
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
index 0fc892d3b9d..b34c1a2953d 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
@@ -7,7 +7,7 @@ from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.scalar.scalar cimport scalar
 from cudf._libxx.cpp.types cimport size_type
 
-cdef extern from "cudf/nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
+cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] ngrams_tokenize(
         const column_view & strings,
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd
index dc4b060d7f6..900b9e0b0b9 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/normalize.pxd
@@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr
 from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.column.column_view cimport column_view
 
-cdef extern from "cudf/nvtext/normalize.hpp" namespace "nvtext" nogil:
+cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] normalize_spaces(
         const column_view & strings
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
index 0c653c7afbe..7aa0fb2f12d 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
@@ -6,7 +6,7 @@ from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.scalar.scalar cimport scalar
 
-cdef extern from "cudf/nvtext/tokenize.hpp" namespace "nvtext" nogil:
+cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] tokenize(
         const column_view & strings,
diff --git a/python/cudf/cudf/_libxx/nvtext/__init__.pxd b/python/cudf/cudf/_libxx/nvtext/__init__.pxd
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
index 51796e5411a..f83262ae521 100644
--- a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
@@ -1,32 +1,32 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# # Copyright (c) 2018-2020, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from cudf._libxx.move cimport move
+# from libcpp.memory cimport unique_ptr
+# from cudf._libxx.move cimport move
 
-from cudf._libxx.cpp.column.column cimport column
-from cudf._libxx.cpp.scalar.scalar cimport scalar
-from cudf._libxx.cpp.types cimport size_type
-from cudf._libxx.cpp.column.column_view cimport column_view
-from cudf._libxx.cpp.nvtext.generate_ngrams cimport (
-    generate_ngrams as cpp_generate_ngrams
-)
-from cudf._libxx.column cimport Column
-from cudf._libxx.scalar cimport Scalar
+# from cudf._libxx.cpp.column.column cimport column
+# from cudf._libxx.cpp.scalar.scalar cimport scalar
+# from cudf._libxx.cpp.types cimport size_type
+# from cudf._libxx.cpp.column.column_view cimport column_view
+# from cudf._libxx.cpp.nvtext.generate_ngrams cimport (
+#     generate_ngrams as cpp_generate_ngrams
+# )
+# from cudf._libxx.column cimport Column
+# from cudf._libxx.scalar cimport Scalar
 
 
-def generate_ngrams(Column strings, int ngrams, Scalar separator):
-    cdef column_view source_view = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef scalar* c_separator = separator.c_value.get()
-    cdef unique_ptr[column] c_result
+# def generate_ngrams(Column strings, int ngrams, Scalar separator):
+#     cdef column_view c_strings = strings.view()
+#     cdef size_type c_ngrams = ngrams
+#     cdef scalar* c_separator = separator.c_value.get()
+#     cdef unique_ptr[column] c_result
 
-    with nogil:
-        c_result = move(
-            cpp_generate_ngrams(
-                c_strings,
-                c_ngrams
-                c_separator[0]
-            )
-        )
+#     with nogil:
+#         c_result = move(
+#             cpp_generate_ngrams(
+#                 c_strings,
+#                 c_ngrams,
+#                 c_separator[0]
+#             )
+#         )
 
-    return Column.from_unique_ptr(move(c_result))
+#     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
index 1f7afa2bf09..86d47382dea 100644
--- a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
@@ -1,39 +1,39 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# # Copyright (c) 2018-2020, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from cudf._libxx.move cimport move
+# from libcpp.memory cimport unique_ptr
+# from cudf._libxx.move cimport move
 
-from cudf._libxx.cpp.column.column cimport column
-from cudf._libxx.cpp.scalar.scalar cimport scalar
-from cudf._libxx.cpp.types cimport size_type
-from cudf._libxx.cpp.column.column_view cimport column_view
-from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport (
-    ngrams_tokenize as cpp_ngrams_tokenize
-)
-from cudf._libxx.column cimport Column
-from cudf._libxx.scalar cimport Scalar
+# from cudf._libxx.cpp.column.column cimport column
+# from cudf._libxx.cpp.scalar.scalar cimport scalar
+# from cudf._libxx.cpp.types cimport size_type
+# from cudf._libxx.cpp.column.column_view cimport column_view
+# from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport (
+#     ngrams_tokenize as cpp_ngrams_tokenize
+# )
+# from cudf._libxx.column cimport Column
+# from cudf._libxx.scalar cimport Scalar
 
 
-def ngrams_tokenize(
-    Column strings,
-    int ngrams,
-    Scalar delimiter,
-    Scalar separator
-):
-    cdef column_view source_view = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef scalar* c_separator = separator.c_value.get()
-    cdef scalar* c_delimiter = delimiter.c_value.get()
-    cdef unique_ptr[column] c_result
+# def ngrams_tokenize(
+#     Column strings,
+#     int ngrams,
+#     Scalar delimiter,
+#     Scalar separator
+# ):
+#     cdef column_view c_strings = strings.view()
+#     cdef size_type c_ngrams = ngrams
+#     cdef scalar* c_separator = separator.c_value.get()
+#     cdef scalar* c_delimiter = delimiter.c_value.get()
+#     cdef unique_ptr[column] c_result
 
-    with nogil:
-        c_result = move(
-            cpp_ngrams_tokenize(
-                c_strings,
-                c_ngrams
-                c_delimiter[0]
-                c_separator[0]
-            )
-        )
+#     with nogil:
+#         c_result = move(
+#             cpp_ngrams_tokenize(
+#                 c_strings,
+#                 c_ngrams,
+#                 c_delimiter[0],
+#                 c_separator[0]
+#             )
+#         )
 
-    return Column.from_unique_ptr(move(c_result))
+#     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
index 7ed15b9dfc6..7b4432c54a4 100644
--- a/python/cudf/cudf/_libxx/nvtext/normalize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
@@ -6,17 +6,17 @@ from cudf._libxx.move cimport move
 from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.nvtext.normalize cimport (
-    normalize as cpp_normalize
+    normalize_spaces as cpp_normalize_spaces
 )
 from cudf._libxx.column cimport Column
 from cudf._libxx.scalar cimport Scalar
 
 
 def normalize_spaces(Column strings, int ngrams):
-    cdef column_view source_view = strings.view()
+    cdef column_view c_strings = strings.view()
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = move(cpp_normalize(c_strings))
+        c_result = move(cpp_normalize_spaces(c_strings))
 
     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
index 5383b8136b2..e3af10e2cac 100644
--- a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
@@ -1,87 +1,79 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-from cudf._libxx.move cimport move
-
-from cudf._libxx.cpp.column.column cimport column
-from cudf._libxx.cpp.scalar.scalar cimport scalar
-from cudf._libxx.cpp.types cimport size_type
-from cudf._libxx.cpp.column.column_view cimport column_view
-from cudf._libxx.cpp.nvtext.tokenize cimport (
-    tokenize as cpp_tokenize,
-    count_tokens as cpp_count_tokens
-)
-from cudf._libxx.column cimport Column
-from cudf._libxx.scalar cimport Scalar
-
-
-def tokenize(Column strings, Scalar delimiter):
-    cdef column_view c_strings = strings.view()
-    cdef scalar* c_delimiter = delimiter.c_value.get()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_ngrams
-                c_delimiter[0]
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-def tokenize(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiter = delimiter.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_tokenize(
-                c_strings,
-                c_ngrams
-                c_delimiter[0]
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-def count_tokens(Column strings, Scalar delimiter):
-    cdef column_view c_strings = strings.view()
-    cdef scalar* c_delimiter = delimiter.c_value.get()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_ngrams
-                c_delimiter[0]
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
-
-
-def count_tokens(Column strings, Column delimiters):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_delimiter = delimiter.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_count_tokens(
-                c_strings,
-                c_ngrams
-                c_delimiter[0]
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+# # Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
+# from libcpp.memory cimport unique_ptr
+# from cudf._libxx.move cimport move
+
+# from cudf._libxx.cpp.column.column cimport column
+# from cudf._libxx.cpp.scalar.scalar cimport scalar
+# from cudf._libxx.cpp.types cimport size_type
+# from cudf._libxx.cpp.column.column_view cimport column_view
+# from cudf._libxx.cpp.nvtext.tokenize cimport (
+#     tokenize as cpp_tokenize,
+#     count_tokens as cpp_count_tokens
+# )
+# from cudf._libxx.column cimport Column
+# from cudf._libxx.scalar cimport Scalar
+
+
+# def tokenize(Column strings, Scalar delimiter):
+#     cdef column_view c_strings = strings.view()
+#     cdef scalar* c_delimiter = delimiter.c_value.get()
+#     cdef unique_ptr[column] c_result
+
+#     with nogil:
+#         c_result = move(
+#             cpp_tokenize(
+#                 c_strings,
+#                 c_delimiter[0],
+#             )
+#         )
+
+#     return Column.from_unique_ptr(move(c_result))
+
+
+# def tokenize(Column strings, Column delimiters):
+#     cdef column_view c_strings = strings.view()
+#     cdef column_view c_delimiters = delimiters.view()
+#     cdef unique_ptr[column] c_result
+
+#     with nogil:
+#         c_result = move(
+#             cpp_tokenize(
+#                 c_strings,
+#                 c_delimiters
+#             )
+#         )
+
+#     return Column.from_unique_ptr(move(c_result))
+
+
+# def count_tokens(Column strings, Scalar delimiter):
+#     cdef column_view c_strings = strings.view()
+#     cdef scalar* c_delimiter = delimiter.c_value.get()
+#     cdef unique_ptr[column] c_result
+
+#     with nogil:
+#         c_result = move(
+#             cpp_count_tokens(
+#                 c_strings,
+#                 c_delimiter[0]
+#             )
+#         )
+
+#     return Column.from_unique_ptr(move(c_result))
+
+
+# def count_tokens(Column strings, Column delimiters):
+#     cdef column_view c_strings = strings.view()
+#     cdef column_view c_delimiters = delimiters.view()
+#     cdef unique_ptr[column] c_result
+
+#     with nogil:
+#         c_result = move(
+#             cpp_count_tokens(
+#                 c_strings,
+#                 c_delimiters
+#             )
+#         )
+
+#     return Column.from_unique_ptr(move(c_result))

From fd489e927624d1fe040a7caead05eecaecb11fd2 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Fri, 13 Mar 2020 13:36:04 -0500
Subject: [PATCH 32/79] Review changes: use a Column rather than a Series (which
 might create one more index), and update table.pyx

---
 python/cudf/cudf/_libxx/table.pyx  |  4 +---
 python/cudf/cudf/core/dataframe.py | 20 ++++++++++----------
 python/cudf/cudf/core/frame.py     |  7 ++-----
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/_libxx/table.pyx b/python/cudf/cudf/_libxx/table.pyx
index c83123ee38b..9bfd413a028 100644
--- a/python/cudf/cudf/_libxx/table.pyx
+++ b/python/cudf/cudf/_libxx/table.pyx
@@ -51,9 +51,7 @@ cdef class Table:
     @property
     def _num_rows(self):
         if self._index is not None:
-            if len(self._index._data) == 0:
-                return 0
-            return self._index._num_rows
+            return len(self._index)
         return len(self._data.columns[0])
 
     @property
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index ea57b9b92b7..dd956f0dffa 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3925,33 +3925,33 @@ def scatter_by_map(self, map_index, map_size=None, keep_index=True):
         """
 
         # map_index might be a column name or array,
-        # make it a Series
+        # make it a Column
         if isinstance(map_index, str):
-            map_index = self[map_index]
+            map_index = self[map_index]._column
+        elif isinstance(map_index, Series):
+            map_index = map_index._column
         else:
-            map_index = Series(map_index)
+            map_index = as_column(map_index)
 
         # Convert float to integer
         if map_index.dtype == np.float:
             map_index = map_index.astype(np.int32)
 
         # Convert string or categorical to integer
-        if isinstance(map_index._column, StringColumn):
-            map_index = Series(
-                map_index._column.as_categorical_column(np.int32).as_numerical
-            )
+        if isinstance(map_index, StringColumn):
+            map_index = map_index.as_categorical_column(np.int32).as_numerical
             warnings.warn(
                 "Using StringColumn for map_index in scatter_by_map. "
                 "Use an integer array/column for better performance."
             )
-        elif isinstance(map_index._column, CategoricalColumn):
-            map_index = Series(map_index._column.as_numerical)
+        elif isinstance(map_index, CategoricalColumn):
+            map_index = map_index.as_numerical
             warnings.warn(
                 "Using CategoricalColumn for map_index in scatter_by_map. "
                 "Use an integer array/column for better performance."
             )
 
-        tables = self._scatter_to_tables(map_index._column, keep_index)
+        tables = self._scatter_to_tables(map_index, keep_index)
 
         if map_size:
             # Make sure map_size is >= the number of uniques in map_index
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7f86c55cf6c..73f416cdb9c 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -124,7 +124,6 @@ def _slice(self, arg):
 
        """
         from cudf.core.index import RangeIndex
-        from cudf import DataFrame, Series
 
         num_rows = len(self)
         if num_rows == 0:
@@ -139,9 +138,7 @@ def _slice(self, arg):
         # This is just to handle RangeIndex type, stop
         # it from materializing unnecessarily
         keep_index = True
-        if isinstance(self, (DataFrame, Series)) and isinstance(
-            self.index, RangeIndex
-        ):
+        if self.index is not None and isinstance(self.index, RangeIndex):
             keep_index = False
 
         if start < 0:
@@ -164,7 +161,7 @@ def _slice(self, arg):
             result._copy_categories(self, include_index=keep_index)
             # Adding index of type RangeIndex back to
             # result
-            if keep_index is False and isinstance(self, (DataFrame, Series)):
+            if keep_index is False and self.index is not None:
                 result.index = self.index[start:stop]
             return result
 

From f8e61dfc062ff9bf91f3cffcc998b46f8f9297f0 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 12:15:24 -0700
Subject: [PATCH 33/79] re-work .str.find, .str.rfind, .str.index, .str.rindex
 logic to handle special cases

---
 python/cudf/cudf/core/column/string.py | 89 ++++++++++++++++++++++----
 python/cudf/cudf/tests/test_string.py  | 48 ++++----------
 2 files changed, 88 insertions(+), 49 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index d94df8265b5..5b1d8acc182 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1507,14 +1507,33 @@ def find(self, sub, start=0, end=None, **kwargs):
         Series or Index of int
 
         """
+        if not isinstance(sub, str):
+            msg = "expected a string object, not {0}"
+            raise TypeError(msg.format(type(sub).__name__))
+
         from cudf._libxx.scalar import Scalar
 
         if end is None:
             end = -1
+        mask = self._column.mask
 
-        return self._return_or_inplace(
-            cpp_find(self._column, Scalar(sub, "str"), start, end), **kwargs
-        )
+        if sub == "":
+            result_col = column.as_column(
+                start, dtype="float", length=len(self._column)
+            )
+        else:
+            result_col = cpp_find(self._column, Scalar(sub, "str"), start, end)
+
+        result_col = result_col.set_mask(mask)
+        if self._column.has_nulls:
+            result_col = result_col.astype("float64")
+        else:
+            result_col = result_col.astype("int64")
+
+        result = self._return_or_inplace(result_col, **kwargs)
+        if sub == "":
+            result[self._parent.str.len() < start] = -1
+        return result
 
     def rfind(self, sub, start=0, end=None, **kwargs):
         """
@@ -1538,13 +1557,35 @@ def rfind(self, sub, start=0, end=None, **kwargs):
         Series or Index of int
 
         """
+        if not isinstance(sub, str):
+            msg = "expected a string object, not {0}"
+            raise TypeError(msg.format(type(sub).__name__))
+
         from cudf._libxx.scalar import Scalar
 
         if end is None:
             end = -1
-        return self._return_or_inplace(
-            cpp_rfind(self._column, Scalar(sub, "str"), start, end), **kwargs
-        )
+        mask = self._column.mask
+
+        if sub == "":
+            result_col = cpp_count_characters(self._column)
+        else:
+            result_col = cpp_rfind(
+                self._column, Scalar(sub, "str"), start, end
+            )
+
+        result_col = result_col.set_mask(mask)
+        if self._column.has_nulls:
+            result_col = result_col.astype("float64")
+        else:
+            result_col = result_col.astype("int64")
+
+        result = self._return_or_inplace(result_col, **kwargs)
+        if sub == "":
+            result[result < start] = -1
+            if end != -1:
+                result[result > end] = end
+        return result
 
     def index(self, sub, start=0, end=None, **kwargs):
         """
@@ -1569,14 +1610,25 @@ def index(self, sub, start=0, end=None, **kwargs):
         Series or Index of object
 
         """
+        if not isinstance(sub, str):
+            msg = "expected a string object, not {0}"
+            raise TypeError(msg.format(type(sub).__name__))
+
         from cudf._libxx.scalar import Scalar
 
         if end is None:
             end = -1
 
-        result = self._return_or_inplace(
-            cpp_find(self._column, Scalar(sub, "str"), start, end), **kwargs
-        )
+        if sub == "":
+            result_col = column.as_column(
+                0.0, dtype="float", length=len(self._column)
+            ).set_mask(self._column.mask)
+        else:
+            result_col = cpp_find(self._column, Scalar(sub, "str"), start, end)
+
+        result = self._return_or_inplace(result_col, **kwargs)
+        if sub == "":
+            result[self._parent.str.len() < start] = -1
 
         if (result == -1).any():
             raise ValueError("substring not found")
@@ -1606,14 +1658,27 @@ def rindex(self, sub, start=0, end=None, **kwargs):
         Series or Index of object
 
         """
+        if not isinstance(sub, str):
+            msg = "expected a string object, not {0}"
+            raise TypeError(msg.format(type(sub).__name__))
+
         from cudf._libxx.scalar import Scalar
 
         if end is None:
             end = -1
 
-        result = self._return_or_inplace(
-            cpp_rfind(self._column, Scalar(sub, "str"), start, end), **kwargs
-        )
+        if sub == "":
+            result_col = cpp_count_characters(self._column)
+        else:
+            result_col = cpp_rfind(
+                self._column, Scalar(sub, "str"), start, end
+            )
+
+        result = self._return_or_inplace(result_col, **kwargs)
+        if sub == "":
+            result[result < start] = -1
+            if end != -1:
+                result[result > end] = end
 
         if (result == -1).any():
             raise ValueError("substring not found")
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 2db7587f8ad..6e48a930d20 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1563,56 +1563,28 @@ def test_string_starts_ends(data, pat):
     ],
 )
 @pytest.mark.parametrize(
-    "sub",
-    [
-        # TODO, PREM: Uncomment after this issue is fixed
-        # '',
-        " ",
-        "a",
-        "abc",
-        "cat",
-        "$",
-        "\n",
-    ],
+    "sub", ["", " ", "a", "abc", "cat", "$", "\n"],
 )
 def test_string_find(data, sub):
     ps = pd.Series(data)
     gs = Series(data)
 
-    assert_eq(ps.str.find(sub).fillna(-1), gs.str.find(sub), check_dtype=False)
-    assert_eq(
-        ps.str.find(sub, start=1).fillna(-1),
-        gs.str.find(sub, start=1),
-        check_dtype=False,
-    )
+    assert_eq(ps.str.find(sub), gs.str.find(sub))
     assert_eq(
-        ps.str.find(sub, end=10).fillna(-1),
-        gs.str.find(sub, end=10),
-        check_dtype=False,
+        ps.str.find(sub, start=1), gs.str.find(sub, start=1),
     )
+    assert_eq(ps.str.find(sub, end=10), gs.str.find(sub, end=10))
     assert_eq(
-        ps.str.find(sub, start=2, end=10).fillna(-1),
-        gs.str.find(sub, start=2, end=10),
-        check_dtype=False,
+        ps.str.find(sub, start=2, end=10), gs.str.find(sub, start=2, end=10),
     )
 
+    assert_eq(ps.str.rfind(sub), gs.str.rfind(sub))
+    assert_eq(ps.str.rfind(sub, start=1), gs.str.rfind(sub, start=1))
     assert_eq(
-        ps.str.rfind(sub).fillna(-1), gs.str.rfind(sub), check_dtype=False
+        ps.str.rfind(sub, end=10), gs.str.rfind(sub, end=10),
     )
     assert_eq(
-        ps.str.rfind(sub, start=1).fillna(-1),
-        gs.str.rfind(sub, start=1),
-        check_dtype=False,
-    )
-    assert_eq(
-        ps.str.rfind(sub, end=10).fillna(-1),
-        gs.str.rfind(sub, end=10),
-        check_dtype=False,
-    )
-    assert_eq(
-        ps.str.rfind(sub, start=2, end=10).fillna(-1),
-        gs.str.rfind(sub, start=2, end=10),
-        check_dtype=False,
+        ps.str.rfind(sub, start=2, end=10), gs.str.rfind(sub, start=2, end=10),
     )
 
 
@@ -1630,6 +1602,7 @@ def test_string_find(data, sub):
             "+",
             ValueError,
         ),
+        (["line to be wrapped", "another line to be wrapped"], "", None),
     ],
 )
 def test_string_str_index(data, sub, er):
@@ -1668,6 +1641,7 @@ def test_string_str_index(data, sub, er):
             "+",
             ValueError,
         ),
+        (["line to be wrapped", "another line to be wrapped"], "", None),
     ],
 )
 def test_string_str_rindex(data, sub, er):

From 31b619b067bfaed75e033ebf602b7d0f3abc60d6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 13 Mar 2020 14:19:03 -0500
Subject: [PATCH 34/79] Update CHANGELOG.md

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f8498170886..4b074c2b518 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -256,7 +256,7 @@
 - PR #4480 Fix string_scalar.value to return an empty string_view for empty string-scalar
 - PR #4474 Fix to not materialize RangeIndex in copy_categories
 - PR #4494 Update Java memory event handler for new RMM resource API
-- PR #4482 Fix `.str.rsplit` & `.str.split` and enable related tests
+- PR #4482 Fix `.str.rsplit`, `.str.split`, `.str.find`, `.str.rfind`, `.str.index`, `.str.rindex` and enable related tests
 
 # cuDF 0.12.0 (04 Feb 2020)
 

From 799fd02a72ce90f979d71be23052ffa19fa68d98 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)"
 <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Fri, 13 Mar 2020 14:38:19 -0500
Subject: [PATCH 35/79] Update python/cudf/cudf/_libxx/copying.pyx

Co-Authored-By: GALI PREM SAGAR <sagarprem75@gmail.com>
---
 python/cudf/cudf/_libxx/copying.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_libxx/copying.pyx b/python/cudf/cudf/_libxx/copying.pyx
index 03d3a017137..63d8d4cbc9a 100644
--- a/python/cudf/cudf/_libxx/copying.pyx
+++ b/python/cudf/cudf/_libxx/copying.pyx
@@ -435,7 +435,7 @@ def column_split(Column input_column, object splits):
     return result
 
 
-def table_split(Table input_table, object splits, keep_index=True):
+def table_split(Table input_table, object splits, bool keep_index=True):
 
     cdef table_view input_table_view
     if keep_index is True:

From 836e34fb3043540572db70dfe847373e82fbe11d Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 13 Mar 2020 13:24:19 -0500
Subject: [PATCH 36/79] nvtext cython: uncomment code and fix cython
 declaration errors

---
 .../_libxx/cpp/nvtext/generate_ngrams.pxd     |   4 +-
 .../_libxx/cpp/nvtext/ngrams_tokenize.pxd     |   6 +-
 .../cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd  |  10 +-
 python/cudf/cudf/_libxx/nvtext/__init__.py    |   0
 .../cudf/_libxx/nvtext/generate_ngrams.pyx    |  52 ++---
 .../cudf/_libxx/nvtext/ngrams_tokenize.pyx    |  66 +++---
 python/cudf/cudf/_libxx/nvtext/normalize.pyx  |   1 -
 python/cudf/cudf/_libxx/nvtext/tokenize.pyx   | 189 ++++++++++--------
 8 files changed, 179 insertions(+), 149 deletions(-)
 delete mode 100644 python/cudf/cudf/_libxx/nvtext/__init__.py

diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
index d75acb92c71..5505fda1f7d 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/generate_ngrams.pxd
@@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
 
 from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.column.column_view cimport column_view
-from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.scalar.scalar cimport string_scalar
 from cudf._libxx.cpp.types cimport size_type
 
 cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
@@ -12,5 +12,5 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
     cdef unique_ptr[column] generate_ngrams(
         const column_view &strings,
         size_type ngrams,
-        const scalar & separator
+        const string_scalar & separator
     ) except +
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
index b34c1a2953d..3c65358d777 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/ngrams_tokenize.pxd
@@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr
 
 from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.column.column_view cimport column_view
-from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.scalar.scalar cimport string_scalar
 from cudf._libxx.cpp.types cimport size_type
 
 cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
@@ -12,6 +12,6 @@ cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
     cdef unique_ptr[column] ngrams_tokenize(
         const column_view & strings,
         size_type ngrams,
-        const scalar & delimiter,
-        const scalar & separator
+        const string_scalar & delimiter,
+        const string_scalar & separator
     ) except +
diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
index 7aa0fb2f12d..21ea6dc09ae 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
@@ -4,26 +4,26 @@ from libcpp.memory cimport unique_ptr
 
 from cudf._libxx.cpp.column.column cimport column
 from cudf._libxx.cpp.column.column_view cimport column_view
-from cudf._libxx.cpp.scalar.scalar cimport scalar
+from cudf._libxx.cpp.scalar.scalar cimport string_scalar
 
 cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] tokenize(
         const column_view & strings,
-        const scalar & delimiter
+        const string_scalar & delimiter
     ) except +
 
-    cdef unique_ptr[column] tokenize(
+    cdef unique_ptr[column] tokenize_multi "nvtext::tokenize" (
         const column_view & strings,
         const column_view & delimiters
     ) except +
 
     cdef unique_ptr[column] count_tokens(
         const column_view & strings,
-        const scalar & delimiter
+        const string_scalar & delimiter
     ) except +
 
-    cdef unique_ptr[column] count_tokens(
+    cdef unique_ptr[column] count_tokens_multi "nvtext::count_tokens" (
         const column_view & strings,
         const column_view & delimiters
     ) except +
diff --git a/python/cudf/cudf/_libxx/nvtext/__init__.py b/python/cudf/cudf/_libxx/nvtext/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
index f83262ae521..b563ce9f884 100644
--- a/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/generate_ngrams.pyx
@@ -1,32 +1,32 @@
-# # Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
 
-# from libcpp.memory cimport unique_ptr
-# from cudf._libxx.move cimport move
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
 
-# from cudf._libxx.cpp.column.column cimport column
-# from cudf._libxx.cpp.scalar.scalar cimport scalar
-# from cudf._libxx.cpp.types cimport size_type
-# from cudf._libxx.cpp.column.column_view cimport column_view
-# from cudf._libxx.cpp.nvtext.generate_ngrams cimport (
-#     generate_ngrams as cpp_generate_ngrams
-# )
-# from cudf._libxx.column cimport Column
-# from cudf._libxx.scalar cimport Scalar
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport string_scalar
+from cudf._libxx.cpp.types cimport size_type
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.generate_ngrams cimport (
+    generate_ngrams as cpp_generate_ngrams
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
 
 
-# def generate_ngrams(Column strings, int ngrams, Scalar separator):
-#     cdef column_view c_strings = strings.view()
-#     cdef size_type c_ngrams = ngrams
-#     cdef scalar* c_separator = separator.c_value.get()
-#     cdef unique_ptr[column] c_result
+def generate_ngrams(Column strings, int ngrams, Scalar separator):
+    cdef column_view c_strings = strings.view()
+    cdef size_type c_ngrams = ngrams
+    cdef string_scalar* c_separator = <string_scalar*>separator.c_value.get()
+    cdef unique_ptr[column] c_result
 
-#     with nogil:
-#         c_result = move(
-#             cpp_generate_ngrams(
-#                 c_strings,
-#                 c_ngrams,
-#                 c_separator[0]
-#             )
-#         )
+    with nogil:
+        c_result = move(
+            cpp_generate_ngrams(
+                c_strings,
+                c_ngrams,
+                c_separator[0]
+            )
+        )
 
-#     return Column.from_unique_ptr(move(c_result))
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
index 86d47382dea..f89ba1c8669 100644
--- a/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/ngrams_tokenize.pyx
@@ -1,39 +1,39 @@
-# # Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
 
-# from libcpp.memory cimport unique_ptr
-# from cudf._libxx.move cimport move
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
 
-# from cudf._libxx.cpp.column.column cimport column
-# from cudf._libxx.cpp.scalar.scalar cimport scalar
-# from cudf._libxx.cpp.types cimport size_type
-# from cudf._libxx.cpp.column.column_view cimport column_view
-# from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport (
-#     ngrams_tokenize as cpp_ngrams_tokenize
-# )
-# from cudf._libxx.column cimport Column
-# from cudf._libxx.scalar cimport Scalar
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport string_scalar
+from cudf._libxx.cpp.types cimport size_type
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.ngrams_tokenize cimport (
+    ngrams_tokenize as cpp_ngrams_tokenize
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
 
 
-# def ngrams_tokenize(
-#     Column strings,
-#     int ngrams,
-#     Scalar delimiter,
-#     Scalar separator
-# ):
-#     cdef column_view c_strings = strings.view()
-#     cdef size_type c_ngrams = ngrams
-#     cdef scalar* c_separator = separator.c_value.get()
-#     cdef scalar* c_delimiter = delimiter.c_value.get()
-#     cdef unique_ptr[column] c_result
+def ngrams_tokenize(
+    Column strings,
+    int ngrams,
+    Scalar delimiter,
+    Scalar separator
+):
+    cdef column_view c_strings = strings.view()
+    cdef size_type c_ngrams = ngrams
+    cdef string_scalar* c_separator = <string_scalar*>separator.c_value.get()
+    cdef string_scalar* c_delimiter = <string_scalar*>delimiter.c_value.get()
+    cdef unique_ptr[column] c_result
 
-#     with nogil:
-#         c_result = move(
-#             cpp_ngrams_tokenize(
-#                 c_strings,
-#                 c_ngrams,
-#                 c_delimiter[0],
-#                 c_separator[0]
-#             )
-#         )
+    with nogil:
+        c_result = move(
+            cpp_ngrams_tokenize(
+                c_strings,
+                c_ngrams,
+                c_delimiter[0],
+                c_separator[0]
+            )
+        )
 
-#     return Column.from_unique_ptr(move(c_result))
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
index 7b4432c54a4..b2654446ab9 100644
--- a/python/cudf/cudf/_libxx/nvtext/normalize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
@@ -9,7 +9,6 @@ from cudf._libxx.cpp.nvtext.normalize cimport (
     normalize_spaces as cpp_normalize_spaces
 )
 from cudf._libxx.column cimport Column
-from cudf._libxx.scalar cimport Scalar
 
 
 def normalize_spaces(Column strings, int ngrams):
diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
index e3af10e2cac..a1a3d81398d 100644
--- a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
@@ -1,79 +1,110 @@
-# # Copyright (c) 2018-2020, NVIDIA CORPORATION.
-
-# from libcpp.memory cimport unique_ptr
-# from cudf._libxx.move cimport move
-
-# from cudf._libxx.cpp.column.column cimport column
-# from cudf._libxx.cpp.scalar.scalar cimport scalar
-# from cudf._libxx.cpp.types cimport size_type
-# from cudf._libxx.cpp.column.column_view cimport column_view
-# from cudf._libxx.cpp.nvtext.tokenize cimport (
-#     tokenize as cpp_tokenize,
-#     count_tokens as cpp_count_tokens
-# )
-# from cudf._libxx.column cimport Column
-# from cudf._libxx.scalar cimport Scalar
-
-
-# def tokenize(Column strings, Scalar delimiter):
-#     cdef column_view c_strings = strings.view()
-#     cdef scalar* c_delimiter = delimiter.c_value.get()
-#     cdef unique_ptr[column] c_result
-
-#     with nogil:
-#         c_result = move(
-#             cpp_tokenize(
-#                 c_strings,
-#                 c_delimiter[0],
-#             )
-#         )
-
-#     return Column.from_unique_ptr(move(c_result))
-
-
-# def tokenize(Column strings, Column delimiters):
-#     cdef column_view c_strings = strings.view()
-#     cdef column_view c_delimiters = delimiters.view()
-#     cdef unique_ptr[column] c_result
-
-#     with nogil:
-#         c_result = move(
-#             cpp_tokenize(
-#                 c_strings,
-#                 c_delimiters
-#             )
-#         )
-
-#     return Column.from_unique_ptr(move(c_result))
-
-
-# def count_tokens(Column strings, Scalar delimiter):
-#     cdef column_view c_strings = strings.view()
-#     cdef scalar* c_delimiter = delimiter.c_value.get()
-#     cdef unique_ptr[column] c_result
-
-#     with nogil:
-#         c_result = move(
-#             cpp_count_tokens(
-#                 c_strings,
-#                 c_delimiter[0]
-#             )
-#         )
-
-#     return Column.from_unique_ptr(move(c_result))
-
-
-# def count_tokens(Column strings, Column delimiters):
-#     cdef column_view c_strings = strings.view()
-#     cdef column_view c_delimiters = delimiters.view()
-#     cdef unique_ptr[column] c_result
-
-#     with nogil:
-#         c_result = move(
-#             cpp_count_tokens(
-#                 c_strings,
-#                 c_delimiters
-#             )
-#         )
-
-#     return Column.from_unique_ptr(move(c_result))
+# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from cudf._libxx.move cimport move
+
+from cudf._libxx.cpp.column.column cimport column
+from cudf._libxx.cpp.scalar.scalar cimport string_scalar
+from cudf._libxx.cpp.types cimport size_type
+from cudf._libxx.cpp.column.column_view cimport column_view
+from cudf._libxx.cpp.nvtext.tokenize cimport (
+    tokenize as cpp_tokenize,
+    tokenize_multi as cpp_tokenize_multi,
+    count_tokens as cpp_count_tokens,
+    count_tokens_multi as cpp_count_tokens_multi,
+)
+from cudf._libxx.column cimport Column
+from cudf._libxx.scalar cimport Scalar
+
+
+def tokenize(Column strings, object delimiter):
+    if isinstance(delimiter, Scalar):
+        return _tokenize_scalar(strings, delimiter)
+
+    if isinstance(delimiter, Column):
+        return _tokenize_column(strings, delimiter)
+
+    raise TypeError(
+        "Expected a Scalar or Column for delimiters, but got {}".format(
+            type(delimiter)
+        )
+    )
+
+
+def _tokenize_scalar(Column strings, Scalar delimiter):
+
+    cdef column_view c_strings = strings.view()
+    cdef string_scalar* c_delimiter = <string_scalar*>delimiter.c_value.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_tokenize(
+                c_strings,
+                c_delimiter[0],
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def _tokenize_column(Column strings, Column delimiters):
+    cdef column_view c_strings = strings.view()
+    cdef column_view c_delimiters = delimiters.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_tokenize_multi(
+                c_strings,
+                c_delimiters
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def count_tokens(Column strings, object delimiter):
+    if isinstance(delimiter, Scalar):
+        return _count_tokens_scalar(strings, delimiter)
+
+    if isinstance(delimiter, Column):
+        return _count_tokens_column(strings, delimiter)
+
+    raise TypeError(
+        "Expected a Scalar or Column for delimiters, but got {}".format(
+            type(delimiter)
+        )
+    )
+
+
+def _count_tokens_scalar(Column strings, Scalar delimiter):
+    cdef column_view c_strings = strings.view()
+    cdef string_scalar* c_delimiter = <string_scalar*>delimiter.c_value.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_count_tokens(
+                c_strings,
+                c_delimiter[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+def _count_tokens_column(Column strings, Column delimiters):
+    cdef column_view c_strings = strings.view()
+    cdef column_view c_delimiters = delimiters.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_count_tokens_multi(
+                c_strings,
+                c_delimiters
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))

From 5066f65427fbc1b56c6b9251fb6be85ecb10447e Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 13 Mar 2020 14:47:51 -0500
Subject: [PATCH 37/79] Remove duplicate changelog entry for PR #4278

---
 CHANGELOG.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0de164a1fa1..d1035f99499 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -145,7 +145,6 @@
 - PR #4244 Port nvstrings Substring Gather/Scatter functions to cuDF Python/Cython
 - PR #4280 Port nvstrings Numeric Handling functions to cuDF Python/Cython
 - PR #4278 Port filling.pyx to libcudf++ API
-- PR #4278 Port filling.pyx to libcudf++ API
 - PR #4328 Add memory threshold callbacks for Java RMM event handler
 - PR #4336 Move a bunch of internal nvstrings code to use native StringColumns
 - PR #4166 Port `is_sorted.pyx` to use libcudf++ APIs

From cc6901fbbe445666bf6b77f15c9892ae64c7057d Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 13 Mar 2020 15:03:10 -0500
Subject: [PATCH 38/79] nvtext cython: remove explicit method names in favor of
 overload

---
 python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd | 4 ++--
 python/cudf/cudf/_libxx/nvtext/tokenize.pyx     | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
index 21ea6dc09ae..1e4ebbf3d43 100644
--- a/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
+++ b/python/cudf/cudf/_libxx/cpp/nvtext/tokenize.pxd
@@ -13,7 +13,7 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
         const string_scalar & delimiter
     ) except +
 
-    cdef unique_ptr[column] tokenize_multi "nvtext::tokenize" (
+    cdef unique_ptr[column] tokenize(
         const column_view & strings,
         const column_view & delimiters
     ) except +
@@ -23,7 +23,7 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
         const string_scalar & delimiter
     ) except +
 
-    cdef unique_ptr[column] count_tokens_multi "nvtext::count_tokens" (
+    cdef unique_ptr[column] count_tokens(
         const column_view & strings,
         const column_view & delimiters
     ) except +
diff --git a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
index a1a3d81398d..b755303e0ef 100644
--- a/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/tokenize.pyx
@@ -9,9 +9,7 @@ from cudf._libxx.cpp.types cimport size_type
 from cudf._libxx.cpp.column.column_view cimport column_view
 from cudf._libxx.cpp.nvtext.tokenize cimport (
     tokenize as cpp_tokenize,
-    tokenize_multi as cpp_tokenize_multi,
     count_tokens as cpp_count_tokens,
-    count_tokens_multi as cpp_count_tokens_multi,
 )
 from cudf._libxx.column cimport Column
 from cudf._libxx.scalar cimport Scalar
@@ -55,7 +53,7 @@ def _tokenize_column(Column strings, Column delimiters):
 
     with nogil:
         c_result = move(
-            cpp_tokenize_multi(
+            cpp_tokenize(
                 c_strings,
                 c_delimiters
             )
@@ -101,7 +99,7 @@ def _count_tokens_column(Column strings, Column delimiters):
 
     with nogil:
         c_result = move(
-            cpp_count_tokens_multi(
+            cpp_count_tokens(
                 c_strings,
                 c_delimiters
             )

From 18f2fd2ec12ab85acbe8bec7689149f5bc0924de Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 13 Mar 2020 16:08:30 -0400
Subject: [PATCH 39/79] Add workaround for all-empty strings

---
 python/cudf/cudf/core/column/string.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 3c423496fc9..f16b0a6d260 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1769,6 +1769,17 @@ def __init__(self, mask=None, size=None, offset=0, children=()):
             None, size, dtype, mask=mask, offset=offset, children=children
         )
 
+        # For an "all empty" StringColumn (e.g., [""]) libcudf still
+        # needs the chars child column pointer to be non-null:
+        if self.size:
+            if self.null_count == 0 and self.children[1].size == 0:
+                self.set_base_children(
+                    (
+                        self.base_children[0],
+                        column_empty(1, dtype=self.base_children[1].dtype),
+                    )
+                )
+
         # TODO: Remove these once NVStrings is fully deprecated / removed
         self._nvstrings = None
         self._nvcategory = None

From 3952f9a5f8f02695801cf9eeddccfbc9417c7a87 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 13 Mar 2020 16:11:12 -0400
Subject: [PATCH 40/79] Style

---
 python/cudf/cudf/core/column/numerical.py | 6 ------
 python/cudf/cudf/core/column/string.py    | 1 -
 2 files changed, 7 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 3c392e06843..b75423fef08 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -56,9 +56,6 @@ def __contains__(self, item):
             self, column.as_column([item], dtype=self.dtype)
         ).any()
 
-    def unary_operator(self, unaryop):
-        return _numeric_column_unaryop(self, op=unaryop)
-
     def binary_operator(self, binop, rhs, reflect=False):
         int_dtypes = [
             np.dtype("int8"),
@@ -86,9 +83,6 @@ def binary_operator(self, binop, rhs, reflect=False):
             lhs=self, rhs=rhs, op=binop, out_dtype=out_dtype, reflect=reflect
         )
 
-    def unary_operator(self, unaryop):
-        return _numeric_column_unaryop(self, op=unaryop)
-
     def _apply_scan_op(self, op):
         return libcudfxx.reduce.scan(op, self, True)
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index f16b0a6d260..05fed9d0d36 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -11,7 +11,6 @@
 
 import nvstrings
 
-import cudf._lib as libcudf
 import cudf._libxx as libcudfxx
 import cudf._libxx.string_casting as str_cast
 from cudf._lib.nvtx import nvtx_range_pop, nvtx_range_push

From 74e6d6a954d601fd4cdc7d45e7ef94736cffd16c Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 13 Mar 2020 16:32:11 -0400
Subject: [PATCH 41/79] Add changelog entry for PR #4503

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f79724130ab..e1e62e4bb84 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -166,6 +166,7 @@
 - PR #4316 Add Java and JNI bindings for substring expression
 - PR #4314 Add Java and JNI bindings for string contains
 - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython
+- PR #4503 Port binaryop.pyx to libcudf++ API
 
 ## Bug Fixes
 

From 07a8b545abaf5884e97a8126df84865bc67ad280 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 13 Mar 2020 16:36:00 -0400
Subject: [PATCH 42/79] Restore lost unary_operator

---
 python/cudf/cudf/core/column/numerical.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b75423fef08..6be5408e2ac 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -56,6 +56,9 @@ def __contains__(self, item):
             self, column.as_column([item], dtype=self.dtype)
         ).any()
 
+    def unary_operator(self, unaryop):
+        return _numeric_column_unaryop(self, op=unaryop)
+
     def binary_operator(self, binop, rhs, reflect=False):
         int_dtypes = [
             np.dtype("int8"),

From ca17bc6c4d0cc3685b6318fbdac47f37e434dd70 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 13 Mar 2020 16:36:46 -0400
Subject: [PATCH 43/79] Remove legacy binops Cython bindings

---
 python/cudf/cudf/_lib/binops.pyx          | 195 ----------------------
 python/cudf/cudf/_lib/includes/binops.pxd |  67 --------
 2 files changed, 262 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/binops.pyx
 delete mode 100644 python/cudf/cudf/_lib/includes/binops.pxd

diff --git a/python/cudf/cudf/_lib/binops.pyx b/python/cudf/cudf/_lib/binops.pyx
deleted file mode 100644
index ac7c8240d03..00000000000
--- a/python/cudf/cudf/_lib/binops.pyx
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
-
-from cudf._lib.cudf cimport *
-from cudf._lib.cudf import *
-from cudf._lib.GDFError import GDFError
-from libcpp.vector cimport vector
-from libc.stdlib cimport free
-
-from libcpp.string cimport string
-
-import rmm
-
-from cudf._lib.includes.binops cimport *
-
-
-_BINARY_OP = {
-    'add': GDF_ADD,
-    'sub': GDF_SUB,
-    'mul': GDF_MUL,
-    'div': GDF_DIV,
-    'truediv': GDF_TRUE_DIV,
-    'floordiv': GDF_FLOOR_DIV,
-    'mod': GDF_PYMOD,
-    'pow': GDF_POW,
-    'eq': GDF_EQUAL,
-    'ne': GDF_NOT_EQUAL,
-    'lt': GDF_LESS,
-    'gt': GDF_GREATER,
-    'le': GDF_LESS_EQUAL,
-    'ge': GDF_GREATER_EQUAL,
-    'and': GDF_BITWISE_AND,
-    'or': GDF_BITWISE_OR,
-    'xor': GDF_BITWISE_XOR,
-    'l_and': GDF_LOGICAL_AND,
-    'l_or': GDF_LOGICAL_OR,
-}
-
-cdef apply_op_v_v(gdf_column* c_lhs, gdf_column* c_rhs, gdf_column* c_out, op):
-    """
-    Call gdf binary ops between two columns.
-    """
-
-    cdef gdf_binary_operator c_op = _BINARY_OP[op]
-    with nogil:
-        binary_operation(
-            <gdf_column*>c_out,
-            <gdf_column*>c_lhs,
-            <gdf_column*>c_rhs,
-            c_op)
-
-    cdef int nullct = c_out[0].null_count
-
-    return nullct
-
-cdef apply_op_v_s(gdf_column* c_lhs, gdf_scalar* c_rhs, gdf_column* c_out, op):
-    """
-    Call gdf binary ops between a column and a scalar.
-    """
-
-    cdef gdf_binary_operator c_op = _BINARY_OP[op]
-    with nogil:
-        binary_operation(
-            <gdf_column*>c_out,
-            <gdf_column*>c_lhs,
-            <gdf_scalar*>c_rhs,
-            c_op)
-
-    cdef int nullct = c_out[0].null_count
-
-    return nullct
-
-
-cdef apply_op_s_v(gdf_scalar* c_lhs, gdf_column* c_rhs, gdf_column* c_out, op):
-    """
-    Call gdf binary ops between a scalar and a column.
-    """
-
-    cdef gdf_binary_operator c_op = _BINARY_OP[op]
-    with nogil:
-        binary_operation(
-            <gdf_column*>c_out,
-            <gdf_scalar*>c_lhs,
-            <gdf_column*>c_rhs,
-            c_op)
-
-    cdef int nullct = c_out[0].null_count
-
-    return nullct
-
-
-def apply_op(lhs, rhs, out, op):
-    """
-    Dispatches a binary op call to the appropriate libcudf function:
-    """
-    check_gdf_compatibility(out)
-    cdef gdf_column* c_lhs = NULL
-    cdef gdf_column* c_rhs = NULL
-    cdef gdf_scalar* c_scalar = NULL
-    cdef gdf_column* c_out = column_view_from_column(out)
-
-    if np.isscalar(lhs):
-        check_gdf_compatibility(rhs)
-        c_rhs = column_view_from_column(rhs)
-        c_scalar = gdf_scalar_from_scalar(lhs)
-        nullct = apply_op_s_v(
-            <gdf_scalar*> c_scalar,
-            <gdf_column*> c_rhs,
-            <gdf_column*> c_out,
-            op
-        )
-    elif lhs is None:
-        check_gdf_compatibility(rhs)
-        c_rhs = column_view_from_column(rhs)
-        c_scalar = gdf_scalar_from_scalar(lhs, rhs.dtype)
-        nullct = apply_op_s_v(
-            <gdf_scalar*> c_scalar,
-            <gdf_column*> c_rhs,
-            <gdf_column*> c_out,
-            op
-        )
-
-    elif np.isscalar(rhs):
-        check_gdf_compatibility(lhs)
-        c_lhs = column_view_from_column(lhs)
-        c_scalar = gdf_scalar_from_scalar(rhs)
-        nullct = apply_op_v_s(
-            <gdf_column*> c_lhs,
-            <gdf_scalar*> c_scalar,
-            <gdf_column*> c_out,
-            op
-        )
-
-    elif rhs is None:
-        check_gdf_compatibility(lhs)
-        c_lhs = column_view_from_column(lhs)
-        c_scalar = gdf_scalar_from_scalar(rhs, lhs.dtype)
-        nullct = apply_op_v_s(
-            <gdf_column*> c_lhs,
-            <gdf_scalar*> c_scalar,
-            <gdf_column*> c_out,
-            op
-        )
-
-    else:
-        check_gdf_compatibility(lhs)
-        check_gdf_compatibility(rhs)
-        c_lhs = column_view_from_column(lhs)
-        c_rhs = column_view_from_column(rhs)
-
-        nullct = apply_op_v_v(
-            <gdf_column*>c_lhs,
-            <gdf_column*>c_rhs,
-            <gdf_column*>c_out,
-            op
-        )
-
-    free(c_scalar)
-    free_column(c_lhs)
-    free_column(c_rhs)
-    free_column(c_out)
-
-    return nullct
-
-
-def apply_op_udf(lhs, rhs, udf_ptx, np_dtype):
-    """
-    Apply a user-defined binary operator (a UDF) defined in `udf_ptx` on
-    the two input columns `lhs` and `rhs`. The output type of the UDF
-    has to be specified in `np_dtype`, a numpy data type.
-    Currently ONLY int32, int64, float32 and float64 are supported.
-    """
-    check_gdf_compatibility(lhs)
-    check_gdf_compatibility(rhs)
-    cdef gdf_column* c_lhs = column_view_from_column(lhs)
-    cdef gdf_column* c_rhs = column_view_from_column(rhs)
-
-    # get the gdf_type related to the input np type
-    cdef gdf_dtype g_type = dtypes[np_dtype]
-
-    cdef string cpp_str = udf_ptx.encode("UTF-8")
-
-    cdef gdf_column c_out_col
-
-    with nogil:
-        c_out_col = binary_operation(
-            <gdf_column>c_lhs[0],
-            <gdf_column>c_rhs[0],
-            cpp_str,
-            g_type
-        )
-
-    free_column(c_lhs)
-    free_column(c_rhs)
-
-    return gdf_column_to_column(&c_out_col)
diff --git a/python/cudf/cudf/_lib/includes/binops.pxd b/python/cudf/cudf/_lib/includes/binops.pxd
deleted file mode 100644
index 3c462bb4048..00000000000
--- a/python/cudf/cudf/_lib/includes/binops.pxd
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
-
-from cudf._lib.cudf cimport *
-
-from libcpp.string cimport string
-
-cdef extern from "cudf/legacy/binaryop.hpp" nogil:
-
-    ctypedef enum gdf_binary_operator:
-        GDF_ADD,
-        GDF_SUB,
-        GDF_MUL,
-        GDF_DIV,
-        GDF_TRUE_DIV,
-        GDF_FLOOR_DIV,
-        GDF_MOD,
-        GDF_PYMOD,
-        GDF_POW,
-        GDF_EQUAL,
-        GDF_NOT_EQUAL,
-        GDF_LESS,
-        GDF_GREATER,
-        GDF_LESS_EQUAL,
-        GDF_GREATER_EQUAL,
-        GDF_BITWISE_AND,
-        GDF_BITWISE_OR,
-        GDF_BITWISE_XOR,
-        GDF_LOGICAL_AND,
-        GDF_LOGICAL_OR,
-        GDF_INVALID_BINARY
-
-cdef extern from "cudf/legacy/binaryop.hpp" namespace "cudf" nogil:
-
-    cdef void binary_operation(
-        gdf_column* out,
-        gdf_scalar* lhs,
-        gdf_column* rhs,
-        gdf_binary_operator ope
-    ) except +
-
-    cdef void binary_operation(
-        gdf_column* out,
-        gdf_column* lhs,
-        gdf_scalar* rhs,
-        gdf_binary_operator ope
-    ) except +
-
-    cdef void binary_operation(
-        gdf_column* out,
-        gdf_column* lhs,
-        gdf_column* rhs,
-        gdf_binary_operator ope
-    ) except +
-
-    cdef void binary_operation(
-        gdf_column* out,
-        gdf_column* lhs,
-        gdf_column* rhs,
-        const string& ptx
-    ) except +
-
-    cdef gdf_column binary_operation(
-        const gdf_column& lhs,
-        const gdf_column& rhs,
-        const string& ptx,
-        gdf_dtype output_type
-    ) except +

From 5c846b20f3fcd50c5d97a3de191d89e9d81d16c7 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Fri, 13 Mar 2020 15:49:45 -0500
Subject: [PATCH 44/79] adding _num_rows property to RangeIndex

---
 python/cudf/cudf/core/index.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index ec3fb4336ef..0a9f2e84483 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -495,6 +495,10 @@ def name(self, value):
     def _num_columns(self):
         return 1
 
+    @property
+    def _num_rows(self):
+        return len(self)
+
     @cached_property
     def _values(self):
         if len(self) > 0:

From 3f10098ae94b87e8d6abe33dadf8d973a8cfa3f7 Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Fri, 13 Mar 2020 14:05:33 -0700
Subject: [PATCH 45/79] check for zero length frames when serializing cuda
 buffers

---
 python/cudf/cudf/comm/serialize.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py
index cff0fce550a..9fe5a55c0d0 100644
--- a/python/cudf/cudf/comm/serialize.py
+++ b/python/cudf/cudf/comm/serialize.py
@@ -40,9 +40,10 @@ def dask_serialize_cudf_object(x):
     def deserialize_cudf_object(header, frames):
         with log_errors():
             if header["serializer"] == "cuda":
-                assert all(
-                    hasattr(f, "__cuda_array_interface__") for f in frames
-                )
+                for f in frames:
+                    # some frames are empty -- meta/empty partitions/etc
+                    if len(f) > 0:
+                        assert hasattr(f, "__cuda_array_interface__")
             if header["serializer"] == "dask":
                 frames = [memoryview(f) for f in frames]
 

From f5adf23a8615472063a130557141dc16af3450c7 Mon Sep 17 00:00:00 2001
From: Benjamin Zaitlen <quasiben@gmail.com>
Date: Fri, 13 Mar 2020 14:14:44 -0700
Subject: [PATCH 46/79] Add changelog entry for PR #4505

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09ed8f3bde1..a1fbe82bcef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -259,6 +259,7 @@
 - PR #4474 Fix to not materialize RangeIndex in copy_categories
 - PR #4496 Skip tests which require 2+ GPUs
 - PR #4494 Update Java memory event handler for new RMM resource API
+- PR #4505 Fix 0 length buffers during serialization
 
 
 # cuDF 0.12.0 (04 Feb 2020)

From 6d631de2afbab9990e34baea856566dca93dada0 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Fri, 13 Mar 2020 18:19:44 -0500
Subject: [PATCH 47/79] Avoid scalar broadcast in DataFrame.__setitem__

---
 python/cudf/cudf/_libxx/copying.pyx |  2 +-
 python/cudf/cudf/core/dataframe.py  | 60 +++++++++++++++++------------
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/_libxx/copying.pyx b/python/cudf/cudf/_libxx/copying.pyx
index 995abc43295..32a82ae5322 100644
--- a/python/cudf/cudf/_libxx/copying.pyx
+++ b/python/cudf/cudf/_libxx/copying.pyx
@@ -315,7 +315,7 @@ def table_empty_like(Table input_table):
     return Table.from_unique_ptr(
         move(c_result),
         column_names=input_table._column_names,
-        index_names=input_table._index._column_names
+        index_names=input_table._index_names
     )
 
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 112975a7ac4..d0acd24a369 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -501,6 +501,7 @@ def mask(self, other):
     def __setitem__(self, arg, value):
         """Add/set column by *arg or DataFrame*
         """
+        print("__setitem__ is being called ___________________")
         if isinstance(arg, DataFrame):
             # not handling set_item where arg = df & value = df
             if isinstance(value, DataFrame):
@@ -513,10 +514,12 @@ def __setitem__(self, arg, value):
                 for col_name in self._data:
                     scatter_map = arg[col_name]
                     if is_scalar(value):
-                        value = utils.scalar_broadcast_to(value, len(self))
-                    self._data[col_name][scatter_map] = column.as_column(
-                        value
-                    )[scatter_map]
+                        self._data[col_name][scatter_map] = value
+                    else:
+
+                        self._data[col_name][scatter_map] = column.as_column(
+                            value
+                        )[scatter_map]
         elif is_scalar(arg) or isinstance(arg, tuple):
             if isinstance(value, DataFrame):
                 _setitem_with_dataframe(
@@ -527,30 +530,34 @@ def __setitem__(self, arg, value):
                 )
             else:
                 if arg in self._data:
-                    if is_scalar(value):
-                        value = utils.scalar_broadcast_to(value, len(self))
                     if len(self) == 0:
-                        if isinstance(value, (pd.Series, Series)):
-                            self._index = as_index(value.index)
-                        elif len(value) > 0:
-                            self._index = RangeIndex(start=0, stop=len(value))
                         value = column.as_column(value)
                         new_data = self._data.__class__()
                         for key in self._data:
-                            if key == arg:
+                            if key in arg or key == arg:
                                 new_data[key] = value
                             else:
                                 new_data[key] = column.column_empty_like(
-                                    self._data[key],
-                                    masked=True,
-                                    newsize=len(value),
+                                            self._data[key],
+                                            masked=True,
+                                            newsize=len(value),
                                 )
+
                         self._data = new_data
+                        return
                     elif isinstance(value, (pd.Series, Series)):
                         value = Series(value)._align_to_index(
                             self._index, how="right", allow_non_unique=True
                         )
-                    self._data[arg] = column.as_column(value)
+                    if is_scalar(arg):
+                        arg=[arg]
+                    if is_scalar(value):
+                        for key in arg:
+                            self._data[key][:] = value
+                    else:
+                        value = as_column(value)
+                        for key in arg:
+                            self._data[key] = value
                 else:
                     # disc. with pandas here
                     # pandas raises key error here
@@ -562,13 +569,10 @@ def __setitem__(self, arg, value):
             mask = arg
             if isinstance(mask, list):
                 mask = np.array(mask)
-
-            if is_scalar(value):
-                value = column.as_column(
-                    utils.scalar_broadcast_to(value, len(self))
-                )
-
+            mask = np.array(arg)
             if mask.dtype == "bool":
+                mask = column.as_column(arg)
+
                 if isinstance(value, DataFrame):
                     _setitem_with_dataframe(
                         input_df=self,
@@ -577,10 +581,10 @@ def __setitem__(self, arg, value):
                         mask=mask,
                     )
                 else:
+                    if not is_scalar(value):
+                        value = column.as_column(value)[mask]
                     for col_name in self._data:
-                        self._data[col_name][mask] = column.as_column(value)[
-                            mask
-                        ]
+                        self._data[col_name][mask] = value
             else:
                 if isinstance(value, DataFrame):
                     _setitem_with_dataframe(
@@ -590,11 +594,17 @@ def __setitem__(self, arg, value):
                         mask=None,
                     )
                 else:
+                    if not is_scalar(value):
+                        value = column.as_column(value)
                     for col in arg:
                         # we will raise a key error if col not in dataframe
                         # this behavior will make it
                         # consistent to pandas >0.21.0
-                        self._data[col] = column.as_column(value)
+                        if not is_scalar(value):
+                            self._data[col] = value
+                        else:
+                            self._data[col][:] = value
+
         else:
             msg = "__setitem__ on type {!r} is not supported"
             raise TypeError(msg.format(type(arg)))

From 8dcd79b7087ea5045369cba53a554afaa35b4d26 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Fri, 13 Mar 2020 19:34:54 -0500
Subject: [PATCH 48/79] nvtext cython: add nvtext methods to `StringMethods`.

---
 python/cudf/cudf/_libxx/nvtext/normalize.pyx |  2 +-
 python/cudf/cudf/core/column/string.py       | 67 ++++++++++++++++++--
 2 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/_libxx/nvtext/normalize.pyx b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
index b2654446ab9..e8817495a81 100644
--- a/python/cudf/cudf/_libxx/nvtext/normalize.pyx
+++ b/python/cudf/cudf/_libxx/nvtext/normalize.pyx
@@ -11,7 +11,7 @@ from cudf._libxx.cpp.nvtext.normalize cimport (
 from cudf._libxx.column cimport Column
 
 
-def normalize_spaces(Column strings, int ngrams):
+def normalize_spaces(Column strings):
     cdef column_view c_strings = strings.view()
     cdef unique_ptr[column] c_result
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 24984aadd4e..590902fd95c 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1172,11 +1172,6 @@ def rjust(self, width, fillchar=" ", **kwargs):
             with fillchar.
 
         fillchar : str, default ' ' (whitespace)
-            Additional character for filling, default is whitespace.
-
-        Returns
-        -------
-        Series/Index of str dtype
             Returns Series or Index.
 
         """
@@ -1735,6 +1730,68 @@ def translate(self, table, **kwargs):
             cpp_translate(self._column, table), **kwargs
         )
 
+    def normalize_spaces(self):
+        return libcudfxx.nvtext.normalize_spaces(self._column)
+
+    def tokenize(self, delimiter=None):
+        delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
+        return libcudfxx.nvtext.tokenize(self._column, delimiter)
+
+    def token_count(self, delimiter=None):
+        delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
+        return libcudfxx.nvtext.count_tokens(self._column, delimiter)
+
+    def ngrams(self, ngrams=2, separator="_"):
+        separator = _massage_string_arg(separator, "separator")
+        return libcudfxx.nvtext.ngrams(self._column, ngrams, separator)
+
+    def ngrams_tokenize(self, ngrams=2, delimiter=" ", separator="_"):
+        delimiter = _massage_string_arg(delimiter, "delimiter")
+        separator = _massage_string_arg(separator, "separator")
+
+        return libcudfxx.nvtext.ngrams_tokenize(
+            self._column, ngrams, delimiter, separator
+        )
+
+
+def _massage_string_arg(value, name, allow_col=False):
+    from cudf._libxx.scalar import Scalar
+    from cudf._libxx.column import Column
+    from cudf.utils.dtypes import is_string_dtype
+
+    allowed_types = ["Scalar"]
+
+    if isinstance(value, str):
+        return Scalar(value, dtype="str")
+
+    if isinstance(value, Scalar) and is_string_dtype(value.dtype):
+        return value
+
+    if allow_col:
+        allowed_types += ["Column"]
+
+        if isinstance(value, list):
+            return column.as_column(value, dtype="str")
+
+        if isinstance(value, Column) and is_string_dtype(value.dtype):
+            return value
+
+    raise ValueError(
+        "Expected {} for {} but got {}".format(
+            _expected_types_format(allowed_types), name, type(value)
+        )
+    )
+
+
+def _expected_types_format(types):
+    if len(types) == 0:
+        raise ValueError
+
+    if len(types) == 1:
+        return types[0]
+
+    return ", ".join(types[:-1]) + ", or " + types[-1]
+
 
 class StringColumn(column.ColumnBase):
     """Implements operations for Columns of String type

From d0736567e553dce67846180b97108c61e98d7490 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 14 Mar 2020 07:19:03 -0400
Subject: [PATCH 49/79] Stale import

---
 python/cudf/cudf/_lib/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 472a36064f6..e0beeeec493 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -2,7 +2,6 @@
 
 from . import (
     avro,
-    binops,
     concat,
     copying,
     csv,

From 1d55fd700655566d2d2cf29d82f68786a4f2ee19 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 14 Mar 2020 07:19:10 -0400
Subject: [PATCH 50/79] Raise on unsupported datetime binop

---
 python/cudf/cudf/core/column/datetime.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index a36ed8e4bb6..89dffe17f78 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -213,7 +213,13 @@ def default_na_value(self):
 
     def binary_operator(self, op, rhs, reflect=False):
         lhs, rhs = self, rhs
-        return binop(lhs, rhs, op=op, out_dtype=np.bool)
+
+        if op in ("eq", "ne", "lt", "gt", "le", "ge"):
+            out_dtype = np.bool
+        else:
+            raise TypeError(f"Series of dtype {self.dtype} cannot perform "
+                            f" the operation {op}")
+        return binop(lhs, rhs, op=op, out_dtype=out_dtype)
 
     def fillna(self, fill_value):
         if is_scalar(fill_value):

From 48b5f8c07dbe3d7550874274f346aa3941273f01 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 14 Mar 2020 07:19:22 -0400
Subject: [PATCH 51/79] Fix logic for empty string handling

---
 python/cudf/cudf/core/column/string.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 05fed9d0d36..a540dab0587 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1771,13 +1771,12 @@ def __init__(self, mask=None, size=None, offset=0, children=()):
         # For an "all empty" StringColumn (e.g., [""]) libcudf still
         # needs the chars child column pointer to be non-null:
         if self.size:
-            if self.null_count == 0 and self.children[1].size == 0:
-                self.set_base_children(
-                    (
-                        self.base_children[0],
-                        column_empty(1, dtype=self.base_children[1].dtype),
-                    )
+            if self.children[1].size == 0 and self.null_count != self.size:
+                offsets = self.base_children[0]
+                chars = column_empty(
+                    self.base_children[1].size + 1, dtype="int8"
                 )
+                self.set_base_children((offsets, chars))
 
         # TODO: Remove these once NVStrings is fully deprecated / removed
         self._nvstrings = None

From ba4269e3f1e4f8dfe2535775985ced73994f13bc Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 14 Mar 2020 07:19:32 -0400
Subject: [PATCH 52/79] Remove function introduced by bad merge

---
 python/cudf/cudf/core/column/string.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a540dab0587..341a85add9d 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2065,9 +2065,6 @@ def deserialize(cls, header, frames):
         )
         return col
 
-    def copy(self, deep=True):
-        return column.as_column(self.nvstrings.copy())
-
     def find_and_replace(self, to_replace, replacement, all_nan):
         """
         Return col with *to_replace* replaced with *value*

From 956d2b1d64e4739db6f8ac749c92d18a2bbc87de Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 14 Mar 2020 07:19:50 -0400
Subject: [PATCH 53/79] Remove use of legacy bindings

---
 python/cudf/cudf/utils/applyutils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py
index 7a0ddc9b6f5..0b5614e5c84 100644
--- a/python/cudf/cudf/utils/applyutils.py
+++ b/python/cudf/cudf/utils/applyutils.py
@@ -6,7 +6,7 @@
 from numba import cuda, six
 from numba.utils import exec_, pysignature
 
-import cudf._lib as libcudf
+import cudf._libxx as libcudfxx
 from cudf.core.column import column
 from cudf.core.series import Series
 from cudf.utils import utils
@@ -116,8 +116,8 @@ def make_aggregate_nullmask(df, columns=None, op="and"):
             )
             continue
 
-        libcudf.binops.apply_op(
-            column.as_column(nullmask), out_mask, out_mask, op
+        out_mask = libcudfxx.binaryop.binaryop(
+            column.as_column(nullmask), out_mask, op, out_mask.dtype
         )
 
     return out_mask

From f61c713aa3d45f5fc6b3e4d16e6473238328b971 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Sat, 14 Mar 2020 10:28:43 -0400
Subject: [PATCH 54/79] Black

---
 python/cudf/cudf/core/column/datetime.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 89dffe17f78..5639e1dabe4 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -217,8 +217,10 @@ def binary_operator(self, op, rhs, reflect=False):
         if op in ("eq", "ne", "lt", "gt", "le", "ge"):
             out_dtype = np.bool
         else:
-            raise TypeError(f"Series of dtype {self.dtype} cannot perform "
-                            f" the operation {op}")
+            raise TypeError(
+                f"Series of dtype {self.dtype} cannot perform "
+                f" the operation {op}"
+            )
         return binop(lhs, rhs, op=op, out_dtype=out_dtype)
 
     def fillna(self, fill_value):

From 08d892c0600d7a5d297236f9c4b8f623f8b70736 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Sat, 14 Mar 2020 11:34:35 -0500
Subject: [PATCH 55/79] nvtext cython: unit tests

---
 python/cudf/cudf/_libxx/__init__.py    |   1 +
 python/cudf/cudf/core/column/string.py |  38 +++---
 python/cudf/cudf/tests/test_text.py    | 162 +++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 15 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_text.py

diff --git a/python/cudf/cudf/_libxx/__init__.py b/python/cudf/cudf/_libxx/__init__.py
index 9e29af03850..9e95af0963d 100644
--- a/python/cudf/cudf/_libxx/__init__.py
+++ b/python/cudf/cudf/_libxx/__init__.py
@@ -12,6 +12,7 @@
     join,
     merge,
     null_mask,
+    nvtext,
     orc,
     quantiles,
     reduce,
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 83f5a631501..09506e481e2 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -15,6 +15,19 @@
 import cudf._libxx as libcudfxx
 import cudf._libxx.string_casting as str_cast
 from cudf._lib.nvtx import nvtx_range_pop, nvtx_range_push
+from cudf._libxx.nvtext.generate_ngrams import (
+    generate_ngrams as cpp_generate_ngrams,
+)
+from cudf._libxx.nvtext.ngrams_tokenize import (
+    ngrams_tokenize as cpp_ngrams_tokenize,
+)
+from cudf._libxx.nvtext.normalize import (
+    normalize_spaces as cpp_normalize_spaces,
+)
+from cudf._libxx.nvtext.tokenize import (
+    count_tokens as cpp_count_tokens,
+    tokenize as cpp_tokenize,
+)
 from cudf._libxx.strings.attributes import (
     code_points as cpp_code_points,
     count_characters as cpp_count_characters,
@@ -815,10 +828,7 @@ def split(self, pat=None, n=-1, expand=True, **kwargs):
             String to split on, does not yet support regular expressions.
         n : int, default -1 (all)
             Limit number of splits in output. `None`, 0, and -1 will all be
-            interpreted as "all splits".
-
-        Returns
-        -------
+            interpreted as "all splits".libcudfxx.nvtext.tokenize
         DataFrame
             Returns a DataFrame with each split as a column.
 
@@ -1782,27 +1792,25 @@ def translate(self, table, **kwargs):
         )
 
     def normalize_spaces(self):
-        return libcudfxx.nvtext.normalize_spaces(self._column)
+        return cpp_normalize_spaces(self._column)
 
-    def tokenize(self, delimiter=None):
+    def tokenize(self, delimiter=" "):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        return libcudfxx.nvtext.tokenize(self._column, delimiter)
+        return cpp_tokenize(self._column, delimiter)
 
-    def token_count(self, delimiter=None):
+    def token_count(self, delimiter=" "):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        return libcudfxx.nvtext.count_tokens(self._column, delimiter)
+        return cpp_count_tokens(self._column, delimiter)
 
-    def ngrams(self, ngrams=2, separator="_"):
+    def ngrams(self, n=2, separator="_"):
         separator = _massage_string_arg(separator, "separator")
-        return libcudfxx.nvtext.ngrams(self._column, ngrams, separator)
+        return cpp_generate_ngrams(self._column, n, separator)
 
-    def ngrams_tokenize(self, ngrams=2, delimiter=" ", separator="_"):
+    def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"):
         delimiter = _massage_string_arg(delimiter, "delimiter")
         separator = _massage_string_arg(separator, "separator")
 
-        return libcudfxx.nvtext.ngrams_tokenize(
-            self._column, ngrams, delimiter, separator
-        )
+        return cpp_ngrams_tokenize(self._column, n, delimiter, separator)
 
 
 def _massage_string_arg(value, name, allow_col=False):
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
new file mode 100644
index 00000000000..16523054c17
--- /dev/null
+++ b/python/cudf/cudf/tests/test_text.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.
+
+import pytest
+from pandas.util.testing import assert_series_equal
+
+import cudf
+
+
+def test_tokenize():
+    strings = cudf.Series(
+        [
+            "the quick fox jumped over the lazy dog",
+            "the siamésé cat jumped under the sofa",
+            None,
+            "",
+        ]
+    )
+
+    expected = cudf.Series(
+        [
+            "the",
+            "quick",
+            "fox",
+            "jumped",
+            "over",
+            "the",
+            "lazy",
+            "dog",
+            "the",
+            "siamésé",
+            "cat",
+            "jumped",
+            "under",
+            "the",
+            "sofa",
+        ]
+    )
+
+    actual = strings.str.tokenize()
+
+    assert_series_equal(expected.to_pandas(), actual.to_pandas())
+
+
+@pytest.mark.parametrize(
+    "delimiter, expected_token_counts",
+    [
+        (" ", [10, 9, 0, 0, 1]),  # TODO: verify last count should be 1, not 5
+        ("o", [6, 3, 0, 0, 1]),
+        (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]),
+        (["a", "e", "i", "o"], [12, 11, 0, 0, 6]),
+        # ([], [10, 9, 0, 0, 5]), # throws
+    ],
+)
+def test_token_count(delimiter, expected_token_counts):
+    strings = cudf.Series(
+        [
+            "the quick brown fox jumped over the lazy brown dog",
+            "the sable siamésé cat jumped under the brown sofa",
+            None,
+            "",
+            "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05",
+        ]
+    )
+
+    expected = cudf.Series(expected_token_counts)
+
+    actual = strings.str.token_count(delimiter)
+
+    assert_series_equal(
+        expected.to_pandas(), actual.to_pandas(), check_dtype=False
+    )
+
+
+def test_normalize_spaces():
+    strings = cudf.Series(
+        [
+            " the\t quick fox  jumped over the lazy dog",
+            "the siamésé cat\f jumped\t\tunder the sofa  ",
+            None,
+            "",
+        ]
+    )
+    expected = cudf.Series(
+        [
+            "the quick fox jumped over the lazy dog",
+            "the siamésé cat jumped under the sofa",
+            None,
+            "",
+        ]
+    )
+
+    actual = strings.str.normalize_spaces()
+
+    assert_series_equal(expected.to_pandas(), actual.to_pandas())
+
+
+@pytest.mark.parametrize(
+    "n, separator, expected_values",
+    [
+        (
+            2,
+            "_",
+            [
+                "this_is",
+                "is_my",
+                "my_favorite",
+                "favorite_book",
+                "book_on",
+                "on_my",
+                "my_bookshelf",
+            ],
+        ),
+        (
+            3,
+            "-",
+            [
+                "this-is-my",
+                "is-my-favorite",
+                "my-favorite-book",
+                "favorite-book-on",
+                "book-on-my",
+                "on-my-bookshelf",
+            ],
+        ),
+    ],
+)
+def test_ngrams(n, separator, expected_values):
+    strings = cudf.Series(
+        ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"]
+    )
+    expected = cudf.Series(expected_values)
+    actual = strings.str.ngrams(n=n, separator=separator)
+    assert_series_equal(expected.to_pandas(), actual.to_pandas())
+
+
+@pytest.mark.parametrize(
+    "n, separator, expected_values",
+    [
+        (
+            2,
+            "_",
+            [
+                "this_is",
+                "is_my",
+                "my_favorite",
+                "book_on",
+                "on_my",
+                "my_bookshelf",
+            ],
+        ),
+        (
+            3,
+            "-",
+            ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"],
+        ),
+    ],
+)
+def test_ngrams_tokenize(n, separator, expected_values):
+    strings = cudf.Series(["this is my favorite", "book on my bookshelf"])
+    expected = cudf.Series(expected_values)
+    actual = strings.str.ngrams_tokenize(n=n, separator=separator)
+    assert_series_equal(expected.to_pandas(), actual.to_pandas())

From d224fc1bec5fa1a3251333710ca5faecff7e7af3 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Sat, 14 Mar 2020 11:37:40 -0500
Subject: [PATCH 56/79] string.py: add-back accidentally deleted lines.

---
 python/cudf/cudf/core/column/string.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 09506e481e2..f935a2a9f1f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -828,7 +828,10 @@ def split(self, pat=None, n=-1, expand=True, **kwargs):
             String to split on, does not yet support regular expressions.
         n : int, default -1 (all)
             Limit number of splits in output. `None`, 0, and -1 will all be
-            interpreted as "all splits".libcudfxx.nvtext.tokenize
+            interpreted as "all splits".
+
+        Returns
+        -------
         DataFrame
             Returns a DataFrame with each split as a column.
 
@@ -1150,6 +1153,11 @@ def rjust(self, width, fillchar=" ", **kwargs):
             with fillchar.
 
         fillchar : str, default ' ' (whitespace)
+            Additional character for filling, default is whitespace.
+
+        Returns
+        -------
+        Series/Index of str dtype
             Returns Series or Index.
 
         """

From 9dc0b3b7b8d9d98b9b082933be8e3a849c1cb41e Mon Sep 17 00:00:00 2001
From: Dave Baranec <dbaranec@nvidia.com>
Date: Tue, 10 Mar 2020 12:50:41 -0500
Subject: [PATCH 57/79] Make scalar destructor virtual to ensure all derived
 class destructors properly get called.

---
 cpp/include/cudf/scalar/scalar.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index 9d9f5fc98fb..dbd4e0a52cc 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -40,7 +40,7 @@ namespace cudf {
  */
 class scalar {
  public:
-  ~scalar() = default;
+  virtual ~scalar() = default;
   scalar(scalar&& other) = default;
   scalar(scalar const& other) = default;
   scalar& operator=(scalar const& other) = delete;

From bfffdcd9467ceae0e240ab603e936c5ee7421d44 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Sat, 14 Mar 2020 13:28:46 -0500
Subject: [PATCH 58/79] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7bfc58c5222..ba6bc22b152 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -262,6 +262,7 @@
 - PR #4494 Update Java memory event handler for new RMM resource API
 - PR #4505 Fix 0 length buffers during serialization
 - PR #4482 Fix `.str.rsplit`, `.str.split`, `.str.find`, `.str.rfind`, `.str.index`, `.str.rindex` and enable related tests
+- PR #4513 Backport scalar virtual destructor fix
 
 
 # cuDF 0.12.0 (04 Feb 2020)

From 943aa0f11f5145d2d48f9fee780d9f4e91dd8701 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Sat, 14 Mar 2020 16:22:45 -0500
Subject: [PATCH 59/79] strings.py: return correct subclass from nvtext methods

---
 python/cudf/cudf/core/column/string.py | 28 +++++++++++++++++---------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index f935a2a9f1f..df765e397a9 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1799,26 +1799,34 @@ def translate(self, table, **kwargs):
             cpp_translate(self._column, table), **kwargs
         )
 
-    def normalize_spaces(self):
-        return cpp_normalize_spaces(self._column)
+    def normalize_spaces(self, **kwargs):
+        return self._return_or_inplace(
+            cpp_normalize_spaces(self._column), **kwargs
+        )
 
-    def tokenize(self, delimiter=" "):
+    def tokenize(self, delimiter=" ", **kwargs):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        return cpp_tokenize(self._column, delimiter)
+        return self._return_or_inplace(cpp_tokenize(self._column, delimiter))
 
-    def token_count(self, delimiter=" "):
+    def token_count(self, delimiter=" ", **kwargs):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        return cpp_count_tokens(self._column, delimiter)
+        return self._return_or_inplace(
+            cpp_count_tokens(self._column, delimiter)
+        )
 
-    def ngrams(self, n=2, separator="_"):
+    def ngrams(self, n=2, separator="_", **kwargs):
         separator = _massage_string_arg(separator, "separator")
-        return cpp_generate_ngrams(self._column, n, separator)
+        return self._return_or_inplace(
+            cpp_generate_ngrams(self._column, n, separator)
+        )
 
-    def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"):
+    def ngrams_tokenize(self, n=2, delimiter=" ", separator="_", **kwargs):
         delimiter = _massage_string_arg(delimiter, "delimiter")
         separator = _massage_string_arg(separator, "separator")
 
-        return cpp_ngrams_tokenize(self._column, n, delimiter, separator)
+        return self._return_or_inplace(
+            cpp_ngrams_tokenize(self._column, n, delimiter, separator)
+        )
 
 
 def _massage_string_arg(value, name, allow_col=False):

From 897b9413747b6445b778ebec4f3858c0d0865912 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Sat, 14 Mar 2020 16:23:57 -0500
Subject: [PATCH 60/79] string.py: move unnecessary code down past early
 returns.

---
 python/cudf/cudf/core/column/string.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index df765e397a9..a3c566c1d97 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1834,23 +1834,23 @@ def _massage_string_arg(value, name, allow_col=False):
     from cudf._libxx.column import Column
     from cudf.utils.dtypes import is_string_dtype
 
-    allowed_types = ["Scalar"]
-
     if isinstance(value, str):
         return Scalar(value, dtype="str")
 
     if isinstance(value, Scalar) and is_string_dtype(value.dtype):
         return value
 
-    if allow_col:
-        allowed_types += ["Column"]
+    allowed_types = ["Scalar"]
 
+    if allow_col:
         if isinstance(value, list):
             return column.as_column(value, dtype="str")
 
         if isinstance(value, Column) and is_string_dtype(value.dtype):
             return value
 
+        allowed_types.append("Column")
+
     raise ValueError(
         "Expected {} for {} but got {}".format(
             _expected_types_format(allowed_types), name, type(value)
@@ -1859,9 +1859,6 @@ def _massage_string_arg(value, name, allow_col=False):
 
 
 def _expected_types_format(types):
-    if len(types) == 0:
-        raise ValueError
-
     if len(types) == 1:
         return types[0]
 

From ef32f2cd4f5b00744bd4f8024fa762eb9ac36333 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Sat, 14 Mar 2020 17:04:24 -0500
Subject: [PATCH 61/79] strings.py: fix nvtext methods incompatible index
 errors, update unit tests

---
 python/cudf/cudf/core/column/string.py | 27 ++++++++++++++++++--------
 python/cudf/cudf/tests/test_text.py    | 11 +++++++++++
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a3c566c1d97..0d360ecf13f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -214,9 +214,15 @@ def _return_or_inplace(self, new_col, **kwargs):
                         index=self._parent.index,
                     )
             elif isinstance(self._parent, Series):
-                return Series(
-                    new_col, index=self._parent.index, name=self._parent.name
-                )
+                retain_index = kwargs.get("retain_index", True)
+                if retain_index:
+                    return Series(
+                        new_col,
+                        name=self._parent.name,
+                        index=self._parent.index,
+                    )
+                else:
+                    return Series(new_col, name=self._parent.name)
             elif isinstance(self._parent, Index):
                 return as_index(new_col, name=self._parent.name)
             else:
@@ -1806,26 +1812,31 @@ def normalize_spaces(self, **kwargs):
 
     def tokenize(self, delimiter=" ", **kwargs):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        return self._return_or_inplace(cpp_tokenize(self._column, delimiter))
+        kwargs.setdefault("retain_index", False)
+        return self._return_or_inplace(
+            cpp_tokenize(self._column, delimiter), **kwargs
+        )
 
     def token_count(self, delimiter=" ", **kwargs):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
         return self._return_or_inplace(
-            cpp_count_tokens(self._column, delimiter)
+            cpp_count_tokens(self._column, delimiter), **kwargs
         )
 
     def ngrams(self, n=2, separator="_", **kwargs):
         separator = _massage_string_arg(separator, "separator")
+        kwargs.setdefault("retain_index", False)
         return self._return_or_inplace(
-            cpp_generate_ngrams(self._column, n, separator)
+            cpp_generate_ngrams(self._column, n, separator), **kwargs
         )
 
     def ngrams_tokenize(self, n=2, delimiter=" ", separator="_", **kwargs):
         delimiter = _massage_string_arg(delimiter, "delimiter")
         separator = _massage_string_arg(separator, "separator")
-
+        kwargs.setdefault("retain_index", False)
         return self._return_or_inplace(
-            cpp_ngrams_tokenize(self._column, n, delimiter, separator)
+            cpp_ngrams_tokenize(self._column, n, delimiter, separator),
+            **kwargs,
         )
 
 
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 16523054c17..166cc1fdbb2 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -38,6 +38,7 @@ def test_tokenize():
 
     actual = strings.str.tokenize()
 
+    assert type(expected) == type(actual)
     assert_series_equal(expected.to_pandas(), actual.to_pandas())
 
 
@@ -66,6 +67,7 @@ def test_token_count(delimiter, expected_token_counts):
 
     actual = strings.str.token_count(delimiter)
 
+    assert type(expected) == type(actual)
     assert_series_equal(
         expected.to_pandas(), actual.to_pandas(), check_dtype=False
     )
@@ -91,6 +93,7 @@ def test_normalize_spaces():
 
     actual = strings.str.normalize_spaces()
 
+    assert type(expected) == type(actual)
     assert_series_equal(expected.to_pandas(), actual.to_pandas())
 
 
@@ -128,8 +131,12 @@ def test_ngrams(n, separator, expected_values):
     strings = cudf.Series(
         ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"]
     )
+
     expected = cudf.Series(expected_values)
+
     actual = strings.str.ngrams(n=n, separator=separator)
+
+    assert type(expected) == type(actual)
     assert_series_equal(expected.to_pandas(), actual.to_pandas())
 
 
@@ -157,6 +164,10 @@ def test_ngrams(n, separator, expected_values):
 )
 def test_ngrams_tokenize(n, separator, expected_values):
     strings = cudf.Series(["this is my favorite", "book on my bookshelf"])
+
     expected = cudf.Series(expected_values)
+
     actual = strings.str.ngrams_tokenize(n=n, separator=separator)
+
+    assert type(expected) == type(actual)
     assert_series_equal(expected.to_pandas(), actual.to_pandas())

From e6616af76ba271b5f13d5a674364116116438b78 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sun, 15 Mar 2020 20:22:20 -0700
Subject: [PATCH 62/79] remove condition check for nsmallest & nlargest

---
 python/cudf/cudf/core/series.py        | 13 +++++++----
 python/cudf/cudf/tests/test_sorting.py | 32 +++++++++++---------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 07fab5e2d9d..9bef799c132 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1462,13 +1462,18 @@ def sort_values(self, ascending=True, na_position="last"):
         return vals.set_index(index)
 
     def _n_largest_or_smallest(self, largest, n, keep):
-        if not (0 <= n <= len(self)):
-            raise ValueError("n out-of-bound")
         direction = largest
         if keep == "first":
-            return self.sort_values(ascending=not direction)[:n]
+            if n < 0:
+                n = 0
+            return self.sort_values(ascending=not direction).head(n)
         elif keep == "last":
-            return self.sort_values(ascending=direction)[-n:].reverse()
+            data = self.sort_values(ascending=direction)
+            if n <= 0:
+                data = data[-n:-n]
+            else:
+                data = data.tail(n)
+            return data.reverse()
         else:
             raise ValueError('keep must be either "first", "last"')
 
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 49e273b76e8..87a148553ec 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -75,34 +75,30 @@ def test_series_sort_index(nelem, asc):
     np.testing.assert_array_equal(orig, got)
 
 
-def test_series_nlargest():
+@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]])
+@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7])
+def test_series_nlargest(data, n):
     """Indirectly tests Series.sort_values()
     """
-    sr = Series([0, 1, 1, 2, 2, 2, 3, 3])
-    got = sr.nlargest(3)  # default keep='first'
-    assert list(got) == [3, 3, 2]
-    assert list(got.index.values) == [6, 7, 3]
-
-    got = sr.nlargest(3, keep="last")
-    assert list(got) == [3, 3, 2]
-    assert list(got.index.values) == [7, 6, 5]
+    sr = Series(data)
+    psr = pd.Series(data)
+    assert_eq(sr.nlargest(n), psr.nlargest(n))
+    assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last"))
 
     with pytest.raises(ValueError) as raises:
         sr.nlargest(3, keep="what")
     assert raises.match('keep must be either "first", "last"')
 
 
-def test_series_nsmallest():
+@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]])
+@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9])
+def test_series_nsmallest(data, n):
     """Indirectly tests Series.sort_values()
     """
-    sr = Series([0, 1, 1, 2, 2, 2, 3, 3])
-    got = sr.nsmallest(3)  # default keep='first'
-    assert list(got) == [0, 1, 1]
-    assert list(got.index.values) == [0, 1, 2]
-
-    got = sr.nsmallest(3, keep="last")
-    assert list(got) == [0, 1, 1]
-    assert list(got.index.values) == [0, 2, 1]
+    sr = Series(data)
+    psr = pd.Series(data)
+    assert_eq(sr.nsmallest(n), psr.nsmallest(n))
+    assert_eq(sr.nsmallest(n, keep="last"), psr.nsmallest(n, keep="last"))
 
     with pytest.raises(ValueError) as raises:
         sr.nsmallest(3, keep="what")

From 36a67835bf564bfccaf943bc80a464885e12fb42 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sun, 15 Mar 2020 20:27:17 -0700
Subject: [PATCH 63/79] remove n range check in
 DataFrame._n_largest_or_smallest

---
 python/cudf/cudf/core/dataframe.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4c31547cb1b..b22b5570c74 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2137,8 +2137,7 @@ def _n_largest_or_smallest(self, method, n, columns, keep):
             [column] = columns
         else:
             column = columns
-        if not (0 <= n <= len(self)):
-            raise ValueError("n out-of-bound")
+
         col = self[column].reset_index(drop=True)
         # Operate
         sorted_series = getattr(col, method)(n=n, keep=keep)

From f81e63dee8526f29470d6df1978369e205906f04 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Sun, 15 Mar 2020 22:33:44 -0500
Subject: [PATCH 64/79] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c2d1d394bac..a726fb40f9d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -258,6 +258,7 @@
 - PR #4474 Fix to not materialize RangeIndex in copy_categories
 - PR #4496 Skip tests which require 2+ GPUs
 - PR #4494 Update Java memory event handler for new RMM resource API
+- PR #4519 Remove `n` validation for `nlargest` & `nsmallest` and add negative support for `n`
 
 
 # cuDF 0.12.0 (04 Feb 2020)

From f74adf84709922f684f2ce644f9da8b17bc157fe Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 16 Mar 2020 16:39:06 +0530
Subject: [PATCH 65/79] Document dropping mask of ARGMIN/MAX gather map

---
 cpp/src/groupby/hash/groupby.cu | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index a039000ef2a..ed4daec7f68 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -170,6 +170,11 @@ void sparse_to_dense_results(
       auto tranformed_agg = std::make_unique<aggregation>(agg_kind);
       auto arg_result = to_dense_agg_result(tranformed_agg);
       if (arg_result->nullable()) {
+        // We make a view of ARG(MIN/MAX) result without a null mask and gather 
+        // using this map. The values in data buffer of ARG(MIN/MAX) result 
+        // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL
+        // which is an out of bounds index value (-1) and causes the gathered
+        // value to be null.
         column_view null_removed_map(data_type(type_to_id<size_type>()),
           arg_result->size(), 
           static_cast<void const*>(arg_result->view().template data<size_type>()));

From 3a800ca7879210c9a1296637e9647a64f9843f2e Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Mon, 16 Mar 2020 11:50:47 -0500
Subject: [PATCH 66/79] test cases pass

---
 python/cudf/cudf/core/dataframe.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d0acd24a369..5bc8a0bb802 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -501,7 +501,6 @@ def mask(self, other):
     def __setitem__(self, arg, value):
         """Add/set column by *arg or DataFrame*
         """
-        print("__setitem__ is being called ___________________")
         if isinstance(arg, DataFrame):
             # not handling set_item where arg = df & value = df
             if isinstance(value, DataFrame):
@@ -531,6 +530,10 @@ def __setitem__(self, arg, value):
             else:
                 if arg in self._data:
                     if len(self) == 0:
+                        if isinstance(value, (pd.Series, Series)):
+                            self._index = as_index(value.index)
+                        elif len(value) > 0:
+                            self._index = RangeIndex(start=0, stop=len(value))
                         value = column.as_column(value)
                         new_data = self._data.__class__()
                         for key in self._data:
@@ -549,15 +552,11 @@ def __setitem__(self, arg, value):
                         value = Series(value)._align_to_index(
                             self._index, how="right", allow_non_unique=True
                         )
-                    if is_scalar(arg):
-                        arg=[arg]
                     if is_scalar(value):
-                        for key in arg:
-                            self._data[key][:] = value
+                        self._data[arg][:] = value
                     else:
                         value = as_column(value)
-                        for key in arg:
-                            self._data[key] = value
+                        self._data[arg] = value
                 else:
                     # disc. with pandas here
                     # pandas raises key error here

From be55bc6c505e280029790525efc9a688b8b2764f Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)" <ramakrishnap@nvidia.com>
Date: Mon, 16 Mar 2020 12:23:19 -0500
Subject: [PATCH 67/79] CHANGELOG.md

---
 CHANGELOG.md                       |  1 +
 python/cudf/cudf/core/dataframe.py | 14 ++++++--------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95d5443981d..f8db84bb07e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -169,6 +169,7 @@
 - PR #4503 Port binaryop.pyx to libcudf++ API
 - PR #4499 Adding changes to handle include `keep_index` and `RangeIndex`
 - PR #4493 Skip legacy testing in CI
+- PR #4524 Updating `__setitem__` for DataFrame to use scalar scatter
 
 ## Bug Fixes
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 5ed38e1cf87..13143e4a927 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -537,13 +537,13 @@ def __setitem__(self, arg, value):
                         value = column.as_column(value)
                         new_data = self._data.__class__()
                         for key in self._data:
-                            if key in arg or key == arg:
+                            if key == arg:
                                 new_data[key] = value
                             else:
                                 new_data[key] = column.column_empty_like(
-                                            self._data[key],
-                                            masked=True,
-                                            newsize=len(value),
+                                    self._data[key],
+                                    masked=True,
+                                    newsize=len(value),
                                 )
 
                         self._data = new_data
@@ -568,7 +568,7 @@ def __setitem__(self, arg, value):
             mask = arg
             if isinstance(mask, list):
                 mask = np.array(mask)
-            mask = np.array(arg)
+
             if mask.dtype == "bool":
                 mask = column.as_column(arg)
 
@@ -593,14 +593,12 @@ def __setitem__(self, arg, value):
                         mask=None,
                     )
                 else:
-                    if not is_scalar(value):
-                        value = column.as_column(value)
                     for col in arg:
                         # we will raise a key error if col not in dataframe
                         # this behavior will make it
                         # consistent to pandas >0.21.0
                         if not is_scalar(value):
-                            self._data[col] = value
+                            self._data[col] = column.as_column(value)
                         else:
                             self._data[col][:] = value
 

From 76bc0e823a8abbf224b2a65862133dcd56298fc4 Mon Sep 17 00:00:00 2001
From: Devavret Makkar <dmakkar@nvidia.com>
Date: Mon, 16 Mar 2020 23:35:29 +0530
Subject: [PATCH 68/79] Review code cleanup requested by karthikeyan
 https://github.com/rapidsai/cudf/pull/4456#discussion_r393186736

---
 cpp/src/groupby/hash/groupby.cu | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index ed4daec7f68..66e8138d45b 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -167,26 +167,19 @@ void sparse_to_dense_results(
     auto transformed_result =
     [&col, to_dense_agg_result, mr, stream]
     (auto const& agg_kind) {
-      auto tranformed_agg = std::make_unique<aggregation>(agg_kind);
-      auto arg_result = to_dense_agg_result(tranformed_agg);
-      if (arg_result->nullable()) {
-        // We make a view of ARG(MIN/MAX) result without a null mask and gather 
-        // using this map. The values in data buffer of ARG(MIN/MAX) result 
-        // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL
-        // which is an out of bounds index value (-1) and causes the gathered
-        // value to be null.
-        column_view null_removed_map(data_type(type_to_id<size_type>()),
-          arg_result->size(), 
-          static_cast<void const*>(arg_result->view().template data<size_type>()));
-        auto transformed_result = experimental::detail::gather(
-          table_view({col}), null_removed_map, false, true, false, mr, stream);
-        return std::move(transformed_result->release()[0]);
-      }
-      else {
-        auto transformed_result = experimental::detail::gather(
-          table_view({col}), *arg_result, false, false, false, mr, stream);
-        return std::move(transformed_result->release()[0]);
-      }
+      auto transformed_agg = std::make_unique<aggregation>(agg_kind);
+      auto arg_result = to_dense_agg_result(transformed_agg);
+      // We make a view of ARG(MIN/MAX) result without a null mask and gather
+      // using this map. The values in data buffer of ARG(MIN/MAX) result
+      // corresponding to null values were initialized to ARG(MIN/MAX)_SENTINEL
+      // which is an out-of-bounds index value (-1) and causes the gathered
+      // value to be null.
+      column_view null_removed_map(data_type(type_to_id<size_type>()),
+        arg_result->size(), 
+        static_cast<void const*>(arg_result->view().template data<size_type>()));
+      auto transformed_result = experimental::detail::gather(table_view({col}),
+        null_removed_map, false, arg_result->nullable(), false, mr, stream);
+      return std::move(transformed_result->release()[0]);
     };
 
     for (auto &&agg : agg_v) {

From 50ecc6253885890c77b2a028be2ff6cf86f79076 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)"
 <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Mon, 16 Mar 2020 14:15:53 -0500
Subject: [PATCH 69/79] Update dataframe.py

---
 python/cudf/cudf/core/dataframe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 13143e4a927..bacb6bfdcc5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -593,12 +593,14 @@ def __setitem__(self, arg, value):
                         mask=None,
                     )
                 else:
+                    if not is_scalar(value):	
+                        value = column.as_column(value)
                     for col in arg:
                         # we will raise a key error if col not in dataframe
                         # this behavior will make it
                         # consistent to pandas >0.21.0
                         if not is_scalar(value):
-                            self._data[col] = column.as_column(value)
+                            self._data[col] = value
                         else:
                             self._data[col][:] = value
 

From 38b19531cdd3d56f1b74a95d2622cf5a0bedd61f Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Mar 2020 12:25:31 -0700
Subject: [PATCH 70/79] fix issue related to index slicing when the dataframe
 is empty

---
 python/cudf/cudf/core/indexing.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index 63ba9ce0f89..ea17b7f5f62 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -313,7 +313,10 @@ def _getitem_tuple_arg(self, arg):
         if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
             from cudf.core.index import RangeIndex
 
-            slice_len = arg[0].stop or len(self._df)
+            if len(self._df) > 0:
+                slice_len = arg[0].stop or len(self._df)
+            else:
+                slice_len = len(self._df)
             start, stop, step = arg[0].indices(slice_len)
             df._index = RangeIndex(start, stop)
         return df

From d944fd2e7943fc49f6efd21e6c8154f25a67b932 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Mar 2020 12:26:22 -0700
Subject: [PATCH 71/79] add test related to empty dataframe head/tail

---
 python/cudf/cudf/tests/test_index.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 8291859fa93..3205b686863 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -323,3 +323,10 @@ def test_index_names():
 )
 def test_range_index_from_range(data):
     assert_eq(pd.Index(data), cudf.core.index.as_index(data))
+
+
+def test_empty_df_head_tail_index():
+    df = cudf.DataFrame()
+    pdf = pd.DataFrame()
+    assert_eq(df.head().index.values, pdf.head().index.values)
+    assert_eq(df.tail().index.values, pdf.tail().index.values)

From f8726c78d5aa9138ac6767276aa3bd2b66c635c2 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 16 Mar 2020 14:34:57 -0500
Subject: [PATCH 72/79] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c2d1d394bac..f17f914d17b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -258,6 +258,7 @@
 - PR #4474 Fix to not materialize RangeIndex in copy_categories
 - PR #4496 Skip tests which require 2+ GPUs
 - PR #4494 Update Java memory event handler for new RMM resource API
+- PR #4526 Fix index slicing issue in case of an empty dataframe
 
 
 # cuDF 0.12.0 (04 Feb 2020)

From 7e06a000016fa67da12b61afe607cd53c7be86f9 Mon Sep 17 00:00:00 2001
From: "Ram (Ramakrishna Prabhu)"
 <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Mon, 16 Mar 2020 15:06:01 -0500
Subject: [PATCH 73/79] Update dataframe.py

---
 python/cudf/cudf/core/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bacb6bfdcc5..56f84506356 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -593,7 +593,7 @@ def __setitem__(self, arg, value):
                         mask=None,
                     )
                 else:
-                    if not is_scalar(value):	
+                    if not is_scalar(value):
                         value = column.as_column(value)
                     for col in arg:
                         # we will raise a key error if col not in dataframe

From 7bcacde03e3bbcb335d9cfcb63c250cfa80c5af5 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Mar 2020 14:22:53 -0700
Subject: [PATCH 74/79] handling all cases

---
 python/cudf/cudf/core/indexing.py    |  5 +----
 python/cudf/cudf/tests/test_index.py | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
index ea17b7f5f62..201a331779a 100755
--- a/python/cudf/cudf/core/indexing.py
+++ b/python/cudf/cudf/core/indexing.py
@@ -313,10 +313,7 @@ def _getitem_tuple_arg(self, arg):
         if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
             from cudf.core.index import RangeIndex
 
-            if len(self._df) > 0:
-                slice_len = arg[0].stop or len(self._df)
-            else:
-                slice_len = len(self._df)
+            slice_len = len(self._df)
             start, stop, step = arg[0].indices(slice_len)
             df._index = RangeIndex(start, stop)
         return df
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 3205b686863..2bf2bb76dd3 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -325,8 +325,21 @@ def test_range_index_from_range(data):
     assert_eq(pd.Index(data), cudf.core.index.as_index(data))
 
 
-def test_empty_df_head_tail_index():
+@pytest.mark.parametrize(
+    "n", [-10, -5, -2, 0, 1, 0, 2, 5, 10],
+)
+def test_empty_df_head_tail_index(n):
     df = cudf.DataFrame()
     pdf = pd.DataFrame()
-    assert_eq(df.head().index.values, pdf.head().index.values)
-    assert_eq(df.tail().index.values, pdf.tail().index.values)
+    assert_eq(df.head(n).index.values, pdf.head(n).index.values)
+    assert_eq(df.tail(n).index.values, pdf.tail(n).index.values)
+
+    df = cudf.DataFrame({"a": [11, 2, 33, 44, 55]})
+    pdf = pd.DataFrame({"a": [11, 2, 33, 44, 55]})
+    assert_eq(df.head(n).index.values, pdf.head(n).index.values)
+    assert_eq(df.tail(n).index.values, pdf.tail(n).index.values)
+
+    df = cudf.DataFrame(index=[1, 2, 3])
+    pdf = pd.DataFrame(index=[1, 2, 3])
+    assert_eq(df.head(n).index.values, pdf.head(n).index.values)
+    assert_eq(df.tail(n).index.values, pdf.tail(n).index.values)

From a4839caaafcc879e2a77699e89cd11498890beb7 Mon Sep 17 00:00:00 2001
From: Christopher Harris <charris@nvidia.com>
Date: Mon, 16 Mar 2020 16:32:17 -0500
Subject: [PATCH 75/79] string.py: default tokenize arguments to `""`, meaning
 "all whitespace".

---
 python/cudf/cudf/core/column/string.py | 6 +++---
 python/cudf/cudf/tests/test_text.py    | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 7d12c98e0c9..b34c6192d99 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1801,11 +1801,11 @@ def translate(self, table, **kwargs):
     def normalize_spaces(self):
         return cpp_normalize_spaces(self._column)
 
-    def tokenize(self, delimiter=" "):
+    def tokenize(self, delimiter=""):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
         return cpp_tokenize(self._column, delimiter)
 
-    def token_count(self, delimiter=" "):
+    def token_count(self, delimiter=""):
         delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
         return cpp_count_tokens(self._column, delimiter)
 
@@ -1813,7 +1813,7 @@ def ngrams(self, n=2, separator="_"):
         separator = _massage_string_arg(separator, "separator")
         return cpp_generate_ngrams(self._column, n, separator)
 
-    def ngrams_tokenize(self, n=2, delimiter=" ", separator="_"):
+    def ngrams_tokenize(self, n=2, delimiter="", separator="_"):
         delimiter = _massage_string_arg(delimiter, "delimiter")
         separator = _massage_string_arg(separator, "separator")
 
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index 16523054c17..d6a2bfe48b3 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -44,11 +44,10 @@ def test_tokenize():
 @pytest.mark.parametrize(
     "delimiter, expected_token_counts",
     [
-        (" ", [10, 9, 0, 0, 1]),  # TODO: verify last count should be 1, not 5
+        ("", [10, 9, 0, 0, 5]),
         ("o", [6, 3, 0, 0, 1]),
         (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]),
         (["a", "e", "i", "o"], [12, 11, 0, 0, 6]),
-        # ([], [10, 9, 0, 0, 5]), # throws
     ],
 )
 def test_token_count(delimiter, expected_token_counts):

From 20a03f9fe7cf5a232aeea615eb05ac1fdff42a28 Mon Sep 17 00:00:00 2001
From: John Kirkham <jakirkham@gmail.com>
Date: Mon, 16 Mar 2020 16:25:55 -0700
Subject: [PATCH 76/79] Import `tlz` for optional `cytoolz` support

---
 CHANGELOG.md                           | 1 +
 python/dask_cudf/dask_cudf/accessor.py | 2 +-
 python/dask_cudf/dask_cudf/core.py     | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc38ac15cbf..178df1cb738 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -168,6 +168,7 @@
 - PR #4461 Port nvstrings Miscellaneous functions to cuDF Python/Cython
 - PR #4503 Port binaryop.pyx to libcudf++ API
 - PR #4499 Adding changes to handle include `keep_index` and `RangeIndex`
+- PR #4533 Import `tlz` for optional `cytoolz` support
 - PR #4493 Skip legacy testing in CI
 
 ## Bug Fixes
diff --git a/python/dask_cudf/dask_cudf/accessor.py b/python/dask_cudf/dask_cudf/accessor.py
index eb6e50ea1ad..fceb2c74470 100644
--- a/python/dask_cudf/dask_cudf/accessor.py
+++ b/python/dask_cudf/dask_cudf/accessor.py
@@ -12,7 +12,7 @@
 
 """
 
-from toolz import partial
+from tlz import partial
 
 import cudf
 from cudf.core.column.categorical import (
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index b245970173d..549e319fc90 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 import pandas as pd
-from toolz import partition_all
+from tlz import partition_all
 
 import dask
 import dask.dataframe as dd

From 27a0520b598e569c06b313f2a5d84a03a0592185 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Mon, 16 Mar 2020 19:16:10 -0500
Subject: [PATCH 77/79] Disable errors from deprecation warnings.

---
 cpp/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fe0cce676ba..f3ab25bd243 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -48,7 +48,7 @@ set(CMAKE_CUDA_STANDARD 14)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 if(CMAKE_COMPILER_IS_GNUCXX)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations")
 
     # Suppress parentheses warning which causes gmock to fail
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-parentheses")
@@ -110,7 +110,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-
 
 # set warnings as errors
 # TODO: remove `no-maybe-unitialized` used to suppress warnings in rmm::exec_policy
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror cross-execution-space-call -Xcompiler -Wall,-Werror")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror cross-execution-space-call -Xcompiler -Wall,-Werror,-Wno-error=deprecated-declarations")
 
 # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking
 option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF)

From 1915d0a6c71b4b158bac3f037af7dbafc3ca9309 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Mon, 16 Mar 2020 19:37:15 -0500
Subject: [PATCH 78/79] changelog.

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fc38ac15cbf..ad851d17160 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -169,6 +169,7 @@
 - PR #4503 Port binaryop.pyx to libcudf++ API
 - PR #4499 Adding changes to handle include `keep_index` and `RangeIndex`
 - PR #4493 Skip legacy testing in CI
+- PR #4534 Disable deprecation warnings as errors.
 
 ## Bug Fixes
 

From c9a3acbdeecacf83ae2df0e430bb5c742351f1e9 Mon Sep 17 00:00:00 2001
From: Keith Kraus <kkraus@nvidia.com>
Date: Mon, 16 Mar 2020 22:35:11 -0400
Subject: [PATCH 79/79] Add deprecation warning handling to pyniNVStrings as
 well

---
 python/nvstrings/cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/nvstrings/cpp/CMakeLists.txt b/python/nvstrings/cpp/CMakeLists.txt
index 15c70849dcd..ade21cdfaa7 100644
--- a/python/nvstrings/cpp/CMakeLists.txt
+++ b/python/nvstrings/cpp/CMakeLists.txt
@@ -41,7 +41,7 @@ set(CMAKE_CXX_COMPILER $ENV{CXX})
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 if(CMAKE_COMPILER_IS_GNUCXX)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations")
 
     option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON)
     if(CMAKE_CXX11_ABI)