From 6cd01678b04a32aa72605644fc7b79a1fff1a797 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 20 Aug 2021 15:54:43 -0700 Subject: [PATCH 01/20] Added method to remove null_masks if the column has no nulls (#9061) This PR adds a method to remove the validity vector in cases where there are columns in a Table with no nulls but still have a validity vector. Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9061 --- .../main/java/ai/rapids/cudf/ColumnView.java | 56 +++++- java/src/main/java/ai/rapids/cudf/Table.java | 11 +- java/src/main/native/src/TableJni.cpp | 63 ++++++- .../test/java/ai/rapids/cudf/TableTest.java | 170 +++++++++++++++--- 4 files changed, 269 insertions(+), 31 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 55bd5ec5ff9..4d9991d0dd9 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -101,11 +101,39 @@ public ColumnView(DType type, long rows, Optional nullCount, || !nullCount.isPresent(); } + /** + * Create a new column view based off of data already on the device. Ref count on the buffers + * is not incremented and none of the underlying buffers are owned by this view. The returned + * ColumnView is only valid as long as the underlying buffers remain valid. If the buffers are + * closed before this ColumnView is closed, it will result in undefined behavior. + * + * If ownership is needed, call {@link ColumnView#copyToColumnVector} + * + * @param type the type of the vector + * @param rows the number of rows in this vector. + * @param nullCount the number of nulls in the dataset. + * @param dataBuffer a host buffer required for nested types including strings and string + * categories. The ownership doesn't change on this buffer + * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0. + * The ownership doesn't change on this buffer + * @param offsetBuffer The offsetbuffer for columns that need an offset buffer + */ + public ColumnView(DType type, long rows, Optional nullCount, + BaseDeviceMemoryBuffer dataBuffer, + BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer) { + this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + assert (!type.isNestedType()); + assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) + || !nullCount.isPresent(); + } + private ColumnView(DType type, long rows, int nullCount, BaseDeviceMemoryBuffer dataBuffer, BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) { this(ColumnVector.initViewHandle(type, (int) rows, nullCount, dataBuffer, validityBuffer, - offsetBuffer, Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray())); + offsetBuffer, children == null ? new long[]{} : + Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray())); } /** Creates a ColumnVector from a column view handle @@ -140,6 +168,32 @@ public final DType getType() { return type; } + /** + * Returns the child column views for this view + * Please note that it is the responsibility of the caller to close these views. 
+ * @return an array of child column views + */ + public final ColumnView[] getChildColumnViews() { + int numChildren = getNumChildren(); + if (!getType().isNestedType()) { + return null; + } + ColumnView[] views = new ColumnView[numChildren]; + try { + for (int i = 0; i < numChildren; i++) { + views[i] = getChildColumnView(i); + } + return views; + } catch(Throwable t) { + for (ColumnView v: views) { + if (v != null) { + v.close(); + } + } + throw t; + } + } + /** * Returns the child column view at a given index. * Please note that it is the responsibility of the caller to close this view. diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index e725932ed5e..eeb2d308f1a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -170,10 +170,19 @@ public long getDeviceMemorySize() { return total; } + /** + * This method is internal and exposed purely for testing purpopses + */ + static Table removeNullMasksIfNeeded(Table table) { + return new Table(removeNullMasksIfNeeded(table.nativeHandle)); + } + ///////////////////////////////////////////////////////////////////////////// // NATIVE APIs ///////////////////////////////////////////////////////////////////////////// - + + private static native long[] removeNullMasksIfNeeded(long tableView) throws CudfException; + private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); private static native long[] partition(long inputTable, long partitionView, diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index f642a87b445..2bb56565f7a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -929,6 +929,45 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results, return combine_join_results(env, std::move(left_cols), std::move(right_cols)); } +cudf::column_view remove_validity_from_col(cudf::column_view column_view) { + if (!cudf::is_compound(column_view.type())) { + if (column_view.nullable() && column_view.null_count() == 0) { + // null_mask is allocated but no nulls present therefore we create a new column_view without + // the null_mask to avoid things blowing up in reading the parquet file + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr, + 0, column_view.offset()); + } else { + return cudf::column_view(column_view); + } + } else { + std::unique_ptr ret; + std::vector children; + children.reserve(column_view.num_children()); + for (auto it = column_view.child_begin(); it != column_view.child_end(); it++) { + children.push_back(remove_validity_from_col(*it)); + } + if (!column_view.nullable() || column_view.null_count() != 0) { + ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, + column_view.null_mask(), column_view.null_count(), + column_view.offset(), children)); + } else { + ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, + column_view.offset(), children)); + } + return *ret.release(); + } +} + +cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) { + std::vector views; + views.reserve(input_table_view->num_columns()); + for (auto it = input_table_view->begin(); it != input_table_view->end(); it++) { + views.push_back(remove_validity_from_col(*it)); + } + + return cudf::table_view(views); +} + } // namespace } // namespace jni @@ -936,6 +975,25 @@ jlongArray 
combine_join_results(JNIEnv *env, cudf::table &left_results, extern "C" { +// This is a method purely added for testing remove_validity_if_needed method +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv *env, jclass, + jlong j_table_view) { + JNI_NULL_CHECK(env, j_table_view, "table view handle is null", 0); + try { + cudf::table_view *tview = reinterpret_cast(j_table_view); + cudf::table_view result = cudf::jni::remove_validity_if_needed(tview); + cudf::table m_tbl(result); + std::vector> cols = m_tbl.release(); + auto results = cudf::jni::native_jlongArray(env, cols.size()); + int i = 0; + for (auto it = cols.begin(); it != cols.end(); it++) { + results[i++] = reinterpret_cast(it->release()); + } + return results.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *env, jclass, jlongArray j_cudf_columns) { JNI_NULL_CHECK(env, j_cudf_columns, "columns are null", 0); @@ -1357,7 +1415,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, JNI_NULL_CHECK(env, j_state, "null state", ); using namespace cudf::io; - cudf::table_view *tview = reinterpret_cast(j_table); + cudf::table_view *tview_with_empty_nullmask = reinterpret_cast(j_table); + cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_with_empty_nullmask); cudf::jni::native_parquet_writer_handle *state = reinterpret_cast(j_state); @@ -1367,7 +1426,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, } try { cudf::jni::auto_set_device(env); - state->writer->write(*tview); + state->writer->write(tview); } CATCH_STD(env, ) } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index aeb94e4824a..cc030c392cb 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -49,19 +49,14 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; +import static ai.rapids.cudf.Table.removeNullMasksIfNeeded; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -111,7 +106,7 @@ public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { * @param colName The name of the column */ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true); + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); } /** @@ -121,7 +116,7 @@ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, Str * @param colName The name of the host column */ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), 
cv, colName, true); + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); } /** @@ -130,7 +125,7 @@ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVe * @param cv The input Struct column */ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { - assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true); + assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); } /** @@ -140,13 +135,14 @@ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView c * @param length The number of rows to consider * @param cv The input Struct column * @param colName The name of the column - * @param enableNullCheck Whether to check for nulls in the Struct column + * @param enableNullCountCheck Whether to check for nulls in the Struct column + * @param enableNullabilityCheck Whether the table have a validity mask */ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck) { + ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { try (HostColumnVector hostExpected = expected.copyToHost(); HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck); + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); } } @@ -156,12 +152,13 @@ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long * @param cv The input column * @param colName The name of the column * @param enableNullCheck Whether to check for nulls in the column + * @param enableNullabilityCheck Whether the table have a validity mask */ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck) { + ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { try (HostColumnVector hostExpected = expected.copyToHost(); HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck); + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); } } @@ -172,18 +169,21 @@ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOff * @param length number of rows from starting offset * @param cv The input host column * @param colName The name of the host column - * @param enableNullCheck Whether to check for nulls in the host column + * @param enableNullCountCheck Whether to check for nulls in the host column */ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, - HostColumnVectorCore cv, String colName, boolean enableNullCheck) { + HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); - if (enableNullCheck) { + if (enableNullCountCheck) { assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For 
Column " + colName); } else { // TODO add in a proper check when null counts are supported by serializing a partitioned column } + if (enableNullabilityCheck) { + assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); + } DType type = expected.getType(); for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { long tableRow = expectedRow - rowOffset; @@ -269,7 +269,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l } assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, numChildRows, cv.getNestedChildren().get(0), colName + " list child", - enableNullCheck); + enableNullCountCheck, enableNullabilityCheck); break; case STRUCT: List expectedChildren = expected.getNestedChildren(); @@ -280,7 +280,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l String childName = colName + " child " + i; assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, - colName, enableNullCheck); + colName, enableNullCountCheck, enableNullabilityCheck); } break; default: @@ -296,9 +296,10 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l * @param length the number of rows to check * @param table the input table to compare against expected * @param enableNullCheck whether to check for nulls or not + * @param enableNullabilityCheck whether the table have a validity mask */ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, - boolean enableNullCheck) { + boolean enableNullCheck, boolean enableNullabilityCheck) { assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); assertEquals(length, table.getRowCount(), "ROW COUNT"); for (int col = 0; col < expected.getNumberOfColumns(); col++) { @@ -308,7 +309,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l if (rowOffset != 0 || length != expected.getRowCount()) { name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); } - assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck); + assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); } } @@ -318,7 +319,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l * @param table the input table to compare against expected */ public static void assertTablesAreEqual(Table expected, Table table) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true); + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); } void assertTablesHaveSameValues(HashMap[] expectedTable, Table table) { @@ -3235,7 +3236,7 @@ void testSerializationRoundTripConcatHostSide() throws IOException { try (Table found = JCudfSerialization.readAndConcat( headers.toArray(new JCudfSerialization.SerializedTableHeader[headers.size()]), buffers.toArray(new HostMemoryBuffer[buffers.size()]))) { - assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false); + assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false, false); } } finally { for (HostMemoryBuffer buff: buffers) { @@ -3288,7 +3289,7 @@ void testConcatHost() throws IOException { try (Table result = JCudfSerialization.readAndConcat( new JCudfSerialization.SerializedTableHeader[] {header, header}, new 
HostMemoryBuffer[] {buff, buff})) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false); + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false, false); } } } @@ -3329,7 +3330,7 @@ void testSerializationRoundTripSlicedHostSide() throws IOException { buffers.toArray(new HostMemoryBuffer[buffers.size()]), bout2); ByteArrayInputStream bin2 = new ByteArrayInputStream(bout2.toByteArray()); try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin2)) { - assertPartialTablesAreEqual(t, 0, t.getRowCount(), found.getTable(), false); + assertPartialTablesAreEqual(t, 0, t.getRowCount(), found.getTable(), false, false); assertEquals(found.getTable(), found.getContiguousTable().getTable()); assertNotNull(found.getContiguousTable().getBuffer()); } @@ -3355,7 +3356,7 @@ void testSerializationRoundTripSliced() throws IOException { JCudfSerialization.writeToStream(t, bout, i, len); ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin)) { - assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount()); + assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount(), false); assertEquals(found.getTable(), found.getContiguousTable().getTable()); assertNotNull(found.getContiguousTable().getBuffer()); } @@ -6360,6 +6361,121 @@ void testAllFilteredFromValidity() { } } + ColumnView replaceValidity(ColumnView cv, DeviceMemoryBuffer validity, long nullCount) { + assert (validity.length >= BitVectorHelper.getValidityAllocationSizeInBytes(cv.rows)); + if (cv.type.isNestedType()) { + ColumnView[] children = cv.getChildColumnViews(); + try { + return new ColumnView(cv.type, + cv.rows, + Optional.of(nullCount), + validity, + cv.getOffsets(), + children); + } finally { + for (ColumnView v : children) { + if (v != null) { + v.close(); + } + } + } + } else { + return new ColumnView(cv.type, cv.rows, Optional.of(nullCount), cv.getData(), validity, cv.getOffsets()); + } + } + + @Test + void testRemoveNullMasksIfNeeded() { + ListType nestedType = new ListType(true, new StructType(false, + new BasicType(true, DType.INT32), + new BasicType(true, DType.INT64))); + + List data1 = Arrays.asList(10, 20L); + List data2 = Arrays.asList(50, 60L); + HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1); + HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2); + + //First we create ColumnVectors + try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, 2, 3); + ColumnVector nonNullVector2 = ColumnVector.fromStrings("1", "2", "3"); + ColumnVector nonNullVector1 = ColumnVector.fromLists(nestedType, + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2))) { + //Then we take the created ColumnVectors and add validity masks even though the nullCount = 0 + long allocSize = BitVectorHelper.getValidityAllocationSizeInBytes(nonNullVector0.rows); + try (DeviceMemoryBuffer dm0 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm1 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm2 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm3_child = + DeviceMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(2))) { + Cuda.memset(dm0.address, (byte) 0xFF, allocSize); + Cuda.memset(dm1.address, (byte) 0xFF, 
allocSize); + Cuda.memset(dm2.address, (byte) 0xFF, allocSize); + Cuda.memset(dm3_child.address, (byte) 0xFF, + BitVectorHelper.getValidityAllocationSizeInBytes(2)); + + try (ColumnView cv0View = replaceValidity(nonNullVector0, dm0, 0); + ColumnVector cv0 = cv0View.copyToColumnVector(); + ColumnView struct = nonNullVector1.getChildColumnView(0); + ColumnView structChild0 = struct.getChildColumnView(0); + ColumnView newStructChild0 = replaceValidity(structChild0, dm3_child, 0); + ColumnView newStruct = struct.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStructChild0}); + ColumnView list = nonNullVector1.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStruct}); + ColumnView cv1View = replaceValidity(list, dm1, 0); + ColumnVector cv1 = cv1View.copyToColumnVector(); + ColumnView cv2View = replaceValidity(nonNullVector2, dm2, 0); + ColumnVector cv2 = cv2View.copyToColumnVector()) { + + try (Table t = new Table(new ColumnVector[]{cv0, cv1, cv2}); + Table tableWithoutNullMask = removeNullMasksIfNeeded(t); + ColumnView tableStructChild0 = t.getColumn(1).getChildColumnView(0).getChildColumnView(0); + ColumnVector tableStructChild0Cv = tableStructChild0.copyToColumnVector(); + Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, + nonNullVector2})) { + assertTrue(t.getColumn(0).hasValidityVector()); + assertTrue(t.getColumn(1).hasValidityVector()); + assertTrue(t.getColumn(2).hasValidityVector()); + assertTrue(tableStructChild0Cv.hasValidityVector()); + + assertPartialTablesAreEqual(expected, + 0, + expected.getRowCount(), + tableWithoutNullMask, + true, + true); + } + } + } + } + } + + @Test + void testRemoveNullMasksIfNeededWithNulls() { + ListType nestedType = new ListType(true, new StructType(true, + new BasicType(true, DType.INT32), + new BasicType(true, DType.INT64))); + + List data1 = Arrays.asList(0, 10L); + List data2 = Arrays.asList(50, null); + HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1); + HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2); + + //First we create ColumnVectors + try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, null, 2, 3); + ColumnVector nonNullVector1 = ColumnVector.fromStrings("1", "2", null, "3"); + ColumnVector nonNullVector2 = ColumnVector.fromLists(nestedType, + Arrays.asList(structData1, structData2), + null, + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2))) { + try (Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, nonNullVector2}); + Table unchangedTable = removeNullMasksIfNeeded(expected)) { + assertTablesAreEqual(expected, unchangedTable); + } + } + } + @Test void testMismatchedSizesForFilter() { Boolean[] maskVals = new Boolean[3]; From e42464ce44b1728b69b0df2f104b6df924052041 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 23 Aug 2021 09:59:36 -0400 Subject: [PATCH 02/20] Fix memcheck read error in libcudf contiguous_split (#9067) Reference #8883 The `cudf::contiguous_split` was failing on memcheck using the `compute-sanitizer` with a 4-byte out-of-bounds read. This was traced to the `copy_buffer` device function that was reading 1 past the end of the input buffer when performing a value-shift. The ternary check was incorrectly protecting the out-of-bounds read. The logic is corrected by this PR. 
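For context: the read happens inside the `while (remainder)` loop, where the old guard `remainder > 0` is trivially true, so `src[idx + 1]` was dereferenced even when processing the final word of a pure value-shift copy; the corrected condition, `bit_shift > 0 || remainder > 4`, only reads the next word when one is actually needed.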
Also, I fixed some `const` removal casts from the same source file by adding appropriate `const` qualifiers to the input data variables. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9067 --- cpp/src/copying/contiguous_split.cu | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 779a6a74f1d..a9194ceea93 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -132,7 +132,7 @@ struct dst_buf_info { */ template __device__ void copy_buffer(uint8_t* __restrict__ dst, - uint8_t* __restrict__ src, + uint8_t const* __restrict__ src, int t, std::size_t num_elements, std::size_t element_size, @@ -193,11 +193,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, // and will never both be true at the same time. if (value_shift || bit_shift) { std::size_t idx = (num_bytes - remainder) / 4; - uint32_t v = remainder > 0 ? (reinterpret_cast(src)[idx] - value_shift) : 0; + uint32_t v = remainder > 0 ? (reinterpret_cast(src)[idx] - value_shift) : 0; while (remainder) { - uint32_t const next = - remainder > 0 ? (reinterpret_cast(src)[idx + 1] - value_shift) : 0; - uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift)); + uint32_t const next = bit_shift > 0 || remainder > 4 + ? (reinterpret_cast(src)[idx + 1] - value_shift) + : 0; + uint32_t const val = (v >> bit_shift) | (next << (32 - bit_shift)); if (valid_count) { thread_valid_count += __popc(val); } reinterpret_cast(dst)[idx] = val; v = next; @@ -207,7 +208,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, } else { while (remainder) { std::size_t const idx = num_bytes - remainder--; - uint32_t const val = reinterpret_cast(src)[idx]; + uint32_t const val = reinterpret_cast(src)[idx]; if (valid_count) { thread_valid_count += __popc(val); } reinterpret_cast(dst)[idx] = val; } @@ -255,7 +256,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, */ template __global__ void copy_partition(int num_src_bufs, - uint8_t** src_bufs, + uint8_t const** src_bufs, uint8_t** dst_bufs, dst_buf_info* buf_info) { @@ -349,13 +350,13 @@ OutputIter setup_src_buf_data(InputIter begin, InputIter end, OutputIter out_buf { std::for_each(begin, end, [&out_buf](column_view const& col) { if (col.nullable()) { - *out_buf = reinterpret_cast(const_cast(col.null_mask())); + *out_buf = reinterpret_cast(col.null_mask()); out_buf++; } // NOTE: we're always returning the base pointer here. column-level offset is accounted // for later. Also, for some column types (string, list, struct) this pointer will be null // because there is no associated data with the root column. 
- *out_buf = const_cast(col.head()); + *out_buf = col.head(); out_buf++; out_buf = setup_src_buf_data(col.child_begin(), col.child_end(), out_buf); @@ -1020,14 +1021,14 @@ std::vector contiguous_split(cudf::table_view const& input, cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); // host-side std::vector h_src_and_dst_buffers(src_bufs_size + dst_bufs_size); - uint8_t** h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); + uint8_t const** h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); uint8_t** h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); // device-side rmm::device_buffer d_src_and_dst_buffers(src_bufs_size + dst_bufs_size + offset_stack_size, stream, rmm::mr::get_current_device_resource()); - uint8_t** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - uint8_t** d_dst_bufs = reinterpret_cast( + uint8_t const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); + uint8_t** d_dst_bufs = reinterpret_cast( reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); // setup src buffers From d4c3f32af00388dd41b78428486e4a2f53257384 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 23 Aug 2021 10:09:00 -0400 Subject: [PATCH 03/20] Fix memory write error in get_list_child_to_list_row_mapping utility (#8994) Reference issue #8883 and depends on fixes in PR #8884 The `get_list_child_to_list_row_mapping` builds a map for rolling operation on a lists column. In the `thrust::scatter` call a map value includes the last offset which will always be out-of-bounds to given output vector. This output vector is used to build the resultant output map by calling `thrust::inclusive_scan` but the out-of-bounds offset value is not used -- which is why the utility does not fail. The fix in this PR simply allocates an extra row in the intermediate vector so the `thrust::scatter` will not write to out-of-bounds memory. Since the value is eventually ignored, it does not effect the result. The code in this function was creating many temporary columns incorrectly using the passed in `device_resource_manager` variable `mr`. The code was corrected by changing these to be just `device_uvector's` instead making it more clear that these are internal temporary memory buffers. Further the code calling `get_list_child_to_list_row_mapping` utility is using the output as a temporary column and so this PR fixes the logic to correct the memory resource usage. 
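As a concrete illustration (using the offsets example from the comments added in the new `rolling_collect_list.cu`): for list offsets `[0, 2, 5, 8, 11, 13]` the utility produces the child-to-list-row map `[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4]`. The intermediate scatter target has one entry per child row (13 here), but the offset values previously used as scatter destinations included the final offset, 13, which is one slot past the end of that target.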
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/8994 --- cpp/CMakeLists.txt | 3 +- cpp/src/rolling/rolling_collect_list.cu | 157 ++++++++++++++++++++++ cpp/src/rolling/rolling_collect_list.cuh | 163 +++-------------------- 3 files changed, 176 insertions(+), 147 deletions(-) create mode 100644 cpp/src/rolling/rolling_collect_list.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3eee1147414..d6b457a94d4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -368,8 +368,9 @@ add_library(cudf src/reshape/interleave_columns.cu src/reshape/tile.cu src/rolling/grouped_rolling.cu - src/rolling/rolling.cu src/rolling/range_window_bounds.cpp + src/rolling/rolling.cu + src/rolling/rolling_collect_list.cu src/round/round.cu src/scalar/scalar.cpp src/scalar/scalar_factories.cpp diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu new file mode 100644 index 00000000000..ecef90dc8e1 --- /dev/null +++ b/cpp/src/rolling/rolling_collect_list.cu @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +/** + * @see cudf::detail::get_list_child_to_list_row_mapping + */ +std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, + rmm::cuda_stream_view stream) +{ + // First, scatter the count for each repeated offset (except the first and last), + // into a column of N `0`s, where N == number of child rows. + // For example: + // offsets == [0, 2, 5, 8, 11, 13] + // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] + // + // An example with empty list row at index 2: + // offsets == [0, 2, 5, 5, 8, 11, 13] + // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] + // + auto const num_child_rows{ + cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; + auto per_row_mapping = make_fixed_width_column( + data_type{type_to_id()}, num_child_rows, mask_state::UNALLOCATED, stream); + auto per_row_mapping_begin = per_row_mapping->mutable_view().template begin(); + thrust::fill_n(rmm::exec_policy(stream), per_row_mapping_begin, num_child_rows, 0); + + auto const begin = thrust::make_counting_iterator(0); + thrust::scatter_if(rmm::exec_policy(stream), + begin, + begin + offsets.size() - 1, + offsets.begin(), + begin, // stencil iterator + per_row_mapping_begin, + [offset = offsets.begin()] __device__(auto i) { + return offset[i] != offset[i + 1]; + }); // [0,0,1,0,0,3,...] + + // Next, generate mapping with inclusive_scan(max) on the scatter result. 
+ // For the example above: + // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0] + // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] + // + // For the case with an empty list at index 2: + // scatter result == [0, 0, 1, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0] + // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] + thrust::inclusive_scan(rmm::exec_policy(stream), + per_row_mapping_begin, + per_row_mapping_begin + num_child_rows, + per_row_mapping_begin, + thrust::maximum{}); + return per_row_mapping; +} + +/** + * @see cudf::detail::count_child_nulls + */ +size_type count_child_nulls(column_view const& input, + std::unique_ptr const& gather_map, + rmm::cuda_stream_view stream) +{ + auto input_device_view = column_device_view::create(input, stream); + + auto input_row_is_null = [d_input = *input_device_view] __device__(auto i) { + return d_input.is_null_nocheck(i); + }; + + return thrust::count_if(rmm::exec_policy(stream), + gather_map->view().begin(), + gather_map->view().end(), + input_row_is_null); +} + +/** + * @see cudf::detail::rolling_collect_list + */ +std::pair, std::unique_ptr> purge_null_entries( + column_view const& input, + column_view const& gather_map, + column_view const& offsets, + size_type num_child_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto input_device_view = column_device_view::create(input, stream); + + auto input_row_not_null = [d_input = *input_device_view] __device__(auto i) { + return d_input.is_valid_nocheck(i); + }; + + // Purge entries in gather_map that correspond to null input. + auto new_gather_map = make_fixed_width_column(data_type{type_to_id()}, + gather_map.size() - num_child_nulls, + mask_state::UNALLOCATED, + stream); + thrust::copy_if(rmm::exec_policy(stream), + gather_map.template begin(), + gather_map.template end(), + new_gather_map->mutable_view().template begin(), + input_row_not_null); + + // Recalculate offsets after null entries are purged. 
+ auto new_sizes = make_fixed_width_column( + data_type{type_to_id()}, input.size(), mask_state::UNALLOCATED, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + new_sizes->mutable_view().template begin(), + [d_gather_map = gather_map.template begin(), + d_old_offsets = offsets.template begin(), + input_row_not_null] __device__(auto i) { + return thrust::count_if(thrust::seq, + d_gather_map + d_old_offsets[i], + d_gather_map + d_old_offsets[i + 1], + input_row_not_null); + }); + + auto new_offsets = + strings::detail::make_offsets_child_column(new_sizes->view().template begin(), + new_sizes->view().template end(), + stream, + mr); + + return std::make_pair, std::unique_ptr>(std::move(new_gather_map), + std::move(new_offsets)); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/rolling/rolling_collect_list.cuh b/cpp/src/rolling/rolling_collect_list.cuh index 0ffafe349b9..95eb1a124c6 100644 --- a/cpp/src/rolling/rolling_collect_list.cuh +++ b/cpp/src/rolling/rolling_collect_list.cuh @@ -16,24 +16,20 @@ #pragma once -#include #include #include -#include -#include -#include +#include #include #include #include #include -#include +#include namespace cudf { namespace detail { -namespace { /** * @brief Creates the offsets child of the result of the `COLLECT_LIST` window aggregation * @@ -97,73 +93,7 @@ std::unique_ptr create_collect_offsets(size_type input_size, * Mapping back to `input` == [0,1,0,1,2,1,2,3,2,3,4,3,4] */ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view const& offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto static constexpr size_data_type = data_type{type_to_id()}; - - // First, reduce offsets column by key, to identify the number of times - // an offset appears. - // Next, scatter the count for each offset (except the first and last), - // into a column of N `0`s, where N == number of child rows. - // For the example above: - // offsets == [0, 2, 5, 8, 11, 13] - // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] - // - // If the above example had an empty list row at index 2, - // the same columns would look as follows: - // offsets == [0, 2, 5, 5, 8, 11, 13] - // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] - // - // Note: To correctly handle null list rows at the beginning of - // the output column, care must be taken to skip the first `0` - // in the offsets column, when running `reduce_by_key()`. - // This accounts for the `0` added by default to the offsets - // column, marking the beginning of the column. - - auto const num_child_rows{ - cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; - - auto scatter_values = - make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); - auto scatter_keys = - make_fixed_width_column(size_data_type, offsets.size(), mask_state::UNALLOCATED, stream, mr); - auto reduced_by_key = - thrust::reduce_by_key(rmm::exec_policy(stream), - offsets.template begin() + 1, // Skip first 0 in offsets. 
- offsets.template end(), - thrust::make_constant_iterator(1), - scatter_keys->mutable_view().template begin(), - scatter_values->mutable_view().template begin()); - auto scatter_values_end = reduced_by_key.second; - auto scatter_output = - make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); - thrust::fill_n(rmm::exec_policy(stream), - scatter_output->mutable_view().template begin(), - num_child_rows, - 0); // [0,0,0,...0] - thrust::scatter(rmm::exec_policy(stream), - scatter_values->mutable_view().template begin(), - scatter_values_end, - scatter_keys->view().template begin(), - scatter_output->mutable_view().template begin()); // [0,0,1,0,0,1,...] - - // Next, generate mapping with inclusive_scan() on scatter() result. - // For the example above: - // scatter result == [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0] - // inclusive_scan == [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4] - // - // For the case with an empty list at index 3: - // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] - // inclusive_scan == [0, 0, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5] - auto per_row_mapping = - make_fixed_width_column(size_data_type, num_child_rows, mask_state::UNALLOCATED, stream, mr); - thrust::inclusive_scan(rmm::exec_policy(stream), - scatter_output->view().template begin(), - scatter_output->view().template end(), - per_row_mapping->mutable_view().template begin()); - return per_row_mapping; -} + rmm::cuda_stream_view stream); /** * @brief Create gather map to generate the child column of the result of @@ -173,14 +103,10 @@ template std::unique_ptr create_collect_gather_map(column_view const& child_offsets, column_view const& per_row_mapping, PrecedingIter preceding_iter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream) { - auto gather_map = make_fixed_width_column(data_type{type_to_id()}, - per_row_mapping.size(), - mask_state::UNALLOCATED, - stream, - mr); + auto gather_map = make_fixed_width_column( + data_type{type_to_id()}, per_row_mapping.size(), mask_state::UNALLOCATED, stream); thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -205,19 +131,7 @@ std::unique_ptr create_collect_gather_map(column_view const& child_offse */ size_type count_child_nulls(column_view const& input, std::unique_ptr const& gather_map, - rmm::cuda_stream_view stream) -{ - auto input_device_view = column_device_view::create(input, stream); - - auto input_row_is_null = [d_input = *input_device_view] __device__(auto i) { - return d_input.is_null_nocheck(i); - }; - - return thrust::count_if(rmm::exec_policy(stream), - gather_map->view().template begin(), - gather_map->view().template end(), - input_row_is_null); -} + rmm::cuda_stream_view stream); /** * @brief Purge entries for null inputs from gather_map, and adjust offsets. @@ -228,54 +142,7 @@ std::pair, std::unique_ptr> purge_null_entries( column_view const& offsets, size_type num_child_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto input_device_view = column_device_view::create(input, stream); - - auto input_row_not_null = [d_input = *input_device_view] __device__(auto i) { - return d_input.is_valid_nocheck(i); - }; - - // Purge entries in gather_map that correspond to null input. 
- auto new_gather_map = make_fixed_width_column(data_type{type_to_id()}, - gather_map.size() - num_child_nulls, - mask_state::UNALLOCATED, - stream, - mr); - thrust::copy_if(rmm::exec_policy(stream), - gather_map.template begin(), - gather_map.template end(), - new_gather_map->mutable_view().template begin(), - input_row_not_null); - - // Recalculate offsets after null entries are purged. - auto new_sizes = make_fixed_width_column( - data_type{type_to_id()}, input.size(), mask_state::UNALLOCATED, stream, mr); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - new_sizes->mutable_view().template begin(), - [d_gather_map = gather_map.template begin(), - d_old_offsets = offsets.template begin(), - input_row_not_null] __device__(auto i) { - return thrust::count_if(thrust::seq, - d_gather_map + d_old_offsets[i], - d_gather_map + d_old_offsets[i + 1], - input_row_not_null); - }); - - auto new_offsets = - strings::detail::make_offsets_child_column(new_sizes->view().template begin(), - new_sizes->view().template end(), - stream, - mr); - - return std::make_pair, std::unique_ptr>(std::move(new_gather_map), - std::move(new_offsets)); -} - -} // anonymous namespace + rmm::mr::device_memory_resource* mr); template std::unique_ptr rolling_collect_list(column_view const& input, @@ -313,11 +180,11 @@ std::unique_ptr rolling_collect_list(column_view const& input, // Map each element of the collect() result's child column // to the index where it appears in the input. - auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream, mr); + auto per_row_mapping = get_list_child_to_list_row_mapping(offsets->view(), stream); // Generate gather map to produce the collect() result's child column. - auto gather_map = create_collect_gather_map( - offsets->view(), per_row_mapping->view(), preceding_begin, stream, mr); + auto gather_map = + create_collect_gather_map(offsets->view(), per_row_mapping->view(), preceding_begin, stream); // If gather_map collects null elements, and null_policy == EXCLUDE, // those elements must be filtered out, and offsets recomputed. @@ -330,8 +197,12 @@ std::unique_ptr rolling_collect_list(column_view const& input, } // gather(), to construct child column. - auto gather_output = - cudf::gather(table_view{std::vector{input}}, gather_map->view()); + auto gather_output = cudf::detail::gather(table_view{std::vector{input}}, + gather_map->view(), + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); rmm::device_buffer null_mask; size_type null_count; From 332dedf0ff6cf61e9426b5e1958a2f19f5eebb02 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 23 Aug 2021 20:04:23 +0530 Subject: [PATCH 04/20] Enable compiled binary ops in libcudf, python and java (#8741) closes https://github.com/rapidsai/cudf/issues/7801 `cudf::binary_operation` calls compiled binary ops. `cudf::jit::binary_operation` calls jit binary ops So, compiled binary ops is called in libcudf (groupby, rescale), python (binary ops) and java (binary ops) **Breaking change:** New: Logical and Comparison operators can have output type to be only bool type. Old: Logical operators can have integer or any other output type that can be constructed from bool type. Comparison operators required bool type only. 
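For example, a logical operator that previously produced an integer column now has to request a boolean output; anything else throws `cudf::logic_error`. A minimal sketch (illustrative only, not taken from this changeset; the helper name is hypothetical):

    #include <cudf/binaryop.hpp>
    #include <cudf/column/column.hpp>
    #include <cudf/column/column_view.hpp>
    #include <memory>

    std::unique_ptr<cudf::column> logical_and(cudf::column_view const& lhs,
                                              cudf::column_view const& rhs)
    {
      // Compiled logical/comparison operators only accept a boolean output type.
      return cudf::binary_operation(
          lhs, rhs, cudf::binary_operator::LOGICAL_AND, cudf::data_type{cudf::type_id::BOOL8});
    }

Callers that relied on an integer-typed result from a logical operator now need a separate cast of the boolean output.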
In this release (21.10), `experimental` namespace is dropped, and compiled binary ops replaces jit binary ops in libcudf, except for user defined op. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Charles Blackmon-Luca (https://github.com/charlesbluca) - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - GALI PREM SAGAR (https://github.com/galipremsagar) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8741 --- .../binaryop/binaryop_benchmark.cpp | 8 +- .../binaryop/compiled_binaryop_benchmark.cpp | 4 +- .../binaryop/jit_binaryop_benchmark.cpp | 4 +- cpp/include/cudf/binaryop.hpp | 28 +- cpp/include/cudf/detail/binaryop.hpp | 51 ++- cpp/src/binaryop/binaryop.cpp | 358 +++++++++-------- cpp/src/binaryop/compiled/binary_ops.cuh | 4 +- cpp/src/binaryop/compiled/binary_ops.hpp | 23 +- cpp/src/binaryop/compiled/util.cpp | 3 +- .../binop-compiled-fixed_point-test.cpp | 146 +++---- cpp/tests/binaryop/binop-compiled-test.cpp | 71 +++- cpp/tests/binaryop/binop-integration-test.cpp | 377 +++++++++--------- cpp/tests/binaryop/binop-null-test.cpp | 32 +- .../binaryop/binop-verify-input-test.cpp | 12 +- cpp/tests/fixed_point/fixed_point_tests.cpp | 12 +- python/cudf/cudf/_lib/cpp/binaryop.pxd | 24 ++ python/cudf/cudf/core/column/numerical.py | 12 +- 17 files changed, 640 insertions(+), 529 deletions(-) diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp index 314d657679b..9de1112a9db 100644 --- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp @@ -74,14 +74,14 @@ static void BM_binaryop_transform(benchmark::State& state) auto const op = cudf::binary_operator::ADD; auto result_data_type = cudf::data_type(cudf::type_to_id()); if (reuse_columns) { - auto result = cudf::binary_operation(columns.at(0), columns.at(0), op, result_data_type); + auto result = cudf::jit::binary_operation(columns.at(0), columns.at(0), op, result_data_type); for (cudf::size_type i = 0; i < tree_levels - 1; i++) { - result = cudf::binary_operation(result->view(), columns.at(0), op, result_data_type); + result = cudf::jit::binary_operation(result->view(), columns.at(0), op, result_data_type); } } else { - auto result = cudf::binary_operation(columns.at(0), columns.at(1), op, result_data_type); + auto result = cudf::jit::binary_operation(columns.at(0), columns.at(1), op, result_data_type); std::for_each(std::next(columns.cbegin(), 2), columns.cend(), [&](auto const& col) { - result = cudf::binary_operation(result->view(), col, op, result_data_type); + result = cudf::jit::binary_operation(result->view(), col, op, result_data_type); }); } } diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp index aa86f3bedf8..bc0818ace4b 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -41,11 +41,11 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) auto output_dtype = cudf::data_type(cudf::type_to_id()); // Call once for hot cache. 
- cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype); + cudf::binary_operation(lhs, rhs, binop, output_dtype); for (auto _ : state) { cuda_event_timer timer(state, true); - cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype); + cudf::binary_operation(lhs, rhs, binop, output_dtype); } } diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp index 3c02f47eeb7..7fda4a50ea1 100644 --- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp @@ -41,11 +41,11 @@ void BM_binaryop(benchmark::State& state, cudf::binary_operator binop) auto output_dtype = cudf::data_type(cudf::type_to_id()); // Call once for hot cache. - cudf::binary_operation(lhs, rhs, binop, output_dtype); + cudf::jit::binary_operation(lhs, rhs, binop, output_dtype); for (auto _ : state) { cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); + cudf::jit::binary_operation(lhs, rhs, binop, output_dtype); } } diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index e6ff6b0eadc..fe548a36cf0 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -82,7 +82,7 @@ enum class binary_operator : int32_t { * This distinction is significant in case of non-commutative binary operations * * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands + * AND of the validity of the two operands except NullMin and NullMax (logical OR). * * @param lhs The left operand scalar * @param rhs The right operand column @@ -92,6 +92,8 @@ enum class binary_operator : int32_t { * @return Output column of `output_type` type containing the result of * the binary operation * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. */ std::unique_ptr binary_operation( scalar const& lhs, @@ -108,7 +110,7 @@ std::unique_ptr binary_operation( * This distinction is significant in case of non-commutative binary operations * * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands + * AND of the validity of the two operands except NullMin and NullMax (logical OR). * * @param lhs The left operand column * @param rhs The right operand scalar @@ -118,6 +120,8 @@ std::unique_ptr binary_operation( * @return Output column of `output_type` type containing the result of * the binary operation * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. */ std::unique_ptr binary_operation( column_view const& lhs, @@ -132,7 +136,7 @@ std::unique_ptr binary_operation( * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()` * * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands + * AND of the validity of the two operands except NullMin and NullMax (logical OR). 
* * @param lhs The left operand column * @param rhs The right operand column @@ -142,6 +146,8 @@ std::unique_ptr binary_operation( * @return Output column of `output_type` type containing the result of * the binary operation * @throw cudf::logic_error if @p lhs and @p rhs are different sizes + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. * @throw cudf::logic_error if @p output_type dtype isn't fixed-width */ std::unique_ptr binary_operation( @@ -204,7 +210,7 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, cudf::data_type const& lhs, cudf::data_type const& rhs); -namespace experimental { +namespace jit { /** * @brief Performs a binary operation between a scalar and a column. * @@ -213,7 +219,7 @@ namespace experimental { * This distinction is significant in case of non-commutative binary operations * * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * AND of the validity of the two operands * * @param lhs The left operand scalar * @param rhs The right operand column @@ -223,8 +229,6 @@ namespace experimental { * @return Output column of `output_type` type containing the result of * the binary operation * @throw cudf::logic_error if @p output_type dtype isn't fixed-width - * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical - * operations. */ std::unique_ptr binary_operation( scalar const& lhs, @@ -241,7 +245,7 @@ std::unique_ptr binary_operation( * This distinction is significant in case of non-commutative binary operations * * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * AND of the validity of the two operands * * @param lhs The left operand column * @param rhs The right operand scalar @@ -251,8 +255,6 @@ std::unique_ptr binary_operation( * @return Output column of `output_type` type containing the result of * the binary operation * @throw cudf::logic_error if @p output_type dtype isn't fixed-width - * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical - * operations. */ std::unique_ptr binary_operation( column_view const& lhs, @@ -267,7 +269,7 @@ std::unique_ptr binary_operation( * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()` * * Regardless of the operator, the validity of the output value is the logical - * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * AND of the validity of the two operands * * @param lhs The left operand column * @param rhs The right operand column @@ -277,8 +279,6 @@ std::unique_ptr binary_operation( * @return Output column of `output_type` type containing the result of * the binary operation * @throw cudf::logic_error if @p lhs and @p rhs are different sizes - * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical - * operations. 
* @throw cudf::logic_error if @p output_type dtype isn't fixed-width */ std::unique_ptr binary_operation( @@ -287,6 +287,6 @@ std::unique_ptr binary_operation( binary_operator op, data_type output_type, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace experimental +} // namespace jit /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index c12482967e1..ce7731ef7d2 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -22,8 +22,9 @@ namespace cudf { //! Inner interfaces and implementations namespace detail { +namespace jit { /** - * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator, + * @copydoc cudf::jit::binary_operation(scalar const&, column_view const&, binary_operator, * data_type, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. @@ -37,7 +38,7 @@ std::unique_ptr binary_operation( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator, + * @copydoc cudf::jit::binary_operation(column_view const&, scalar const&, binary_operator, * data_type, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. @@ -51,7 +52,7 @@ std::unique_ptr binary_operation( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::binary_operation(column_view const&, column_view const&, + * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&, * binary_operator, data_type, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. @@ -63,9 +64,10 @@ std::unique_ptr binary_operation( data_type output_type, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace jit /** - * @copydoc cudf::binary_operation(column_view const&, column_view const&, + * @copydoc cudf::jit::binary_operation(column_view const&, column_view const&, * std::string const&, data_type, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. @@ -78,5 +80,46 @@ std::unique_ptr binary_operation( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator, + * data_type, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr binary_operation( + scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator, + * data_type, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr binary_operation( + column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::binary_operation(column_view const&, column_view const&, + * binary_operator, data_type, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr binary_operation( + column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index aaf193ff5cf..a1b00a4cd6b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -47,9 +47,7 @@ #include namespace cudf { - namespace binops { -namespace detail { /** * @brief Computes output valid mask for op between a column and a scalar @@ -69,7 +67,63 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, return rmm::device_buffer{0, stream, mr}; } } -} // namespace detail + +/** + * @brief Does the binop need to know if an operand is null/invalid to perform special + * processing? + */ +inline bool is_null_dependent(binary_operator op) +{ + return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN || + op == binary_operator::NULL_MAX; +} + +/** + * @brief Returns `true` if `binary_operator` `op` is a basic arithmetic binary operation + */ +bool is_basic_arithmetic_binop(binary_operator op) +{ + return op == binary_operator::ADD or // operator + + op == binary_operator::SUB or // operator - + op == binary_operator::MUL or // operator * + op == binary_operator::DIV or // operator / using common type of lhs and rhs + op == binary_operator::NULL_MIN or // 2 null = null, 1 null = value, else min + op == binary_operator::NULL_MAX; // 2 null = null, 1 null = value, else max +} + +/** + * @brief Returns `true` if `binary_operator` `op` is a comparison binary operation + */ +bool is_comparison_binop(binary_operator op) +{ + return op == binary_operator::EQUAL or // operator == + op == binary_operator::NOT_EQUAL or // operator != + op == binary_operator::LESS or // operator < + op == binary_operator::GREATER or // operator > + op == binary_operator::LESS_EQUAL or // operator <= + op == binary_operator::GREATER_EQUAL or // operator >= + op == binary_operator::NULL_EQUALS; // 2 null = true; 1 null = false; else == +} + +/** + * @brief Returns `true` if `binary_operator` `op` is supported by `fixed_point` + */ +bool is_supported_fixed_point_binop(binary_operator op) +{ + return is_basic_arithmetic_binop(op) or is_comparison_binop(op); +} + +/** + * @brief Helper predicate function that identifies if `op` requires scales to be the same + * + * @param op `binary_operator` + * @return true `op` requires scales of lhs and rhs to be the same + * @return false `op` does not require scales of lhs and rhs to be the same + */ +bool is_same_scale_necessary(binary_operator op) +{ + return op != binary_operator::MUL && op != binary_operator::DIV; +} namespace jit { @@ -208,8 +262,47 @@ void binary_operation(mutable_column_view& out, cudf::jit::get_data_ptr(lhs), cudf::jit::get_data_ptr(rhs)); } - } // namespace jit + +// Compiled Binary operation 
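The `compiled` namespace introduced next hosts the dispatch that the public API now uses by default: it checks that column operands have matching sizes, routes STRING NULL_MAX/NULL_MIN to `string_null_min_max`, verifies the type combination with `is_supported_operation`, and otherwise allocates the output column and launches the type-dispatched kernels. As a hedged usage sketch (the column contents are made up for illustration), null-aware string max can therefore be requested straight through `cudf::binary_operation`:

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf_test/column_wrapper.hpp>

// Illustrative only: NULL_MAX over two nullable STRING columns now runs
// through the compiled string_null_min_max specialization when called via
// the default cudf::binary_operation entry point.
void null_max_strings_example()
{
  cudf::test::strings_column_wrapper lhs({"a", "", "c"}, {true, false, true});
  cudf::test::strings_column_wrapper rhs({"x", "y", ""}, {true, true, false});

  auto result = cudf::binary_operation(lhs,
                                       rhs,
                                       cudf::binary_operator::NULL_MAX,
                                       cudf::data_type{cudf::type_id::STRING});
  // result is a STRING column: a null on one side yields the other side's
  // value; two nulls yield null.
}
```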
+namespace compiled { +/** + * @copydoc cudf::binary_operation(column_view const&, column_view const&, + * binary_operator, data_type, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +template +std::unique_ptr binary_operation(LhsType const& lhs, + RhsType const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if constexpr (std::is_same_v and std::is_same_v) + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + + if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and + output_type.id() == type_id::STRING and + (op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN)) + return cudf::binops::compiled::string_null_min_max(lhs, rhs, op, output_type, stream, mr); + + if (not cudf::binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op)) + CUDF_FAIL("Unsupported operator for these types"); + + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); + + if constexpr (std::is_same_v) + if (lhs.is_empty()) return out; + if constexpr (std::is_same_v) + if (rhs.is_empty()) return out; + + auto out_view = out->mutable_view(); + cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream); + return out; +} +} // namespace compiled } // namespace binops namespace detail { @@ -245,7 +338,7 @@ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto new_mask = binops::detail::scalar_col_valid_mask_and(rhs, lhs, stream, mr); + auto new_mask = binops::scalar_col_valid_mask_and(rhs, lhs, stream, mr); return make_fixed_width_column( output_type, rhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); } @@ -272,7 +365,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, lhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto new_mask = binops::detail::scalar_col_valid_mask_and(lhs, rhs, stream, mr); + auto new_mask = binops::scalar_col_valid_mask_and(lhs, rhs, stream, mr); return make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); } @@ -305,53 +398,6 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh } }; -/** - * @brief Returns `true` if `binary_operator` `op` is a basic arithmetic binary operation - */ -bool is_basic_arithmetic_binop(binary_operator op) -{ - return op == binary_operator::ADD or // operator + - op == binary_operator::SUB or // operator - - op == binary_operator::MUL or // operator * - op == binary_operator::DIV or // operator / using common type of lhs and rhs - op == binary_operator::NULL_MIN or // 2 null = null, 1 null = value, else min - op == binary_operator::NULL_MAX; // 2 null = null, 1 null = value, else max -} - -/** - * @brief Returns `true` if `binary_operator` `op` is a comparison binary operation - */ -bool is_comparison_binop(binary_operator op) -{ - return op == binary_operator::EQUAL or // operator == - op == binary_operator::NOT_EQUAL or // operator != - op == binary_operator::LESS or // operator < - op == binary_operator::GREATER or // operator > - op == binary_operator::LESS_EQUAL or // operator <= - op == 
binary_operator::GREATER_EQUAL or // operator >= - op == binary_operator::NULL_EQUALS; // 2 null = true; 1 null = false; else == -} - -/** - * @brief Returns `true` if `binary_operator` `op` is supported by `fixed_point` - */ -bool is_supported_fixed_point_binop(binary_operator op) -{ - return is_basic_arithmetic_binop(op) or is_comparison_binop(op); -} - -/** - * @brief Helper predicate function that identifies if `op` requires scales to be the same - * - * @param op `binary_operator` - * @return true `op` requires scales of lhs and rhs to be the same - * @return false `op` does not require scales of lhs and rhs to be the same - */ -bool is_same_scale_necessary(binary_operator op) -{ - return op != binary_operator::MUL && op != binary_operator::DIV; -} - template void fixed_point_binary_operation_validation(binary_operator op, Lhs lhs, @@ -360,10 +406,11 @@ void fixed_point_binary_operation_validation(binary_operator op, { CUDF_EXPECTS(is_fixed_point(lhs), "Input must have fixed_point data_type."); CUDF_EXPECTS(is_fixed_point(rhs), "Input must have fixed_point data_type."); - CUDF_EXPECTS(is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation"); + CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op), + "Unsupported fixed_point binary operation"); CUDF_EXPECTS(lhs.id() == rhs.id(), "Data type mismatch"); if (output_type.has_value()) { - if (is_comparison_binop(op)) + if (binops::is_comparison_binop(op)) CUDF_EXPECTS(output_type == cudf::data_type{type_id::BOOL8}, "Comparison operations require boolean output type."); else @@ -372,6 +419,7 @@ void fixed_point_binary_operation_validation(binary_operator op, } } +namespace jit { /** * @brief Function to compute binary operation of one `column_view` and one `scalar` * @@ -397,12 +445,12 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale()); - auto const type = - is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{rhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); - auto out_view = out->mutable_view(); + auto const type = binops::is_comparison_binop(op) ? 
data_type{type_id::BOOL8} + : cudf::data_type{rhs.type().id(), scale}; + auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); + auto out_view = out->mutable_view(); - if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) { + if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) { // Adjust scalar/column so they have they same scale if (rhs.type().scale() < lhs.type().scale()) { auto const diff = lhs.type().scale() - rhs.type().scale(); @@ -426,12 +474,12 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, if (lhs.type().id() == type_id::DECIMAL32) { auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); + return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); + return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); @@ -467,12 +515,12 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale()); - auto const type = - is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); - auto out_view = out->mutable_view(); + auto const type = binops::is_comparison_binop(op) ? 
data_type{type_id::BOOL8} + : cudf::data_type{lhs.type().id(), scale}; + auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); + auto out_view = out->mutable_view(); - if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) { + if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) { // Adjust scalar/column so they have they same scale if (rhs.type().scale() > lhs.type().scale()) { auto const diff = rhs.type().scale() - lhs.type().scale(); @@ -496,12 +544,12 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, if (rhs.type().id() == type_id::DECIMAL32) { auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); + return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } else { CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); + return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); @@ -537,24 +585,24 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, return make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); auto const scale = binary_operation_fixed_point_scale(op, lhs.type().scale(), rhs.type().scale()); - auto const type = - is_comparison_binop(op) ? data_type{type_id::BOOL8} : cudf::data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); - auto out_view = out->mutable_view(); + auto const type = binops::is_comparison_binop(op) ? 
data_type{type_id::BOOL8} + : cudf::data_type{lhs.type().id(), scale}; + auto out = make_fixed_width_column_for_output(lhs, rhs, op, type, stream, mr); + auto out_view = out->mutable_view(); - if (lhs.type().scale() != rhs.type().scale() && is_same_scale_necessary(op)) { + if (lhs.type().scale() != rhs.type().scale() && binops::is_same_scale_necessary(op)) { if (rhs.type().scale() < lhs.type().scale()) { auto const diff = lhs.type().scale() - rhs.type().scale(); auto const result = [&] { if (lhs.type().id() == type_id::DECIMAL32) { auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); + return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); + return jit::binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); @@ -564,12 +612,12 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, if (lhs.type().id() == type_id::DECIMAL32) { auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); + return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{-diff}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); + return jit::binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); @@ -587,8 +635,9 @@ std::unique_ptr binary_operation(scalar const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // calls compiled ops for string types if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return experimental::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); @@ -614,8 +663,9 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // calls compiled ops for string types if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return experimental::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); @@ -643,8 +693,9 @@ std::unique_ptr binary_operation(column_view const& lhs, { CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + // calls compiled ops for string types if (lhs.type().id() == type_id::STRING and rhs.type().id() == 
type_id::STRING) - return experimental::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); @@ -662,6 +713,72 @@ std::unique_ptr binary_operation(column_view const& lhs, binops::jit::binary_operation(out_view, lhs, rhs, op, stream); return out; } +} // namespace jit +} // namespace detail + +namespace jit { +std::unique_ptr binary_operation(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); +} + +std::unique_ptr binary_operation(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); +} + +std::unique_ptr binary_operation(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::jit::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); +} +} // namespace jit + +namespace detail { +std::unique_ptr binary_operation(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return binops::compiled::binary_operation( + lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); +} +std::unique_ptr binary_operation(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return binops::compiled::binary_operation( + lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); +} +std::unique_ptr binary_operation(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return binops::compiled::binary_operation( + lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); +} std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, @@ -693,14 +810,13 @@ std::unique_ptr binary_operation(column_view const& lhs, binops::jit::binary_operation(out_view, lhs, rhs, ptx, stream); return out; } - } // namespace detail int32_t binary_operation_fixed_point_scale(binary_operator op, int32_t left_scale, int32_t right_scale) { - CUDF_EXPECTS(cudf::detail::is_supported_fixed_point_binop(op), + CUDF_EXPECTS(binops::is_supported_fixed_point_binop(op), "Unsupported fixed_point binary operation."); if (op == binary_operator::MUL) return left_scale + right_scale; if (op == binary_operator::DIV) return left_scale - right_scale; @@ -726,7 +842,6 @@ std::unique_ptr binary_operation(scalar const& lhs, CUDF_FUNC_RANGE(); return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } - std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, @@ -736,7 +851,6 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_FUNC_RANGE(); return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } - std::unique_ptr binary_operation(column_view 
const& lhs, column_view const& rhs, binary_operator op, @@ -757,78 +871,4 @@ std::unique_ptr binary_operation(column_view const& lhs, return detail::binary_operation(lhs, rhs, ptx, output_type, rmm::cuda_stream_default, mr); } -// Experimental Compiled Binary operation -namespace experimental { -namespace detail { -/** - * @copydoc cudf::experimental::binary_operation(column_view const&, column_view const&, - * binary_operator, data_type, rmm::mr::device_memory_resource*) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -template -std::unique_ptr binary_operation(LhsType const& lhs, - RhsType const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if constexpr (std::is_same_v and std::is_same_v) - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); - - if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and - output_type.id() == type_id::STRING and - (op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN)) - return binops::compiled::string_null_min_max(lhs, rhs, op, output_type, stream, mr); - - if (not binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op)) - CUDF_FAIL("Unsupported operator for these types"); - - // TODO check if scale conversion required? - // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - // CUDF_FAIL("Not yet supported fixed_point"); - // return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr); - - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); - - if constexpr (std::is_same_v) - if (lhs.is_empty()) return out; - if constexpr (std::is_same_v) - if (rhs.is_empty()) return out; - - auto out_view = out->mutable_view(); - cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream); - return out; -} -} // namespace detail - -std::unique_ptr binary_operation(scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); -} -std::unique_ptr binary_operation(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); -} -std::unique_ptr binary_operation(column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); -} -} // namespace experimental } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index b17f3eddc5d..84147fc9220 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -68,7 +68,9 @@ struct typed_casted_writer { if constexpr (mutable_column_device_view::has_element_accessor() and std::is_constructible_v) { col.element(i) = static_cast(val); - } else if constexpr (is_fixed_point() and std::is_constructible_v) { + } else if constexpr (is_fixed_point() and + (is_fixed_point() or + std::is_constructible_v)) { if constexpr (is_fixed_point()) col.data()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value(); 
else diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 2a814c16d57..cf3a6025847 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -29,26 +29,6 @@ class column_device_view; class mutable_column_device_view; namespace binops { -namespace detail { -/** - * @brief Computes output valid mask for op between a column and a scalar - */ -rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, - scalar const& s, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); -} // namespace detail - -/** - * @brief Does the binop need to know if an operand is null/invalid to perform special - * processing? - */ -inline bool is_null_dependent(binary_operator op) -{ - return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN || - op == binary_operator::NULL_MAX; -} - namespace compiled { std::unique_ptr string_null_min_max( @@ -132,8 +112,7 @@ std::unique_ptr binary_operation( * * @note The sizes of @p lhs and @p rhs should be the same * - * The output contains the result of op(lhs[i], rhs[i]) for all 0 <= i < - * lhs.size() + * The output contains the result of op(lhs[i], rhs[i]) for all 0 <= i < lhs.size() * * Regardless of the operator, the validity of the output value is the logical * AND of the validity of the two operands diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index d6ce4d3edeb..f89941a3d68 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -89,7 +89,8 @@ struct is_binary_operation_supported { using common_t = std::common_type_t; if constexpr (std::is_invocable_v) { using ReturnType = std::invoke_result_t; - return std::is_constructible_v; + return std::is_constructible_v or + (is_fixed_point() and is_fixed_point()); } } else { if constexpr (std::is_invocable_v) { diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index feb75cc3f09..a6477247356 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -68,8 +68,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); } @@ -102,8 +101,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); } @@ -125,8 +123,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::binary_operation(lhs, rhs, 
cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -145,8 +142,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -165,8 +161,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -183,8 +178,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv3) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -204,8 +198,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv4) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -224,8 +217,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -244,8 +236,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd3) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -262,8 +253,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd4) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::ADD, static_cast(lhs).type(), rhs->type()); - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -280,8 +270,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd5) auto const type = 
cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::ADD, lhs->type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -298,10 +287,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd6) auto const expected2 = fp_wrapper{{6, 0, 1, 1, 1, 1}, scale_type{1}}; auto const type1 = cudf::data_type{cudf::type_to_id(), 0}; auto const type2 = cudf::data_type{cudf::type_to_id(), 1}; - auto const result1 = - cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type1); - auto const result2 = - cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type2); + auto const result1 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1); + auto const result2 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); @@ -333,8 +320,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiplyScalar) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::MUL, static_cast(lhs).type(), rhs->type()); - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -353,8 +339,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpSimplePlus) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -370,8 +355,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimple) auto const col2 = fp_wrapper{{100, 200, 300, 400}, scale_type{-2}}; auto const expected = wrapper(trues.begin(), trues.end()); - auto const result = cudf::experimental::binary_operation( - col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = + cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -386,8 +371,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0) auto const col = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; auto const expected = wrapper(trues.begin(), trues.end()); - auto const result = cudf::experimental::binary_operation( - col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = + cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -402,8 +387,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0Nu auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - auto const result = cudf::experimental::binary_operation( - col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + 
auto const result = + cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -418,8 +403,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale2Nu auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - auto const result = cudf::experimental::binary_operation( - col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = + cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -445,8 +430,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualLessGreater) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(iota_3).type(), static_cast(zeros_3).type()); - auto const iota_3_after_add = - cudf::experimental::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); + auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view()); @@ -455,17 +439,17 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualLessGreater) auto const trues = std::vector(sz, true); auto const true_col = wrapper(trues.begin(), trues.end()); - auto const btype = cudf::data_type{type_id::BOOL8}; - auto const equal_result = cudf::experimental::binary_operation( - iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); + auto const btype = cudf::data_type{type_id::BOOL8}; + auto const equal_result = + cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view()); - auto const less_result = cudf::experimental::binary_operation( - zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); + auto const less_result = + cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view()); - auto const greater_result = cudf::experimental::binary_operation( - iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); + auto const greater_result = + cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view()); } @@ -484,8 +468,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMaxSimple) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX, static_cast(col1).type(), static_cast(col2).type()); - auto const result = - cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MAX, type); + auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -505,8 +488,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMinSimple) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN, static_cast(col1).type(), static_cast(col2).type()); - auto const result = - cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MIN, type); + auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -522,7 +504,7 @@ TYPED_TEST(FixedPointCompiledTestBothReps, 
FixedPointBinaryOpNullEqualsSimple) auto const col2 = fp_wrapper{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}}; auto const expected = wrapper{{1, 0, 0, 1}, {1, 1, 1, 1}}; - auto const result = cudf::experimental::binary_operation( + auto const result = cudf::binary_operation( col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); @@ -538,9 +520,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div) auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{0}}; auto const expected = fp_wrapper{{25, 75, 125, 175}, scale_type{-2}}; - auto const type = data_type{type_to_id(), -2}; - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), -2}; + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -555,9 +536,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div2) auto const rhs = fp_wrapper{{20, 20, 20, 20}, scale_type{-1}}; auto const expected = fp_wrapper{{5000, 15000, 25000, 35000}, scale_type{-2}}; - auto const type = data_type{type_to_id(), -2}; - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), -2}; + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -572,9 +552,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div3) auto const rhs = fp_wrapper{{3, 9, 3, 3}, scale_type{0}}; auto const expected = fp_wrapper{{3333, 3333, 16666, 23333}, scale_type{-2}}; - auto const type = data_type{type_to_id(), -2}; - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), -2}; + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -589,9 +568,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div4) auto const rhs = make_fixed_point_scalar(3, scale_type{0}); auto const expected = fp_wrapper{{3, 10, 16, 23}, scale_type{1}}; - auto const type = data_type{type_to_id(), 1}; - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), 1}; + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -607,9 +585,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div6) auto const expected = fp_wrapper{{300, 100, 60, 42}, scale_type{-2}}; - auto const type = data_type{type_to_id(), -2}; - auto const result = - cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), -2}; + auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -625,9 +602,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div7) auto const expected = fp_wrapper{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}}; - auto const type = data_type{type_to_id(), 2}; - auto const result = - cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); + auto 
const type = data_type{type_to_id(), 2}; + auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -642,9 +618,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div8) auto const rhs = make_fixed_point_scalar(5000, scale_type{-3}); auto const expected = fp_wrapper{{0, 1, 16}, scale_type{2}}; - auto const type = data_type{type_to_id(), 2}; - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), 2}; + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -659,9 +634,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div9) auto const rhs = make_fixed_point_scalar(7, scale_type{1}); auto const expected = fp_wrapper{{1, 2, 4}, scale_type{1}}; - auto const type = data_type{type_to_id(), 1}; - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), 1}; + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -676,9 +650,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div10) auto const rhs = make_fixed_point_scalar(7, scale_type{0}); auto const expected = fp_wrapper{{14, 28, 42}, scale_type{1}}; - auto const type = data_type{type_to_id(), 1}; - auto const result = - cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), 1}; + auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -693,9 +666,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div11) auto const rhs = fp_wrapper{{7, 7, 7}, scale_type{0}}; auto const expected = fp_wrapper{{142, 285, 428}, scale_type{1}}; - auto const type = data_type{type_to_id(), 1}; - auto const result = - cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const type = data_type{type_to_id(), 1}; + auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -708,14 +680,8 @@ TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpThrows) auto const col = fp_wrapper{{100, 300, 500, 700}, scale_type{-2}}; auto const non_bool_type = data_type{type_to_id(), -2}; - auto const float_type = data_type{type_id::FLOAT32}; - EXPECT_THROW( - cudf::experimental::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type), - cudf::logic_error); - // Allowed now, but not allowed in jit. 
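The removed comment above points at a behavioral difference between the two paths: a non-decimal output for decimal arithmetic (here MUL into FLOAT32) is accepted by the compiled implementation but rejected by the jit one, while comparison operators still require a BOOL8 output in both, which is what the remaining EXPECT_THROW asserts. A sketch of the now-permitted case, with illustrative decimal32 values:

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf_test/column_wrapper.hpp>

// Sketch only: multiplying two decimal32 columns directly into a FLOAT32
// output does not throw on the compiled path (the jit path rejects it).
std::unique_ptr<cudf::column> decimal_mul_to_float_example()
{
  using numeric::scale_type;
  cudf::test::fixed_point_column_wrapper<int32_t> col({100, 300, 500},
                                                      scale_type{-2});
  return cudf::binary_operation(col,
                                col,
                                cudf::binary_operator::MUL,
                                cudf::data_type{cudf::type_id::FLOAT32});
}
```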
- // EXPECT_THROW(cudf::experimental::binary_operation(col, col, cudf::binary_operator::MUL, - // float_type), - // cudf::logic_error); + EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type), + cudf::logic_error); } } // namespace cudf::test::binop diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 081ae41fef1..25d2f1d2c24 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -79,15 +79,24 @@ struct BinaryOperationCompiledTest : public BinaryOperationTest { auto lhs = lhs_random_column(col_size); auto rhs = rhs_random_column(col_size); - auto out = cudf::experimental::binary_operation(lhs, rhs, op, data_type(type_to_id())); + auto out = cudf::binary_operation(lhs, rhs, op, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, OPERATOR()); auto s_lhs = this->template make_random_wrapped_scalar(); auto s_rhs = this->template make_random_wrapped_scalar(); + s_lhs.set_valid_async(true); + s_rhs.set_valid_async(true); - out = cudf::experimental::binary_operation(lhs, s_rhs, op, data_type(type_to_id())); + out = cudf::binary_operation(lhs, s_rhs, op, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, s_rhs, OPERATOR()); - out = cudf::experimental::binary_operation(s_lhs, rhs, op, data_type(type_to_id())); + out = cudf::binary_operation(s_lhs, rhs, op, data_type(type_to_id())); + ASSERT_BINOP(*out, s_lhs, rhs, OPERATOR()); + + s_lhs.set_valid_async(false); + s_rhs.set_valid_async(false); + out = cudf::binary_operation(lhs, s_rhs, op, data_type(type_to_id())); + ASSERT_BINOP(*out, lhs, s_rhs, OPERATOR()); + out = cudf::binary_operation(s_lhs, rhs, op, data_type(type_to_id())); ASSERT_BINOP(*out, s_lhs, rhs, OPERATOR()); } }; @@ -305,8 +314,8 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, Pow_Vector_Vector) }(); auto rhs = rhs_random_column(100); - auto out = cudf::experimental::binary_operation( - lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); + auto out = + cudf::binary_operation(lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, POW(), NearEqualComparator{2}); } @@ -333,7 +342,7 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, LogBase_Vector_Vector) auto rhs_elements = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 7; }); fixed_width_column_wrapper rhs(rhs_elements, rhs_elements + 50); - auto out = cudf::experimental::binary_operation( + auto out = cudf::binary_operation( lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); @@ -355,7 +364,7 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, ATan2_Vector_Vector) auto lhs = lhs_random_column(col_size); auto rhs = rhs_random_column(col_size); - auto out = cudf::experimental::binary_operation( + auto out = cudf::binary_operation( lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ATAN2(), NearEqualComparator{2}); @@ -519,6 +528,11 @@ struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest< }; TYPED_TEST_CASE(BinaryOperationCompiledTest_NullOps, Null_types); +template +using column_wrapper = std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; + template auto NullOp_Result(column_view lhs, column_view rhs) { @@ -537,8 +551,7 @@ auto NullOp_Result(column_view lhs, column_view rhs) result_mask.push_back(output_valid); return result; }); - return 
cudf::test::fixed_width_column_wrapper( - result.cbegin(), result.cend(), result_mask.cbegin()); + return column_wrapper(result.cbegin(), result.cend(), result_mask.cbegin()); } TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector) @@ -552,7 +565,7 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector) auto rhs = rhs_random_column(col_size); auto const expected = NullOp_Result(lhs, rhs); - auto const result = cudf::experimental::binary_operation( + auto const result = cudf::binary_operation( lhs, rhs, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -570,7 +583,7 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullEquals_Vector_Vector) auto rhs = rhs_random_column(col_size); auto const expected = NullOp_Result(lhs, rhs); - auto const result = cudf::experimental::binary_operation( + auto const result = cudf::binary_operation( lhs, rhs, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -586,7 +599,7 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullMax_Vector_Vector) auto rhs = rhs_random_column(col_size); auto const expected = NullOp_Result(lhs, rhs); - auto const result = cudf::experimental::binary_operation( + auto const result = cudf::binary_operation( lhs, rhs, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -602,9 +615,41 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullMin_Vector_Vector) auto rhs = rhs_random_column(col_size); auto const expected = NullOp_Result(lhs, rhs); - auto const result = cudf::experimental::binary_operation( + auto const result = cudf::binary_operation( lhs, rhs, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMax_Vector_Vector) +{ + using TypeOut = std::string; + using TypeLhs = std::string; + using TypeRhs = std::string; + using NULL_MAX = cudf::library::operation::NullMax; + + auto lhs = lhs_random_column(col_size); + auto rhs = rhs_random_column(col_size); + auto const expected = NullOp_Result(lhs, rhs); + + auto const result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + +TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector) +{ + using TypeOut = std::string; + using TypeLhs = std::string; + using TypeRhs = std::string; + using NULL_MIN = cudf::library::operation::NullMin; + + auto lhs = lhs_random_column(col_size); + auto rhs = rhs_random_column(col_size); + auto const expected = NullOp_Result(lhs, rhs); + + auto const result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view()); +} + } // namespace cudf::test::binop diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp index 68a8845132b..ec011a84037 100644 --- a/cpp/tests/binaryop/binop-integration-test.cpp +++ b/cpp/tests/binaryop/binop-integration-test.cpp @@ -53,8 +53,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_SI32_FP32_SI64) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(10000); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, 
data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -70,8 +70,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP32_FP32) auto lhs = make_random_wrapped_column(10000); auto rhs = make_random_wrapped_column(10000); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -87,8 +87,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_SI32_FP32_FP32) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(10000); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SUB()); } @@ -103,8 +103,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_SI08_SI16_SI32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_scalar(); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -119,8 +119,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_SI32_FP64_SI08) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -135,8 +135,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SUB()); } @@ -152,8 +152,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_SI64_FP64_SI32) auto lhs = make_random_wrapped_column(10000); auto rhs = make_random_wrapped_scalar(); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SUB()); } @@ -168,8 +168,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Vector_TimepointD_DurationS_Ti auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SUB()); } @@ -184,8 +184,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Vector_Scalar_TimepointD_TimepointS_D auto lhs = make_random_wrapped_column(100); auto rhs = cudf::scalar_type_t(typename TypeRhs::duration{34}, true); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, 
cudf::binary_operator::SUB, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SUB()); } @@ -200,8 +200,8 @@ TEST_F(BinaryOperationIntegrationTest, Sub_Scalar_Vector_DurationS_DurationD_Dur auto lhs = cudf::scalar_type_t(TypeLhs{-9}); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::SUB, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SUB()); } @@ -216,8 +216,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MUL()); } @@ -232,8 +232,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_SI64_FP32_FP32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MUL()); } @@ -249,8 +249,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Scalar_Vector_SI32_DurationD_Duration auto lhs = cudf::scalar_type_t(2); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MUL()); } @@ -266,8 +266,8 @@ TEST_F(BinaryOperationIntegrationTest, Mul_Vector_Vector_DurationS_SI32_Duration auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MUL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MUL()); } @@ -282,8 +282,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, DIV()); } @@ -298,8 +298,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_SI64_FP32_FP32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, DIV()); } @@ -315,8 +315,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Scalar_Vector_DurationD_SI32_Duration // Divide 2 days by an integer and convert the ticks to seconds auto lhs = cudf::scalar_type_t(TypeLhs{2}); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::DIV, 
data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, DIV()); } @@ -331,8 +331,8 @@ TEST_F(BinaryOperationIntegrationTest, Div_Vector_Vector_DurationD_DurationS_Dur auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::DIV, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, DIV()); } @@ -347,7 +347,7 @@ TEST_F(BinaryOperationIntegrationTest, TrueDiv_Vector_Vector_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::TRUE_DIV, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, TRUEDIV()); @@ -363,7 +363,7 @@ TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, FLOORDIV()); @@ -379,7 +379,7 @@ TEST_F(BinaryOperationIntegrationTest, FloorDiv_Vector_Vector_SI64_FP32_FP32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::FLOOR_DIV, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, FLOORDIV()); @@ -395,8 +395,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MOD()); } @@ -411,8 +411,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MOD()); } @@ -427,8 +427,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_SI64_FP32_FP32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MOD()); } @@ -443,8 +443,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Vector_FP64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MOD()); } @@ -460,8 +460,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationD_SI32_Duration // Half the number of days and convert the remainder ticks to microseconds auto lhs = make_random_wrapped_column(100); auto rhs = 
cudf::scalar_type_t(2); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MOD()); } @@ -476,8 +476,8 @@ TEST_F(BinaryOperationIntegrationTest, Mod_Vector_Scalar_DurationS_DurationMS_Du auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::MOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, MOD()); } @@ -492,8 +492,8 @@ TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP64_SI64_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); /** * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision @@ -513,8 +513,8 @@ TEST_F(BinaryOperationIntegrationTest, Pow_Vector_Vector_FP32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::POW, data_type(type_to_id())); /** * According to CUDA Programming Guide, 'E.1. Standard Functions', 'Table 7 - Double-Precision * Mathematical Standard Library Functions with Maximum ULP Error' @@ -533,7 +533,7 @@ TEST_F(BinaryOperationIntegrationTest, And_Vector_Vector_SI16_SI64_SI32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::BITWISE_AND, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, AND()); @@ -549,7 +549,7 @@ TEST_F(BinaryOperationIntegrationTest, Or_Vector_Vector_SI64_SI16_SI32) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::BITWISE_OR, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, OR()); @@ -565,7 +565,7 @@ TEST_F(BinaryOperationIntegrationTest, Xor_Vector_Vector_SI32_SI16_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::BITWISE_XOR, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, XOR()); @@ -581,7 +581,7 @@ TEST_F(BinaryOperationIntegrationTest, Logical_And_Vector_Vector_SI16_FP64_SI8) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::LOGICAL_AND, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, AND()); @@ -597,7 +597,7 @@ TEST_F(BinaryOperationIntegrationTest, Logical_Or_Vector_Vector_B8_SI16_SI64) auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::LOGICAL_OR, 
data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, OR()); @@ -613,8 +613,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_TSS_TSS) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(10); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LESS()); } @@ -629,7 +629,7 @@ TEST_F(BinaryOperationIntegrationTest, Greater_Scalar_Vector_B8_TSMS_TSS) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, GREATER()); @@ -645,8 +645,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_TSS_TSS) auto lhs = make_random_wrapped_column(10); auto rhs = make_random_wrapped_column(10); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LESS()); } @@ -667,7 +667,7 @@ TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_TSMS_TSS) itr, itr + 100, make_validity_iter()); auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, GREATER()); @@ -683,8 +683,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Scalar_Vector_B8_STR_STR) auto lhs = cudf::string_scalar("eee"); auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LESS()); } @@ -699,8 +699,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Vector_Scalar_B8_STR_STR) auto lhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); auto rhs = cudf::string_scalar("eee"); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LESS()); } @@ -715,8 +715,8 @@ TEST_F(BinaryOperationIntegrationTest, Less_Vector_Vector_B8_STR_STR) auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::LESS, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LESS()); } @@ -731,7 +731,7 @@ TEST_F(BinaryOperationIntegrationTest, Greater_Vector_Vector_B8_STR_STR) auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::GREATER, data_type(type_to_id())); ASSERT_BINOP(*out, 
lhs, rhs, GREATER()); @@ -747,7 +747,7 @@ TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Vector_B8_STR_STR) auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, EQUAL()); @@ -763,7 +763,7 @@ TEST_F(BinaryOperationIntegrationTest, Equal_Vector_Scalar_B8_STR_STR) auto rhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); auto lhs = cudf::string_scalar(""); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::EQUAL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, EQUAL()); @@ -779,7 +779,7 @@ TEST_F(BinaryOperationIntegrationTest, LessEqual_Vector_Vector_B8_STR_STR) auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::LESS_EQUAL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LESS_EQUAL()); @@ -795,7 +795,7 @@ TEST_F(BinaryOperationIntegrationTest, GreaterEqual_Vector_Vector_B8_STR_STR) auto lhs = cudf::test::strings_column_wrapper({"eee", "bb", "", "", "aa", "bbb", "ééé"}); auto rhs = cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "", "bb", "eee"}); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::GREATER_EQUAL, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, GREATER_EQUAL()); @@ -812,7 +812,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); @@ -829,7 +829,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Vector_SI32_SI16_SI64) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); @@ -846,7 +846,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Scalar_Vector_SI32) auto lhs = make_random_wrapped_scalar(); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); @@ -863,7 +863,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftLeft_Vector_Scalar_SI32) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_scalar(); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, 
cudf::binary_operator::SHIFT_LEFT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_LEFT()); @@ -880,7 +880,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); @@ -897,7 +897,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Vector_SI32_SI16_SI64) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); @@ -914,7 +914,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Scalar_Vector_SI32) auto lhs = make_random_wrapped_scalar(); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); @@ -931,7 +931,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRight_Vector_Scalar_SI32) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_scalar(); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT()); @@ -954,7 +954,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32) TypeOut expected[] = {2147483644, 39, 536870900, 0, 32768}; cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs_w, shift_w, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*out, expected_w); @@ -972,7 +972,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Vector_SI32_SI1 auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); @@ -990,7 +990,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI32) auto lhs = make_random_wrapped_scalar(); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); @@ -1008,7 +1008,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Vector_Scalar_SI32) auto lhs = make_random_wrapped_column(100); // this generates values in the range 1-10 
which should be reasonable for the shift auto rhs = make_random_wrapped_scalar(); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); @@ -1027,7 +1027,7 @@ TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Scalar_SI32_SI32_float) fixed_width_column_wrapper lhs(elements, elements + 100); // Find log to the base 10 auto rhs = numeric_scalar(10); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); @@ -1046,7 +1046,7 @@ TEST_F(BinaryOperationIntegrationTest, LogBase_Scalar_Vector_float_SI32) fixed_width_column_wrapper rhs(elements, elements + 100); // Find log to the base 2 auto lhs = numeric_scalar(2); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); @@ -1068,7 +1068,7 @@ TEST_F(BinaryOperationIntegrationTest, LogBase_Vector_Vector_double_SI64_SI32) // Find log to the base 7 auto rhs_elements = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 7; }); fixed_width_column_wrapper rhs(rhs_elements, rhs_elements + 50); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::LOG_BASE, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, LOG_BASE()); @@ -1084,7 +1084,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_SI32_SI32 fixed_width_column_wrapper{{999, -37, 0, INT32_MAX}, {true, true, true, false}}; auto int_scalar = cudf::scalar_type_t(999); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1105,7 +1105,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_ScalarInvalid_B8_SI auto int_scalar = cudf::scalar_type_t(999); int_scalar.set_valid_async(false); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, int_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1137,7 +1137,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_tsD_tsD) {false, true, true, true, false, true, true, false}}; auto ts_scalar = cudf::scalar_type_t(typename TypeRhs::duration{44376}, true); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( ts_scalar, ts_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1158,7 +1158,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_st // Empty string cudf::string_scalar str_scalar(""); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1178,7 +1178,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st // Match a valid string cudf::string_scalar str_scalar(""); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1199,7 +1199,7 @@ 
TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_st // Matching a string that isn't present cudf::string_scalar str_scalar("foo"); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1221,7 +1221,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st cudf::string_scalar str_scalar("foo"); str_scalar.set_valid_async(false); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1243,7 +1243,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Scalar_B8_string_st // Matching a scalar that is valid cudf::string_scalar str_scalar("foo"); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1266,7 +1266,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st cudf::string_scalar str_scalar("foo"); str_scalar.set_valid_async(false); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1286,7 +1286,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Scalar_Vector_B8_string_st // Matching an invalid string cudf::string_scalar str_scalar("bb"); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_scalar, str_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1307,7 +1307,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_InvalidScalar_B8_st cudf::string_scalar str_scalar("bb"); str_scalar.set_valid_async(false); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_col, str_scalar, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1340,7 +1340,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_tsD_tsD_N 22270, // 2030-12-22 00:00:00 GMT }; - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1364,7 +1364,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st cudf::test::strings_column_wrapper({"foo", "valid", "", "", "invalid", "inv", "ééé"}, {true, true, true, true, false, false, true}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1385,7 +1385,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st auto rhs_col = cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1407,7 +1407,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, {false, false, 
false, false, false, false, false}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1427,7 +1427,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st auto rhs_col = cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1448,7 +1448,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, {false, false, false, false, false, false, false}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1470,7 +1470,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_Vector_B8_string_st cudf::test::strings_column_wrapper({"eee", "invalid", "", "", "aa", "invalid", "ééé"}, {false, false, false, false, false, false, false}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1491,7 +1491,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareEqual_Vector_VectorAllInvalid_B8 auto rhs_col = fixed_width_column_wrapper{{-47, 37, 12, 99, 4, -INT32_MAX}, {false, false, false, false, false, false}}; - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_EQUALS, data_type(type_to_id())); // Every row has a value @@ -1514,7 +1514,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_SI8) }; auto int_scalar = cudf::scalar_type_t(77); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, int_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); // Every row has a value @@ -1535,7 +1535,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_FP64_SI32_SI64 {false, true, false, true, false, true, false}}; auto int_scalar = cudf::scalar_type_t(INT32_MAX); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); // Every row has a value @@ -1559,7 +1559,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_SI64_SI32_FP32 auto float_scalar = cudf::scalar_type_t(-3.14f); float_scalar.set_valid_async(false); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, float_scalar, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); // Every row has a value @@ -1581,7 +1581,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_SI8_SI8_FP32) auto float_scalar = cudf::scalar_type_t(-3.14f); float_scalar.set_valid_async(false); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( float_scalar, int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); // Every row has a value @@ -1603,7 +1603,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_SI64_SI32_SI8) auto another_int_col = fixed_width_column_wrapper{ {9, 
-37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}}; - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, another_int_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); // Every row has a value @@ -1624,7 +1624,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI64_SI32_SI8) auto another_int_col = fixed_width_column_wrapper{ {9, -37, 0, 32, -47, -4, 55}, {false, false, false, false, false, false, false}}; - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); // Every row has a value @@ -1656,7 +1656,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Vector_tsD_tsD_tsD) }, {false, true, true, true, false}}; - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_MIN, data_type(type_to_id())); // Every row has a value @@ -1678,7 +1678,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_SI32_SI64_SI8) auto another_int_col = fixed_width_column_wrapper{ {9, -37, 0, 32, -47, -4, 55}, {true, false, true, false, true, false, true}}; - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( int_col, another_int_col, cudf::binary_operator::NULL_MAX, data_type(type_to_id())); // Every row has a value @@ -1698,7 +1698,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Vector_Vector_string_string_ {"eee", "goo", "", "", "", "", "ééé", "bar", "foo", "def", "abc"}, {false, true, true, true, false, true, true, false, false, true, true}); - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, rhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING}); auto exp_col = cudf::test::strings_column_wrapper( @@ -1717,7 +1717,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMin_Vector_Scalar_string_string_ // Returns a non-nullable column as all elements are valid - it will have the scalar // value at the very least - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( lhs_col, str_scalar, cudf::binary_operator::NULL_MIN, data_type{type_id::STRING}); auto exp_col = cudf::test::strings_column_wrapper( @@ -1735,7 +1735,7 @@ TEST_F(BinaryOperationIntegrationTest, NullAwareMax_Scalar_Vector_string_string_ str_scalar.set_valid_async(false); // Returns the lhs_col - auto op_col = cudf::binary_operation( + auto op_col = cudf::jit::binary_operation( str_scalar, lhs_col, cudf::binary_operator::NULL_MAX, data_type{type_id::STRING}); auto exp_col = cudf::test::strings_column_wrapper( @@ -1757,8 +1757,8 @@ TEST_F(BinaryOperationIntegrationTest, CastAdd_Vector_Vector_SI32_float_float) auto rhs = cudf::test::fixed_width_column_wrapper{1.3f, 1.6f}; auto expected = cudf::test::fixed_width_column_wrapper{2, 3}; - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -1773,8 +1773,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Vector_TimepointD_DurationS_Ti auto lhs = make_random_wrapped_column(100); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, 
cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -1789,8 +1789,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Vector_Scalar_DurationD_TimepointS_Ti auto lhs = make_random_wrapped_column(100); auto rhs = cudf::scalar_type_t(typename TypeRhs::duration{34}, true); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -1805,8 +1805,8 @@ TEST_F(BinaryOperationIntegrationTest, Add_Scalar_Vector_DurationS_DurationD_Dur auto lhs = cudf::scalar_type_t(TypeLhs{-9}); auto rhs = make_random_wrapped_column(100); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -1823,7 +1823,7 @@ TEST_F(BinaryOperationIntegrationTest, ShiftRightUnsigned_Scalar_Vector_SI64_SI6 auto lhs = cudf::scalar_type_t(-12); // this generates values in the range 1-10 which should be reasonable for the shift auto rhs = make_random_wrapped_column(100); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::SHIFT_RIGHT_UNSIGNED, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, SHIFT_RIGHT_UNSIGNED()); @@ -1838,8 +1838,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Scalar_Vector_FP32) auto lhs = cudf::scalar_type_t(-86099.68377); auto rhs = fixed_width_column_wrapper{{90770.74881, -15456.4335, 32213.22119}}; - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); auto expected_result = fixed_width_column_wrapper{{4671.0625, -8817.51953125, 10539.974609375}}; @@ -1855,8 +1855,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Scalar_FP64) auto lhs = fixed_width_column_wrapper{{90770.74881, -15456.4335, 32213.22119}}; auto rhs = cudf::scalar_type_t(-86099.68377); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); auto expected_result = fixed_width_column_wrapper{ {4671.0650400000013178, -15456.433499999999185, 32213.221190000000206}}; @@ -1880,8 +1880,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_FP32_FP64) 2.1336193413893147E307, -2.1336193413893147E307}}; - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); auto expected_result = fixed_width_column_wrapper{{24854.55859375, 2664.7075000000040745, @@ -1905,8 +1905,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_FP64_SI32_SI64) auto lhs = make_random_wrapped_column(1000); auto rhs = make_random_wrapped_column(1000); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, PMOD()); } @@ -1922,8 +1922,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_SI32_SI64) auto lhs = 
make_random_wrapped_column(1000); auto rhs = make_random_wrapped_column(1000); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, PMOD()); } @@ -1939,8 +1939,8 @@ TEST_F(BinaryOperationIntegrationTest, PMod_Vector_Vector_SI64_FP64_FP64) auto lhs = make_random_wrapped_column(1000); auto rhs = make_random_wrapped_column(1000); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::PMOD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, PMOD()); } @@ -1956,7 +1956,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Scalar_Vector_FP32) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(10000); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); // atan2 has a max ULP error of 2 per CUDA programming guide @@ -1974,7 +1974,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Scalar_FP64) auto lhs = make_random_wrapped_column(10000); auto rhs = make_random_wrapped_scalar(); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); // atan2 has a max ULP error of 2 per CUDA programming guide @@ -1992,7 +1992,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_FP32_FP64) auto lhs = make_random_wrapped_column(10000); auto rhs = make_random_wrapped_column(10000); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); // atan2 has a max ULP error of 2 per CUDA programming guide @@ -2010,7 +2010,7 @@ TEST_F(BinaryOperationIntegrationTest, ATan2_Vector_Vector_FP64_SI32_SI64) auto lhs = make_random_wrapped_column(10000); auto rhs = make_random_wrapped_column(10000); - auto out = cudf::binary_operation( + auto out = cudf::jit::binary_operation( lhs, rhs, cudf::binary_operator::ATAN2, data_type(type_to_id())); // atan2 has a max ULP error of 2 per CUDA programming guide @@ -2053,7 +2053,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); } @@ -2086,7 +2086,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); } @@ -2108,7 +2108,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiply2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + auto const 
result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2127,7 +2127,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2146,7 +2146,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2163,7 +2163,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv3) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2183,7 +2183,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpDiv4) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2202,7 +2202,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd2) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2221,7 +2221,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd3) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2238,7 +2238,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd4) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::ADD, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2255,7 +2255,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd5) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::ADD, lhs->type(), static_cast(rhs).type()); - auto 
const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2272,8 +2272,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd6) auto const expected2 = fp_wrapper{{0, 0, 1, 1, 1, 1}, scale_type{1}}; auto const type1 = cudf::data_type{cudf::type_to_id(), 0}; auto const type2 = cudf::data_type{cudf::type_to_id(), 1}; - auto const result1 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type1); - auto const result2 = cudf::binary_operation(col, col, cudf::binary_operator::ADD, type2); + auto const result1 = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type1); + auto const result2 = cudf::jit::binary_operation(col, col, cudf::binary_operator::ADD, type2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); @@ -2305,7 +2305,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpMultiplyScalar) auto const type = cudf::binary_operation_fixed_point_output_type( cudf::binary_operator::MUL, static_cast(lhs).type(), rhs->type()); - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2324,7 +2324,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpSimplePlus) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(lhs).type(), static_cast(rhs).type()); - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2340,8 +2340,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimple) auto const col2 = fp_wrapper{{100, 200, 300, 400}, scale_type{-2}}; auto const expected = wrapper(trues.begin(), trues.end()); - auto const result = - cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = cudf::jit::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2357,7 +2357,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0) auto const expected = wrapper(trues.begin(), trues.end()); auto const result = - cudf::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + cudf::jit::binary_operation(col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2372,8 +2372,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null) auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - auto const result = - cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = cudf::jit::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2388,8 +2388,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null) auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 
0, 0}, scale_type{0}}; auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; - auto const result = - cudf::binary_operation(col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + auto const result = cudf::jit::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2415,7 +2415,8 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, static_cast(iota_3).type(), static_cast(zeros_3).type()); - auto const iota_3_after_add = cudf::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); + auto const iota_3_after_add = + cudf::jit::binary_operation(zeros_3, iota_3, binary_operator::ADD, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view()); @@ -2426,15 +2427,15 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpEqualLessGreater) auto const btype = cudf::data_type{type_id::BOOL8}; auto const equal_result = - cudf::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); + cudf::jit::binary_operation(iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view()); auto const less_result = - cudf::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); + cudf::jit::binary_operation(zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view()); auto const greater_result = - cudf::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); + cudf::jit::binary_operation(iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype); CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view()); } @@ -2453,7 +2454,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMaxSimple) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX, static_cast(col1).type(), static_cast(col2).type()); - auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MAX, type); + auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MAX, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2473,7 +2474,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullMinSimple) cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN, static_cast(col1).type(), static_cast(col2).type()); - auto const result = cudf::binary_operation(col1, col2, binary_operator::NULL_MIN, type); + auto const result = cudf::jit::binary_operation(col1, col2, binary_operator::NULL_MIN, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2489,7 +2490,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpNullEqualsSimple) auto const col2 = fp_wrapper{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}}; auto const expected = wrapper{{1, 0, 0, 1}, {1, 1, 1, 1}}; - auto const result = cudf::binary_operation( + auto const result = cudf::jit::binary_operation( col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); @@ -2506,7 +2507,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div) auto const expected = fp_wrapper{{25, 75, 125, 175}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, 
type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2522,7 +2523,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div2) auto const expected = fp_wrapper{{5000, 15000, 25000, 35000}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2538,7 +2539,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div3) auto const expected = fp_wrapper{{3333, 3333, 16666, 23333}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2554,7 +2555,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div4) auto const expected = fp_wrapper{{3, 10, 16, 23}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2571,7 +2572,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div6) auto const expected = fp_wrapper{{300, 100, 60, 42}, scale_type{-2}}; auto const type = data_type{type_to_id(), -2}; - auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2588,7 +2589,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div7) auto const expected = fp_wrapper{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}}; auto const type = data_type{type_to_id(), 2}; - auto const result = cudf::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2604,7 +2605,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div8) auto const expected = fp_wrapper{{0, 1, 16}, scale_type{2}}; auto const type = data_type{type_to_id(), 2}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2620,7 +2621,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div9) auto const expected = fp_wrapper{{1, 2, 4}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2636,7 +2637,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div10) auto const expected = fp_wrapper{{14, 28, 42}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, 
type); + auto const result = cudf::jit::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2652,7 +2653,7 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOp_Div11) auto const expected = fp_wrapper{{142, 285, 428}, scale_type{1}}; auto const type = data_type{type_to_id(), 1}; - auto const result = cudf::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + auto const result = cudf::jit::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -2666,9 +2667,9 @@ TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpThrows) auto const col = fp_wrapper{{100, 300, 500, 700}, scale_type{-2}}; auto const non_bool_type = data_type{type_to_id(), -2}; auto const float_type = data_type{type_id::FLOAT32}; - EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type), + EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type), cudf::logic_error); - EXPECT_THROW(cudf::binary_operation(col, col, cudf::binary_operator::MUL, float_type), + EXPECT_THROW(cudf::jit::binary_operation(col, col, cudf::binary_operator::MUL, float_type), cudf::logic_error); } diff --git a/cpp/tests/binaryop/binop-null-test.cpp b/cpp/tests/binaryop/binop-null-test.cpp index c91bc12d95f..25ec3b30834 100644 --- a/cpp/tests/binaryop/binop-null-test.cpp +++ b/cpp/tests/binaryop/binop-null-test.cpp @@ -66,8 +66,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_Valid) lhs.set_valid_async(false); auto rhs = make_random_wrapped_column(100, mask_state::ALL_VALID); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -83,8 +83,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Valid_Vector_NonNullable) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -101,8 +101,8 @@ TEST_F(BinaryOperationNullTest, Scalar_Null_Vector_NonNullable) lhs.set_valid_async(false); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -118,8 +118,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Scalar_Valid) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(100, mask_state::ALL_NULL); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -135,8 +135,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_Valid) auto lhs = make_random_wrapped_column(100, mask_state::ALL_NULL); auto rhs = make_random_wrapped_column(100, mask_state::ALL_VALID); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = 
cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -152,8 +152,8 @@ TEST_F(BinaryOperationNullTest, Vector_Null_Vector_NonNullable) auto lhs = make_random_wrapped_column(100, mask_state::ALL_NULL); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -169,8 +169,8 @@ TEST_F(BinaryOperationNullTest, Vector_Valid_Vector_NonNullable) auto lhs = make_random_wrapped_column(100, mask_state::ALL_VALID); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } @@ -186,8 +186,8 @@ TEST_F(BinaryOperationNullTest, Vector_NonNullable_Vector_NonNullable) auto lhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); auto rhs = make_random_wrapped_column(100, mask_state::UNALLOCATED); - auto out = - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); + auto out = cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())); ASSERT_BINOP(*out, lhs, rhs, ADD()); } diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 167fbc22bde..779dc7c4c1f 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -35,9 +35,9 @@ TEST_F(BinopVerifyInputTest, Vector_Scalar_ErrorOutputVectorType) auto lhs = make_random_wrapped_scalar(); auto rhs = make_random_wrapped_column(10); - EXPECT_THROW( - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)), - cudf::logic_error); + EXPECT_THROW(cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_id::NUM_TYPE_IDS)), + cudf::logic_error); } TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) @@ -49,9 +49,9 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) auto lhs = make_random_wrapped_column(1); auto rhs = make_random_wrapped_column(10); - EXPECT_THROW( - cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())), - cudf::logic_error); + EXPECT_THROW(cudf::jit::binary_operation( + lhs, rhs, cudf::binary_operator::ADD, data_type(type_to_id())), + cudf::logic_error); } } // namespace binop diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index 47b2a95e7b5..ced809c243d 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -524,8 +524,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType32) auto const expected2 = fp_wrapper{{50000000}, scale_type{6}}; auto const type = cudf::data_type{cudf::type_id::DECIMAL32, 6}; - auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type); - auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type); + auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type); + auto const result2 = 
cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); @@ -547,8 +547,8 @@ TEST_F(FixedPointTest, PositiveScaleWithValuesOutsideUnderlyingType64) auto const expected2 = fp_wrapper{{50000000}, scale_type{100}}; auto const type = cudf::data_type{cudf::type_id::DECIMAL64, 100}; - auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type); - auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type); + auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type); + auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); @@ -570,10 +570,10 @@ TYPED_TEST(FixedPointTestBothReps, ExtremelyLargeNegativeScale) auto const expected2 = fp_wrapper{{5}, scale_type{-201}}; auto const type1 = cudf::data_type{cudf::type_to_id(), -202}; - auto const result1 = cudf::binary_operation(a, b, cudf::binary_operator::ADD, type1); + auto const result1 = cudf::jit::binary_operation(a, b, cudf::binary_operator::ADD, type1); auto const type2 = cudf::data_type{cudf::type_to_id(), -201}; - auto const result2 = cudf::binary_operation(a, c, cudf::binary_operator::DIV, type2); + auto const result2 = cudf::jit::binary_operation(a, c, cudf::binary_operator::DIV, type2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd index c3320b371cd..c36ab124bf8 100644 --- a/python/cudf/cudf/_lib/cpp/binaryop.pxd +++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd @@ -61,3 +61,27 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const string& op, data_type output_type ) except + + + unique_ptr[column] jit_binary_operation \ + "cudf::jit::binary_operation" ( + const column_view& lhs, + const column_view& rhs, + binary_operator op, + data_type output_type + ) except + + + unique_ptr[column] jit_binary_operation \ + "cudf::jit::binary_operation" ( + const column_view& lhs, + const scalar& rhs, + binary_operator op, + data_type output_type + ) except + + + unique_ptr[column] jit_binary_operation \ + "cudf::jit::binary_operation" ( + const scalar& lhs, + const column_view& rhs, + binary_operator op, + data_type output_type + ) except + diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 85a9f85ad22..bc12b42a3fa 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -164,7 +164,17 @@ def binary_operator( ): out_dtype = cudf.dtype("float64") - if binop in {"lt", "gt", "le", "ge", "eq", "ne", "NULL_EQUALS"}: + if binop in { + "l_and", + "l_or", + "lt", + "gt", + "le", + "ge", + "eq", + "ne", + "NULL_EQUALS", + }: out_dtype = "bool" lhs, rhs = (self, rhs) if not reflect else (rhs, self) return libcudf.binaryop.binaryop(lhs, rhs, binop, out_dtype) From 406e87bdb8b1976bc8d47794b79901fc35d4803d Mon Sep 17 00:00:00 2001 From: Marlene <57748216+marlenezw@users.noreply.github.com> Date: Mon, 23 Aug 2021 17:40:17 +0200 Subject: [PATCH 05/20] Allowing %f in format to return nanoseconds (#9081) This is a quick fix to close PR #7945 This PR checks to see if `%f` is passed as part of `format` into `cudf.to_datetime`. 
Previously, cudf would not return nanoseconds, while pandas does. Authors: - Marlene (https://github.com/marlenezw) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9081 --- python/cudf/cudf/core/tools/datetimes.py | 3 +++ python/cudf/cudf/tests/test_datetime.py | 1 + 2 files changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 946cdcb1ebc..4856995b391 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -123,6 +123,9 @@ def to_datetime( if yearfirst: raise NotImplementedError("yearfirst support is not yet implemented") + if format is not None and "%f" in format: + format = format.replace("%f", "%9f") + try: if isinstance(arg, cudf.DataFrame): # we require at least Ymd diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 9f19bf8b960..65e87e88f55 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -717,6 +717,7 @@ def test_to_datetime_units(data, unit): (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%m/%d/%Y"), (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%d/%m/%Y"), (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], None), + (["2021-04-13 12:30:04.123456789"], "%Y-%m-%d %H:%M:%S.%f"), (pd.Series([2015, 2020, 2021]), "%Y"), pytest.param( pd.Series(["1", "2", "1"]), From 8aefeb49bec96e3bc27a05276e66471f8ca7f966 Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 23 Aug 2021 11:21:07 -0700 Subject: [PATCH 06/20] Fix branch_stack calculation in `row_bit_count()` (#9076) Fixes #8938. For input with a number of rows exceeding `max_block_size`, `row_bit_count()` currently reaches past the bounds of its shared-memory allocation, causing illegal memory access errors like in [cudf/issues/8938](https://github.com/rapidsai/cudf/issues/8938). This commit corrects the calculation of the branch stack's base address, and adds a test for this case. Authors: - MithunR (https://github.com/mythrocks) Approvers: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9076 --- cpp/src/transform/row_bit_count.cu | 2 +- cpp/tests/transform/row_bit_count_test.cu | 65 +++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 620504f5c93..27936ce04b3 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -408,7 +408,7 @@ __global__ void compute_row_sizes(device_span cols, if (tid >= num_rows) { return; } // branch stack. points to the last list prior to branching. - row_span* my_branch_stack = thread_branch_stacks + (tid * max_branch_depth); + row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth); size_type branch_depth{0}; // current row span - always starts at 1 row. 
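To make the fix above concrete: `thread_branch_stacks` points into a shared-memory pool sized for a single thread block, so each thread's slice must be addressed with its block-local id rather than its global thread id. A minimal standalone C++ sketch of the arithmetic (illustrative sizes only; this is not cudf code and not part of the patch):

#include <cstddef>
#include <iostream>

int main()
{
  // Illustrative sizes: the shared-memory pool reserves max_branch_depth
  // row_span slots for every thread of ONE block.
  constexpr std::size_t max_block_size   = 256;
  constexpr std::size_t max_branch_depth = 4;
  constexpr std::size_t pool_slots       = max_block_size * max_branch_depth;  // 1024

  // A thread in the second block: global id 300, block-local id 44.
  std::size_t const tid         = 300;
  std::size_t const threadIdx_x = tid % max_block_size;

  std::size_t const old_base = tid * max_branch_depth;          // 1200 -> past the pool
  std::size_t const new_base = threadIdx_x * max_branch_depth;  // 176  -> inside the pool

  std::cout << "old base out of bounds: " << (old_base >= pool_slots)
            << ", new base in bounds: " << (new_base < pool_slots) << '\n';
  return 0;
}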
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 0081cf0d467..8284def5f13 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -25,6 +27,9 @@ #include +#include +#include + using namespace cudf; template @@ -192,6 +197,66 @@ TEST_F(RowBitCount, StringsWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); } +namespace { + +/** + * @brief __device__ functor to multiply input by 2, defined out of line because __device__ lambdas + * cannot be defined in a TEST_F(). + */ +struct times_2 { + int32_t __device__ operator()(int32_t i) const { return i * 2; } +}; + +} // namespace + +TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) +{ + // Tests that `row_bit_count()` can handle struct> with more + // than max_block_size (256) rows. + // With a large number of rows, computation spills to multiple thread-blocks, + // thus exercising the branch-stack computation. + // The contents of the input column aren't as pertinent to this test as the + // column size. For what it's worth, it looks as follows: + // [ struct({0,1}), struct({2,3}), struct({4,5}), ... ] + + using namespace cudf; + auto constexpr num_rows = 1024 * 2; // Exceeding a block size. + + // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; + auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); + auto ints_view = ints->mutable_view(); + thrust::tabulate(thrust::device, + ints_view.begin(), + ints_view.end(), + thrust::identity()); + + // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; + auto list_offsets = make_numeric_column(data_type{type_id::INT32}, num_rows + 1); + auto list_offsets_view = list_offsets->mutable_view(); + thrust::tabulate(thrust::device, + list_offsets_view.begin(), + list_offsets_view.end(), + times_2{}); + + // List = {{0,1}, {2,3}, {4,5}, ..., {2*(num_rows-1), 2*num_rows-1}}; + auto lists_column = make_lists_column(num_rows, std::move(list_offsets), std::move(ints), 0, {}); + + // Struct. + auto struct_members = std::vector>{}; + struct_members.emplace_back(std::move(lists_column)); + auto structs_column = make_structs_column(num_rows, std::move(struct_members), 0, {}); + + // Compute row_bit_count, and compare. + auto row_bit_counts = row_bit_count(table_view{{structs_column->view()}}); + auto expected_row_bit_counts = make_numeric_column(data_type{type_id::INT32}, num_rows); + thrust::fill_n(thrust::device, + expected_row_bit_counts->mutable_view().begin(), + num_rows, + CHAR_BIT * (2 * sizeof(int32_t) + sizeof(offset_type))); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(row_bit_counts->view(), expected_row_bit_counts->view()); +} + std::pair, std::unique_ptr> build_struct_column() { std::vector struct_validity{0, 1, 1, 1, 1, 0}; From d7a05dc88950039408152c0f8a75fc4c83a9f95c Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Tue, 24 Aug 2021 09:43:52 +0800 Subject: [PATCH 07/20] Support nested types for nth_element reduction (#9043) Closes #8967 This PR supports constructing a default scalar for nested types (LIST_TYPE and STRUCT_TYPE) in reductions, in order to support nested types for the nth_element reduction.
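For example (a minimal sketch mirroring the new tests added further down in this patch; it assumes the cudf test column wrappers and is not itself part of the change), an nth_element reduction over a LIST column now yields a valid list_scalar holding the requested row:

#include <cudf/aggregation.hpp>
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf_test/column_wrapper.hpp>

void nth_element_list_reduction_sketch()
{
  // Three list rows; request row 2, i.e. {0, 5, -3}.
  cudf::test::lists_column_wrapper<int32_t> col{{-3}, {2, 1}, {0, 5, -3}};

  auto const agg    = cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE);
  auto const result = cudf::reduce(col, agg, cudf::data_type(cudf::type_id::LIST));

  // With this change the result is a valid cudf::list_scalar whose view()
  // compares equal to {0, 5, -3}; for an all-null or empty input an invalid
  // scalar of the matching nested type is returned instead.
  auto const* list_result = dynamic_cast<cudf::list_scalar const*>(result.get());
  (void)list_result;
}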
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9043 --- cpp/include/cudf/scalar/scalar_factories.hpp | 14 + cpp/src/reductions/reductions.cpp | 19 +- cpp/src/scalar/scalar_factories.cpp | 21 ++ cpp/tests/groupby/nth_element_tests.cpp | 107 +++++++- cpp/tests/reductions/reduction_tests.cpp | 264 +++++++++++++++++++ 5 files changed, 416 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index b96a8c65a04..b949f8d542f 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -121,6 +121,20 @@ std::unique_ptr make_default_constructed_scalar( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Creates an empty (invalid) scalar of the same type as the `input` column_view. + * + * @throw cudf::logic_error if the `input` column is struct type and empty + * + * @param input Immutable view of input column to emulate + * @param stream CUDA stream used for device memory operations. + * @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool. + */ +std::unique_ptr make_empty_scalar_like( + column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct scalar using the given value of fixed width type * diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index a8117373ca4..699494c49c5 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf { @@ -112,15 +113,17 @@ std::unique_ptr reduce( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - std::unique_ptr result = make_default_constructed_scalar(output_dtype, stream, mr); - result->set_valid_async(false, stream); - - // check if input column is empty - if (col.size() <= col.null_count()) return result; + // Returns default scalar if input column is non-valid. In terms of nested columns, we need to + // handcraft the default scalar with input column. 
+ if (col.size() <= col.null_count()) { + if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) { + return make_default_constructed_scalar(output_dtype, stream, mr); + } + return make_empty_scalar_like(col, stream, mr); + } - result = - aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg); - return result; + return aggregation_dispatcher( + agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg); } } // namespace detail diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index af78d84d874..25418cf0f7e 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -20,6 +20,7 @@ #include #include +#include #include namespace cudf { @@ -165,4 +166,24 @@ std::unique_ptr make_default_constructed_scalar(data_type type, return type_dispatcher(type, default_scalar_functor{}, stream, mr); } +std::unique_ptr make_empty_scalar_like(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::unique_ptr result; + switch (column.type().id()) { + case type_id::LIST: + result = make_list_scalar(empty_like(column)->view(), stream, mr); + result->set_valid_async(false, stream); + break; + case type_id::STRUCT: + // The input column must have at least 1 row to extract a scalar (row) from it. + result = detail::get_element(column, 0, stream, mr); + result->set_valid_async(false, stream); + break; + default: result = make_default_constructed_scalar(column.type(), stream, mr); + } + return result; +} + } // namespace cudf diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index 22f1e14815f..47dfa2426eb 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -414,5 +414,110 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) cudf::make_nth_element_aggregation(2)); } +struct groupby_nth_element_structs_test : BaseFixture { +}; + +TEST_F(groupby_nth_element_structs_test, Basics) +{ + using structs = cudf::test::structs_column_wrapper; + using ints = cudf::test::fixed_width_column_wrapper; + using doubles = cudf::test::fixed_width_column_wrapper; + using strings = cudf::test::strings_column_wrapper; + + auto keys = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999}; + auto child2 = strings{"", "a", "b", "c", "d", "e", "f", "g", "HH", "JJJ"}; + auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}}; + + auto expected_keys = ints{0, 1, 2, 3}; + auto expected_ch0 = ints{1, 4, 7, 0}; + auto expected_ch1 = doubles{1.2, 4.51, -0.07, 0.0}; + auto expected_ch2 = strings{"a", "d", "g", ""}; + auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}}; + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(1)); + + expected_keys = ints{0, 1, 2, 3}; + expected_ch0 = ints{0, 4, 6, 9}; + expected_ch1 = doubles{0.1, 4.51, 6.3231, 9.999}; + expected_ch2 = strings{"", "d", "f", "JJJ"}; + expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}}; + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(0, null_policy::EXCLUDE)); +} + +TEST_F(groupby_nth_element_structs_test, NestedStructs) +{ + using structs = cudf::test::structs_column_wrapper; + using ints = cudf::test::fixed_width_column_wrapper; + using doubles = cudf::test::fixed_width_column_wrapper; + using lists = cudf::test::lists_column_wrapper; + + auto keys = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto child0_of_child1 = ints{0, -1, -2, -3, -4, -5, -6, -7, -8, -9}; + auto child1_of_child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999}; + auto child1 = structs{child0_of_child1, child1_of_child1}; + auto child2 = lists{{0}, {1, 2, 3}, {}, {4}, {5, 6}, {}, {}, {7}, {8, 9}, {}}; + auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}}; + + auto expected_keys = ints{0, 1, 2, 3}; + auto expected_ch0 = ints{1, 4, 7, 0}; + auto expected_ch0_of_ch1 = ints{-1, -4, -7, 0}; + auto expected_ch1_of_ch1 = doubles{1.2, 4.51, -0.07, 0.0}; + auto expected_ch1 = structs{expected_ch0_of_ch1, expected_ch1_of_ch1}; + auto expected_ch2 = lists{{1, 2, 3}, {5, 6}, {7}, {}}; + auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}}; + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(1)); + + expected_keys = ints{0, 1, 2, 3}; + expected_ch0 = ints{0, 4, 6, 9}; + expected_ch0_of_ch1 = ints{0, -4, -6, -9}; + expected_ch1_of_ch1 = doubles{0.1, 4.51, 6.3231, 9.999}; + expected_ch1 = structs{expected_ch0_of_ch1, expected_ch1_of_ch1}; + expected_ch2 = lists{{0}, {5, 6}, {}, {}}; + expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}}; + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(0, null_policy::EXCLUDE)); +} + +TEST_F(groupby_nth_element_structs_test, EmptyInput) +{ + using structs = cudf::test::structs_column_wrapper; + using ints = 
cudf::test::fixed_width_column_wrapper; + using doubles = cudf::test::fixed_width_column_wrapper; + using strings = cudf::test::strings_column_wrapper; + + auto keys = ints{}; + auto child0 = ints{}; + auto child1 = doubles{}; + auto child2 = strings{}; + auto values = structs{{child0, child1, child2}}; + + auto expected_keys = ints{}; + auto expected_ch0 = ints{}; + auto expected_ch1 = doubles{}; + auto expected_ch2 = strings{}; + auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}}; + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(0)); +} } // namespace test } // namespace cudf diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index da9032737f2..88318a41882 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -24,8 +24,10 @@ #include #include #include +#include #include #include +#include #include @@ -1872,4 +1874,266 @@ TYPED_TEST(DictionaryReductionTest, Quantile) output_type); } +struct ListReductionTest : public cudf::test::BaseFixture { + void reduction_test(cudf::column_view const& input_data, + cudf::column_view const& expected_value, + bool succeeded_condition, + bool is_valid, + std::unique_ptr const& agg) + { + auto statement = [&]() { + std::unique_ptr result = + cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST)); + auto list_result = dynamic_cast(result.get()); + EXPECT_EQ(is_valid, list_result->is_valid()); + if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); } + }; + + if (succeeded_condition) { + CUDF_EXPECT_NO_THROW(statement()); + } else { + EXPECT_ANY_THROW(statement()); + } + } +}; + +TEST_F(ListReductionTest, ListReductionNthElement) +{ + using LCW = cudf::test::lists_column_wrapper; + using ElementCol = cudf::test::fixed_width_column_wrapper; + + // test without nulls + LCW col{{-3}, {2, 1}, {0, 5, -3}, {-2}, {}, {28}}; + this->reduction_test(col, + ElementCol{0, 5, -3}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE)); + + // test with null-exclude + std::vector validity{1, 0, 0, 1, 1, 0}; + LCW col_nulls({{-3}, {2, 1}, {0, 5, -3}, {-2}, {}, {28}}, validity.begin()); + this->reduction_test(col_nulls, + ElementCol{-2}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE)); + + // test with null-include + this->reduction_test(col_nulls, + ElementCol{}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE)); +} + +TEST_F(ListReductionTest, NestedListReductionNthElement) +{ + using LCW = cudf::test::lists_column_wrapper; + + // test without nulls + auto validity = std::vector{1, 0, 0, 1, 1}; + auto nested_list = LCW( + {{LCW{}, LCW{2, 3, 4}}, {}, {LCW{5}, LCW{6}, LCW{7, 8}}, {LCW{9, 10}}, {LCW{11}, LCW{12, 13}}}, + validity.begin()); + this->reduction_test(nested_list, + LCW{{}, {2, 3, 4}}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); + + // test with null-include + this->reduction_test(nested_list, + LCW{}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE)); + + // test with null-exclude + this->reduction_test(nested_list, + LCW{{11}, {12, 13}}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE)); +} + +TEST_F(ListReductionTest, 
NonValidListReductionNthElement) +{ + using LCW = cudf::test::lists_column_wrapper; + using ElementCol = cudf::test::fixed_width_column_wrapper; + + // test against col.size() <= col.null_count() + std::vector validity{0}; + this->reduction_test(LCW{{{1, 2}}, validity.begin()}, + ElementCol{}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); + + // test against empty input + this->reduction_test(LCW{}, + ElementCol{{0}, {0}}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); +} + +struct StructReductionTest : public cudf::test::BaseFixture { + using SCW = cudf::test::structs_column_wrapper; + + void reduction_test(SCW const& struct_column, + cudf::table_view const& expected_value, + bool succeeded_condition, + bool is_valid, + std::unique_ptr const& agg) + { + auto statement = [&]() { + std::unique_ptr result = + cudf::reduce(struct_column, agg, cudf::data_type(cudf::type_id::STRUCT)); + auto struct_result = dynamic_cast(result.get()); + EXPECT_EQ(is_valid, struct_result->is_valid()); + if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUAL(expected_value, struct_result->view()); } + }; + + if (succeeded_condition) { + CUDF_EXPECT_NO_THROW(statement()); + } else { + EXPECT_ANY_THROW(statement()); + } + } +}; + +TEST_F(StructReductionTest, StructReductionNthElement) +{ + using ICW = cudf::test::fixed_width_column_wrapper; + + // test without nulls + auto child0 = *ICW{-3, 2, 1, 0, 5, -3, -2, 28}.release(); + auto child1 = *ICW{0, 1, 2, 3, 4, 5, 6, 7}.release(); + auto child2 = + *ICW{{-10, 10, -100, 100, -1000, 1000, -10000, 10000}, {1, 0, 0, 1, 1, 1, 0, 1}}.release(); + std::vector> input_vector; + input_vector.push_back(std::make_unique(child0)); + input_vector.push_back(std::make_unique(child1)); + input_vector.push_back(std::make_unique(child2)); + auto struct_col = SCW(std::move(input_vector)); + auto result_col0 = ICW{1}; + auto result_col1 = ICW{2}; + auto result_col2 = ICW{{0}, {0}}; + this->reduction_test( + struct_col, + cudf::table_view{{result_col0, result_col1, result_col2}}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE)); + + // test with null-include + std::vector validity{1, 1, 1, 0, 1, 0, 0, 1}; + input_vector.clear(); + input_vector.push_back(std::make_unique(child0)); + input_vector.push_back(std::make_unique(child1)); + input_vector.push_back(std::make_unique(child2)); + struct_col = SCW(std::move(input_vector), validity); + result_col0 = ICW{{0}, {0}}; + result_col1 = ICW{{0}, {0}}; + result_col2 = ICW{{0}, {0}}; + this->reduction_test( + struct_col, + cudf::table_view{{result_col0, result_col1, result_col2}}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(6, cudf::null_policy::INCLUDE)); + + // test with null-exclude + result_col0 = ICW{{28}, {1}}; + result_col1 = ICW{{7}, {1}}; + result_col2 = ICW{{10000}, {1}}; + this->reduction_test( + struct_col, + cudf::table_view{{result_col0, result_col1, result_col2}}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(4, cudf::null_policy::EXCLUDE)); +} + +TEST_F(StructReductionTest, NestedStructReductionNthElement) +{ + using ICW = cudf::test::fixed_width_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + + auto int_col0 = ICW{-4, -3, -2, -1, 0}; + auto struct_col0 = SCW({int_col0}, std::vector{1, 0, 0, 1, 1}); + auto int_col1 = ICW{0, 1, 2, 3, 4}; + auto list_col = LCW{{0}, {}, {1, 2}, {3}, {4}}; + auto 
struct_col1 = SCW({struct_col0, int_col1, list_col}, std::vector{1, 1, 1, 0, 1}); + auto result_child0 = ICW{0}; + auto result_col0 = SCW({result_child0}, std::vector{0}); + auto result_col1 = ICW{{1}, {1}}; + auto result_col2 = LCW({LCW{}}, std::vector{1}.begin()); + // test without nulls + this->reduction_test( + struct_col1, + cudf::table_view{{result_col0, result_col1, result_col2}}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE)); + + // test with null-include + result_child0 = ICW{0}; + result_col0 = SCW({result_child0}, std::vector{0}); + result_col1 = ICW{{0}, {0}}; + result_col2 = LCW({LCW{3}}, std::vector{0}.begin()); + this->reduction_test( + struct_col1, + cudf::table_view{{result_col0, result_col1, result_col2}}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(3, cudf::null_policy::INCLUDE)); + + // test with null-exclude + result_child0 = ICW{0}; + result_col0 = SCW({result_child0}, std::vector{1}); + result_col1 = ICW{{4}, {1}}; + result_col2 = LCW({LCW{4}}, std::vector{1}.begin()); + this->reduction_test( + struct_col1, + cudf::table_view{{result_col0, result_col1, result_col2}}, // expected_value, + true, + true, + cudf::make_nth_element_aggregation(3, cudf::null_policy::EXCLUDE)); +} + +TEST_F(StructReductionTest, NonValidStructReductionNthElement) +{ + using ICW = cudf::test::fixed_width_column_wrapper; + + // test against col.size() <= col.null_count() + auto child0 = ICW{-3, 3}; + auto child1 = ICW{0, 0}; + auto child2 = ICW{{-10, 10}, {0, 1}}; + auto struct_col = SCW{{child0, child1, child2}, {0, 0}}; + auto ret_col0 = ICW{{0}, {0}}; + auto ret_col1 = ICW{{0}, {0}}; + auto ret_col2 = ICW{{0}, {0}}; + this->reduction_test(struct_col, + cudf::table_view{{ret_col0, ret_col1, ret_col2}}, // expected_value, + true, + false, + cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); + + // test against empty input (would fail because we can not create empty struct scalar) + child0 = ICW{}; + child1 = ICW{}; + child2 = ICW{}; + struct_col = SCW{{child0, child1, child2}}; + ret_col0 = ICW{}; + ret_col1 = ICW{}; + ret_col2 = ICW{}; + this->reduction_test(struct_col, + cudf::table_view{{ret_col0, ret_col1, ret_col2}}, // expected_value, + false, + false, + cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); +} + CUDF_TEST_PROGRAM_MAIN() From 5fce0841b88059b5df3d76431cf48da881859d08 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 24 Aug 2021 17:25:47 +0200 Subject: [PATCH 08/20] Update to UCX-Py 0.22 (#9099) Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9099 --- ci/gpu/build.sh | 2 +- ci/gpu/java.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 326fc2f1119..8e5b4d80115 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -80,7 +80,7 @@ gpuci_mamba_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.21.*" + "ucx-py=0.22.*" # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index 8c4b597d12d..b46817bb9ab 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -80,7 +80,7 @@ gpuci_conda_retry install -y \ 
"rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.21.*" \ + "ucx-py=0.22.*" \ "openjdk=8.*" \ "maven" From c271ce2379d20712e097670a49992a9175747907 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 24 Aug 2021 11:46:16 -0500 Subject: [PATCH 09/20] move filepath and mmap logic out of json/csv up to functions.cpp (#9040) Removes the filepath-related logic from readers, moving whole-file compression type inference up to `io/functions.cpp`. Also moves the lazy mmap datasource creation logic out csv/json reader and up to `io/functions.cpp`. Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) - Mike Wilson (https://github.com/hyperbolic2346) - Marlene (https://github.com/marlenezw) URL: https://github.com/rapidsai/cudf/pull/9040 --- cpp/include/cudf/io/csv.hpp | 36 ++++- cpp/include/cudf/io/detail/avro.hpp | 13 -- cpp/include/cudf/io/detail/orc.hpp | 13 -- cpp/include/cudf/io/detail/parquet.hpp | 13 -- cpp/include/cudf/io/json.hpp | 34 ++++- cpp/src/io/avro/reader_impl.cu | 10 -- cpp/src/io/comp/io_uncomp.h | 7 +- cpp/src/io/comp/uncomp.cpp | 21 +-- cpp/src/io/csv/reader_impl.cu | 83 ++---------- cpp/src/io/csv/reader_impl.hpp | 4 - cpp/src/io/functions.cpp | 174 ++++++++++++++++--------- cpp/src/io/json/reader_impl.cu | 89 +++---------- cpp/src/io/json/reader_impl.hpp | 7 +- cpp/src/io/orc/reader_impl.cu | 9 -- cpp/src/io/parquet/reader_impl.cu | 9 -- cpp/src/io/utilities/parsing_utils.cu | 34 ----- cpp/src/io/utilities/parsing_utils.cuh | 18 --- python/cudf/cudf/_lib/csv.pyx | 2 +- 18 files changed, 232 insertions(+), 344 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 455ffce7ed8..4545972e269 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -176,6 +176,40 @@ class csv_reader_options { */ std::size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + std::size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. + */ + std::size_t get_byte_range_padding() const + { + auto const num_names = _names.size(); + auto const num_dtypes = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = std::max(num_dtypes, num_names); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Returns names of the columns. */ @@ -1163,7 +1197,7 @@ class csv_reader_options_builder { * @return The set of columns along with metadata. 
*/ table_with_metadata read_csv( - csv_reader_options const& options, + csv_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 98483d1c03e..306c15dcb72 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -38,19 +38,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index ab26c01db74..2174b688da2 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -47,19 +47,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index d95af7a11da..14f27ef8eef 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -49,19 +49,6 @@ class reader { std::unique_ptr _impl; public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - /** * @brief Constructor from an array of datasources * diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 31201e30ac6..5f34803f28e 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -136,6 +136,38 @@ class json_reader_options { */ size_t get_byte_range_size() const { return _byte_range_size; } + /** + * @brief Returns number of bytes to read with padding. + */ + size_t get_byte_range_size_with_padding() const + { + if (_byte_range_size == 0) { + return 0; + } else { + return _byte_range_size + get_byte_range_padding(); + } + } + + /** + * @brief Returns number of bytes to pad when reading. 
+ */ + size_t get_byte_range_padding() const + { + auto const num_columns = std::visit([](const auto& dtypes) { return dtypes.size(); }, _dtypes); + + auto const max_row_bytes = 16 * 1024; // 16KB + auto const column_bytes = 64; + auto const base_padding = 1024; // 1KB + + if (num_columns == 0) { + // Use flat size if the number of columns is not known + return max_row_bytes; + } + + // Expand the size based on the number of columns, if available + return base_padding + num_columns * column_bytes; + } + /** * @brief Whether to read the file as a json object per line. */ @@ -328,7 +360,7 @@ class json_reader_options_builder { * @return The set of columns along with metadata. */ table_with_metadata read_json( - json_reader_options const& options, + json_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index f6ffdd99d35..08ea96139a1 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -474,16 +474,6 @@ table_with_metadata reader::impl::read(avro_reader_options const& options, return {std::make_unique(std::move(out_columns)), std::move(metadata_out)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, avro_reader_options const& options, diff --git a/cpp/src/io/comp/io_uncomp.h b/cpp/src/io/comp/io_uncomp.h index 8daf73ecd0c..7b1feb84813 100644 --- a/cpp/src/io/comp/io_uncomp.h +++ b/cpp/src/io/comp/io_uncomp.h @@ -16,12 +16,13 @@ #pragma once +#include +#include + #include #include #include -#include - using cudf::host_span; namespace cudf { @@ -42,7 +43,7 @@ enum { std::vector io_uncompress_single_h2d(void const* src, size_t src_size, int stream_type); -std::vector get_uncompressed_data(host_span data, std::string const& compression); +std::vector get_uncompressed_data(host_span data, compression_type compression); class HostDecompressor { public: diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 2cb99d897fe..e08cf1f8e1b 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -369,6 +369,7 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int // Unsupported format break; } + CUDF_EXPECTS(comp_data != nullptr, "Unsupported compressed stream type"); CUDF_EXPECTS(comp_len > 0, "Unsupported compressed stream type"); @@ -422,17 +423,17 @@ std::vector io_uncompress_single_h2d(const void* src, size_t src_size, int * @return Vector containing the output uncompressed data */ std::vector get_uncompressed_data(host_span const data, - std::string const& compression) + compression_type compression) { - int comp_type = IO_UNCOMP_STREAM_TYPE_INFER; - if (compression == "gzip") - comp_type = IO_UNCOMP_STREAM_TYPE_GZIP; - else if (compression == "zip") - comp_type = IO_UNCOMP_STREAM_TYPE_ZIP; - else if (compression == "bz2") - comp_type = IO_UNCOMP_STREAM_TYPE_BZIP2; - else if (compression == "xz") - comp_type = IO_UNCOMP_STREAM_TYPE_XZ; + auto const comp_type = [compression]() { + switch (compression) { + case compression_type::GZIP: return IO_UNCOMP_STREAM_TYPE_GZIP; + case 
compression_type::ZIP: return IO_UNCOMP_STREAM_TYPE_ZIP; + case compression_type::BZIP2: return IO_UNCOMP_STREAM_TYPE_BZIP2; + case compression_type::XZ: return IO_UNCOMP_STREAM_TYPE_XZ; + default: return IO_UNCOMP_STREAM_TYPE_INFER; + } + }(); return io_uncompress_single_h2d(data.data(), data.size(), comp_type); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 7f85589a8aa..579a8a5549b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -56,31 +56,6 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the CSV file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - if (num_columns == 0) { - // Use flat size if the number of columns is not known - return max_row_bytes; - } else { - // Expand the size based on the number of columns, if available - return base_padding + num_columns * column_bytes; - } -} - /** * @brief Translates a dtype string and returns its dtype enumeration and any * extended dtype flags that are supported by cuIO. Often, this is a column @@ -198,35 +173,22 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) std::pair, reader::impl::selected_rows_offsets> reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) { - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); + auto range_offset = opts_.get_byte_range_offset(); + auto range_size = opts_.get_byte_range_size(); + auto range_size_padded = opts_.get_byte_range_size_with_padding(); + auto skip_rows = opts_.get_skiprows(); + auto skip_end_rows = opts_.get_skipfooter(); + auto num_rows = opts_.get_nrows(); if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(compression_type_ == "none", + CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, "Reading compressed data using `byte range` is unsupported"); } - size_t map_range_size = 0; - if (range_size != 0) { - auto num_given_dtypes = - std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes()); - const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes); - map_range_size = range_size + calculateMaxRowSize(num_columns); - } - - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (source_ == nullptr) { - assert(!filepath_.empty()); - source_ = datasource::create(filepath_, range_offset, map_range_size); - } // Transfer source data to GPU if (!source_->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source_->size(); - auto buffer = source_->host_read(range_offset, data_size); + auto const data_size = (range_size_padded != 0) ? 
range_size_padded : source_->size(); + auto const buffer = source_->host_read(range_offset, data_size); auto h_data = host_span( // reinterpret_cast(buffer->data()), @@ -234,10 +196,11 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) std::vector h_uncomp_data_owner; - if (compression_type_ != "none") { - h_uncomp_data_owner = get_uncompressed_data(h_data, compression_type_); + if (opts_.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); h_data = h_uncomp_data_owner; } + // None of the parameters for row selection is used, we are parsing the entire file const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && skip_end_rows <= 0 && num_rows == -1; @@ -845,35 +808,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, } reader::impl::impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options) + : mr_(mr), source_(std::move(source)), opts_(options) { num_actual_cols_ = opts_.get_names().size(); num_active_cols_ = num_actual_cols_; - compression_type_ = - infer_compression_type(opts_.get_compression(), - filepath, - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - opts = make_parse_options(options, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - _impl = std::make_unique(nullptr, filepaths[0], options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, csv_reader_options const& options, @@ -881,7 +826,7 @@ reader::reader(std::vector>&& sources, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), "", options, stream, mr); + _impl = std::make_unique(std::move(sources[0]), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 4416457be16..de363a46ffe 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -72,13 +72,11 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. 
* * @param source Dataset source - * @param filepath Filepath if reading dataset from a file * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - std::string filepath, csv_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -213,8 +211,6 @@ class reader::impl { private: rmm::mr::device_memory_resource* mr_ = nullptr; std::unique_ptr source_; - std::string filepath_; - std::string compression_type_; const csv_reader_options opts_; cudf::size_type num_records_ = 0; // Number of rows with actual data diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index bf51012211c..438cb1762c6 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -106,76 +106,113 @@ chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder( } namespace { -template -std::unique_ptr make_reader(source_info const& src_info, - reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (src_info.type == io_type::FILEPATH) { - return std::make_unique(src_info.filepaths, options, stream, mr); - } - std::vector> datasources; - if (src_info.type == io_type::HOST_BUFFER) { - datasources = cudf::io::datasource::create(src_info.buffers); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - datasources = cudf::io::datasource::create(src_info.user_sources); - } else { - CUDF_FAIL("Unsupported source type"); +std::vector> make_datasources(source_info const& info, + size_t range_offset = 0, + size_t range_size = 0) +{ + switch (info.type) { + case io_type::FILEPATH: { + auto sources = std::vector>(); + for (auto const& filepath : info.filepaths) { + sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + } + return sources; + } + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + default: CUDF_FAIL("Unsupported source type"); } - - return std::make_unique(std::move(datasources), options, stream, mr); } -template -std::unique_ptr make_writer(sink_info const& sink, Ts&&... 
args) +std::unique_ptr make_datasink(sink_info const& info) { - if (sink.type == io_type::FILEPATH) { - return std::make_unique(cudf::io::data_sink::create(sink.filepath), - std::forward(args)...); - } - if (sink.type == io_type::HOST_BUFFER) { - return std::make_unique(cudf::io::data_sink::create(sink.buffer), - std::forward(args)...); - } - if (sink.type == io_type::VOID) { - return std::make_unique(cudf::io::data_sink::create(), std::forward(args)...); + switch (info.type) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); + case io_type::VOID: return cudf::io::data_sink::create(); + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + default: CUDF_FAIL("Unsupported sink type"); } - if (sink.type == io_type::USER_IMPLEMENTED) { - return std::make_unique(cudf::io::data_sink::create(sink.user_sink), - std::forward(args)...); - } - CUDF_FAIL("Unsupported sink type"); } } // namespace -table_with_metadata read_avro(avro_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_avro(avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) { namespace avro = cudf::io::detail::avro; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + auto datasources = make_datasources(options.get_source()); + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); +} + +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + + auto filepath = info.filepaths[0]; + + // Attempt to infer from the file extension + const auto pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](const auto& begin, const auto& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + const auto ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; } -table_with_metadata read_json(json_reader_options const& opts, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { namespace json = cudf::io::detail::json; CUDF_FUNC_RANGE(); - auto reader = make_reader(opts.get_source(), opts, rmm::cuda_stream_default, mr); - return reader->read(opts); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + + auto reader = + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + + return reader->read(options); } -table_with_metadata read_csv(csv_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { namespace csv = cudf::io::detail::csv; 
CUDF_FUNC_RANGE(); + + options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); + + auto datasources = make_datasources(options.get_source(), + options.get_byte_range_offset(), + options.get_byte_range_size_with_padding()); + auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(); } @@ -185,7 +222,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto writer = make_writer(options.get_sink(), options, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = + std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); writer->write(options.get_table(), options.get_metadata()); } @@ -294,8 +333,10 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = - make_reader(options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -305,11 +346,13 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ */ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { + namespace io_detail = cudf::io::detail; + CUDF_FUNC_RANGE(); - namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -317,12 +360,15 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& op, +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -354,8 +400,10 @@ table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto reader = make_reader( - options.get_source(), options, rmm::cuda_stream_default, mr); + + auto datasources = make_datasources(options.get_source()); + auto reader = std::make_unique( + std::move(datasources), options, rmm::cuda_stream_default, mr); return reader->read(options); } @@ -392,25 +440,31 @@ table_input_metadata::table_input_metadata(table_view const& table, std::unique_ptr> write_parquet(parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); namespace io_detail = cudf::io::detail; - auto writer = make_writer( - options.get_sink(), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, 
mr); + CUDF_FUNC_RANGE(); + + auto sink = make_datasink(options.get_sink()); + auto writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); + return writer->close(options.get_column_chunks_file_path()); } /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ -parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& op, +parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options const& options, rmm::mr::device_memory_resource* mr) { namespace io_detail = cudf::io::detail; - writer = make_writer( - op.get_sink(), op, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + + auto sink = make_datasink(options.get_sink()); + + writer = std::make_unique( + std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index f1080342312..bef97edc426 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -50,31 +50,6 @@ namespace detail { namespace json { using namespace cudf::io; -namespace { -/** - * @brief Estimates the maximum expected length or a row, based on the number - * of columns - * - * If the number of columns is not available, it will return a value large - * enough for most use cases - * - * @param[in] num_columns Number of columns in the JSON file (optional) - * - * @return Estimated maximum size of a row, in bytes - */ -constexpr size_t calculate_max_row_size(int num_columns = 0) noexcept -{ - constexpr size_t max_row_bytes = 16 * 1024; // 16KB - constexpr size_t column_bytes = 64; - constexpr size_t base_padding = 1024; // 1KB - return num_columns == 0 - ? max_row_bytes // Use flat size if the # of columns is not known - : base_padding + - num_columns * column_bytes; // Expand size based on the # of columns, if available -} - -} // anonymous namespace - /** * @brief Aggregate the table containing keys info by their hash values. 
* @@ -231,25 +206,12 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj * * @param[in] range_offset Number of bytes offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ -void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) +void reader::impl::ingest_raw_input(size_t range_offset, + size_t range_size, + size_t range_size_padded) { - size_t map_range_size = 0; - if (range_size != 0) { - auto const dtype_option_size = - std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); - map_range_size = range_size + calculate_max_row_size(dtype_option_size); - } - - // Support delayed opening of the file if using memory mapping datasource - // This allows only mapping of a subset of the file if using byte range - if (sources_.empty()) { - assert(!filepaths_.empty()); - for (const auto& path : filepaths_) { - sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); - } - } - // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; @@ -262,14 +224,14 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) size_t bytes_read = 0; for (const auto& source : sources_) { if (!source->is_empty()) { - auto data_size = (map_range_size != 0) ? map_range_size : source->size(); + auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); } } byte_range_offset_ = range_offset; byte_range_size_ = range_size; - load_whole_file_ = byte_range_offset_ == 0 && byte_range_size_ == 0; + load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; } /** @@ -280,11 +242,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) */ void reader::impl::decompress_input(rmm::cuda_stream_view stream) { - const auto compression_type = - infer_compression_type(options_.get_compression(), - filepaths_.size() > 0 ? filepaths_[0] : "", - {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); - if (compression_type == "none") { + if (options_.get_compression() == compression_type::NONE) { // Do not use the owner vector here to avoid extra copy uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); @@ -293,12 +251,12 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) host_span( // reinterpret_cast(buffer_.data()), buffer_.size()), - compression_type); + options_.get_compression()); uncomp_data_ = uncomp_data_owner_.data(); uncomp_size_ = uncomp_data_owner_.size(); } - if (load_whole_file_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); } rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) @@ -310,7 +268,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 
1 : 0); - if (load_whole_file_) { + if (load_whole_source_) { prefilter_count += count_all_from_set(data_, chars_to_count, stream); } else { prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); @@ -328,7 +286,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ std::vector chars_to_find{'\n'}; if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character - if (load_whole_file_) { + if (load_whole_source_) { find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); } else { find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); @@ -622,11 +580,10 @@ table_with_metadata reader::impl::convert_data_to_table(device_span>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) + : options_(options), mr_(mr), sources_(std::move(sources)) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -649,10 +606,11 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); + auto range_offset = options.get_byte_range_offset(); + auto range_size = options.get_byte_range_size(); + auto range_size_padded = options.get_byte_range_size_with_padding(); - ingest_raw_input(range_offset, range_size); + ingest_raw_input(range_offset, range_size, range_size_padded); CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); decompress_input(stream); @@ -674,26 +632,13 @@ table_with_metadata reader::impl::read(json_reader_options const& options, return convert_data_to_table(rec_starts, stream); } -// Forward to implementation -reader::reader(std::vector const& filepaths, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Delay actual instantiation of data source until read to allow for - // partial memory mapping of file using byte ranges - std::vector> src = {}; // Empty datasources - _impl = std::make_unique(std::move(src), filepaths, options, stream, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector file_paths = {}; // Empty filepaths - _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); + _impl = std::make_unique(std::move(sources), options, stream, mr); } // Destructor within this translation unit diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bbda7e9ba74..4d14edf360a 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -57,7 +57,6 @@ class reader::impl { rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; - std::vector filepaths_; std::vector buffer_; const char* uncomp_data_ = nullptr; @@ -69,7 +68,7 @@ class reader::impl { size_t byte_range_offset_ = 0; size_t byte_range_size_ = 0; - bool load_whole_file_ = true; + bool load_whole_source_ = true; table_metadata metadata_; std::vector dtypes_; @@ -110,8 +109,9 @@ class reader::impl { * * @param[in] range_offset Number of bytes 
offset from the start * @param[in] range_size Bytes to read; use `0` for all remaining data + * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data */ - void ingest_raw_input(size_t range_offset, size_t range_size); + void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); /** * @brief Extract the JSON objects keys from the input file with object rows. @@ -184,7 +184,6 @@ class reader::impl { * @brief Constructor from a dataset source with reader options. */ explicit impl(std::vector>&& sources, - std::vector const& filepaths, json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index f7bd5ae86b8..33d19aeeabf 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -1438,15 +1438,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - orc_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - _impl = std::make_unique(datasource::create(filepaths), options, mr); -} - // Forward to implementation reader::reader(std::vector>&& sources, orc_reader_options const& options, diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index caf11b66206..749ee38e816 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -1690,15 +1690,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } -// Forward to implementation -reader::reader(std::vector const& filepaths, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(datasource::create(filepaths), options, mr)) -{ -} - // Forward to implementation reader::reader(std::vector>&& sources, parquet_reader_options const& options, diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 6c8f01111e5..ba62238c5d3 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -209,39 +209,5 @@ cudf::size_type count_all_from_set(const char* h_data, return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); } -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map) -{ - auto str_tolower = [](const auto& begin, const auto& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - // Attempt to infer from user-supplied argument - if (compression_arg != compression_type::AUTO) { - switch (compression_arg) { - case compression_type::GZIP: return "gzip"; - case compression_type::BZIP2: return "bz2"; - case compression_type::ZIP: return "zip"; - case compression_type::XZ: return "xz"; - default: break; - } - } - - // Attempt to infer from the file extension - const auto pos = filename.find_last_of('.'); - if (pos != std::string::npos) { - const auto ext = str_tolower(filename.begin() + pos + 1, filename.end()); - for (const auto& mapping : ext_to_comp_map) { - if (mapping.first == ext) { return mapping.second; } - } - } - - return "none"; -} - } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 88297423b9b..daf23de7eb2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -454,24 +454,6 @@ cudf::size_type count_all_from_set(const char* h_data, const std::vector& keys, rmm::cuda_stream_view stream); -/** - * @brief Infer file compression type based on user supplied arguments. - * - * If the user specifies a valid compression_type for compression arg, - * compression type will be computed based on that. Otherwise the filename - * and ext_to_comp_map will be used. - * - * @param[in] compression_arg User specified compression type (if any) - * @param[in] filename Filename to base compression type (by extension) on - * @param[in] ext_to_comp_map User supplied mapping of file extension to compression type - * - * @return string representing compression type ("gzip, "bz2", etc) - */ -std::string infer_compression_type( - const compression_type& compression_arg, - const std::string& filename, - const std::vector>& ext_to_comp_map); - /** * @brief Checks whether the given character is a whitespace character. 
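With infer_compression_type removed, the JSON reader no longer guesses a codec from a file extension; it consults options_.get_compression() directly, so a caller who needs a specific codec sets it on the reader options. A hedged sketch of that caller-side flow, with an illustrative path and codec:

#include <cudf/io/json.hpp>

#include <string>

cudf::io::table_with_metadata read_gzipped_json_lines(std::string const& path)
{
  // Nothing is inferred from `path` here; the compression recorded on the options is
  // what the reader's decompress_input() acts on.
  auto opts = cudf::io::json_reader_options::builder(cudf::io::source_info{path})
                .lines(true)
                .compression(cudf::io::compression_type::GZIP)
                .build();
  return cudf::io::read_json(opts);
}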
* diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 812d614e6d3..9912a7801a4 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -112,7 +112,7 @@ cdef csv_reader_options make_csv_reader_options( bool na_filter, object prefix, object index_col, -) except +: +) except *: cdef source_info c_source_info = make_source_info([datasource]) cdef compression_type c_compression cdef size_type c_header From abba33f3364c7f240e6c8047069b03f4ea591024 Mon Sep 17 00:00:00 2001 From: NV-jpt <86264103+NV-jpt@users.noreply.github.com> Date: Tue, 24 Aug 2021 12:59:14 -0400 Subject: [PATCH 10/20] Add struct accessor to dask-cudf (#8874) This PR implements 'Struct Accessor' requested feature in dask-cudf (Issue [#8658](https://github.com/rapidsai/cudf/issues/8658)) StructMethod class implemented to expose 'field(key)' method in dask-cudf Examples -------- >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) >>> ds = dask_cudf.from_cudf(s, 2) >>> ds.struct.field(0).compute() 0 1 1 3 dtype: int64 >>> ds.struct.field('a').compute() 0 1 1 3 dtype: int64 Authors: - https://github.com/NV-jpt - https://github.com/shaneding Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/8874 --- python/dask_cudf/dask_cudf/accessors.py | 37 +++++++++++ python/dask_cudf/dask_cudf/core.py | 6 +- .../dask_cudf/tests/test_accessor.py | 62 +++++++++++++++++++ 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 04d3e20b844..77973ee34ff 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,6 +1,43 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
+class StructMethods: + def __init__(self, d_series): + self.d_series = d_series + + def field(self, key): + """ + Extract children of the specified struct column + in the Series + Parameters + ---------- + key: int or str + index/position or field name of the respective + struct column + Returns + ------- + Series + Examples + -------- + >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) + >>> ds = dask_cudf.from_cudf(s, 2) + >>> ds.struct.field(0).compute() + 0 1 + 1 3 + dtype: int64 + >>> ds.struct.field('a').compute() + 0 1 + 1 3 + dtype: int64 + """ + typ = self.d_series._meta.struct.field(key).dtype + + return self.d_series.map_partitions( + lambda s: s.struct.field(key), + meta=self.d_series._meta._constructor([], dtype=typ), + ) + + class ListMethods: def __init__(self, d_series): self.d_series = d_series diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 1a632907047..f1fb408b0d1 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -27,7 +27,7 @@ from cudf import _lib as libcudf from dask_cudf import sorting -from dask_cudf.accessors import ListMethods +from dask_cudf.accessors import ListMethods, StructMethods DASK_VERSION = LooseVersion(dask.__version__) @@ -414,6 +414,10 @@ def groupby(self, *args, **kwargs): def list(self): return ListMethods(self) + @property + def struct(self): + return StructMethods(self) + class Index(Series, dd.core.Index): _partition_type = cudf.Index diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 342f2b60180..8227023aa51 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -438,3 +438,65 @@ def test_sorting(data, ascending, na_position, ignore_index): .reset_index(drop=True) ) assert_eq(expect, got) + + +############################################################################# +# Struct Accessor # +############################################################################# +struct_accessor_data_params = [ + [{"a": 5, "b": 10}, {"a": 3, "b": 7}, {"a": -3, "b": 11}], + [{"a": None, "b": 1}, {"a": None, "b": 0}, {"a": -3, "b": None}], + [{"a": 1, "b": 2}], + [{"a": 1, "b": 3, "c": 4}], +] + + +@pytest.mark.parametrize( + "data", struct_accessor_data_params, +) +def test_create_struct_series(data): + expect = pd.Series(data) + ds_got = dgd.from_cudf(Series(data), 2) + assert_eq(expect, ds_got.compute()) + + +@pytest.mark.parametrize( + "data", struct_accessor_data_params, +) +def test_struct_field_str(data): + for test_key in ["a", "b"]: + expect = Series(data).struct.field(test_key) + ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) + assert_eq(expect, ds_got.compute()) + + +@pytest.mark.parametrize( + "data", struct_accessor_data_params, +) +def test_struct_field_integer(data): + for test_key in [0, 1]: + expect = Series(data).struct.field(test_key) + ds_got = dgd.from_cudf(Series(data), 2).struct.field(test_key) + assert_eq(expect, ds_got.compute()) + + +@pytest.mark.parametrize( + "data", struct_accessor_data_params, +) +def test_dask_struct_field_Key_Error(data): + got = dgd.from_cudf(Series(data), 2) + + # import pdb; pdb.set_trace() + with pytest.raises(KeyError): + got.struct.field("notakey").compute() + + +@pytest.mark.parametrize( + "data", struct_accessor_data_params, +) +def test_dask_struct_field_Int_Error(data): + # breakpoint() + got = dgd.from_cudf(Series(data), 2) + + with 
pytest.raises(IndexError): + got.struct.field(1000).compute() From a15349358ae2d46cd7f93751452607de4aa09f8b Mon Sep 17 00:00:00 2001 From: "Ram (Ramakrishna Prabhu)" <42624703+rgsl888prabhu@users.noreply.github.com> Date: Tue, 24 Aug 2021 23:37:05 +0530 Subject: [PATCH 11/20] Add support for reading ORC file with no row group index (#9060) The ORC reader in cuIO was designed thinking row group index is always available, which resulted in the failure. Changes have been made to read ORC files even in case group index stream is not available. closes #8878 Authors: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Devavret Makkar (https://github.com/devavret) - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9060 --- cpp/src/io/orc/reader_impl.cu | 51 ++++++++++++------ .../TestOrcFile.NoIndStrm.IntWithNulls.orc | Bin 0 -> 101 bytes ...dStrm.StructAndIntWithNulls.TwoStripes.orc | Bin 0 -> 232 bytes ...rcFile.NoIndStrm.StructAndIntWithNulls.orc | Bin 0 -> 193 bytes ...estOrcFile.NoIndStrm.StructWithNoNulls.orc | Bin 0 -> 167 bytes python/cudf/cudf/tests/test_orc.py | 18 +++++++ 6 files changed, 54 insertions(+), 15 deletions(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 33d19aeeabf..1b78d8b8585 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -269,6 +269,7 @@ class aggregate_orc_metadata { size_type const num_rows; size_type const num_columns; size_type const num_stripes; + bool row_grp_idx_present = true; /** * @brief Create a metadata object from each element in the source vector @@ -368,6 +369,8 @@ class aggregate_orc_metadata { return per_file_metadata[source_idx].get_column_name(column_idx); } + auto is_row_grp_idx_present() const { return row_grp_idx_present; } + std::vector select_stripes( std::vector> const& user_specified_stripes, size_type& row_start, @@ -457,6 +460,7 @@ class aggregate_orc_metadata { ProtobufReader(sf_data, sf_length) .read(per_file_metadata[mapping.source_idx].stripefooters[i]); mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + if (stripe->indexLength == 0) { row_grp_idx_present = false; } } } } @@ -1101,6 +1105,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Association between each ORC column and its cudf::column _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1); std::vector nested_col; + bool is_data_empty = false; // Get a list of column data types std::vector column_types; @@ -1157,6 +1162,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, const bool use_index = (_use_index == true) && + // Do stripes have row group index + _metadata->is_row_grp_idx_present() && // Only use if we don't have much work with complete columns & stripes // TODO: Consider nrows, gpu, and tune the threshold (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) && @@ -1204,13 +1211,21 @@ table_with_metadata 
reader::impl::read(size_type skip_rows, stream_info, level == 0); - CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe"); + if (total_data_size == 0) { + CUDF_EXPECTS(stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); + // In case ROW GROUP INDEX is not present and all columns are structs with no null + // stream, there is nothing to read at this level. + auto fn_check_dtype = [](auto dtype) { return dtype.id() == type_id::STRUCT; }; + CUDF_EXPECTS(std::all_of(column_types.begin(), column_types.end(), fn_check_dtype), + "Expected streams data within stripe"); + is_data_empty = true; + } stripe_data.emplace_back(total_data_size, stream); auto dst_base = static_cast(stripe_data.back().data()); // Coalesce consecutive streams into one read - while (stream_count < stream_info.size()) { + while (not is_data_empty and stream_count < stream_info.size()) { const auto d_dst = dst_base + stream_info[stream_count].dst_pos; const auto offset = stream_info[stream_count].offset; auto len = stream_info[stream_count].length; @@ -1292,8 +1307,10 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (chunk.type_kind == orc::TIMESTAMP) { chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); } - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + if (not is_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } } } stripe_start_row += num_rows_per_stripe; @@ -1327,7 +1344,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); } // Setup row group descriptors if using indexes - if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) { + if (_metadata->per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, @@ -1378,19 +1395,23 @@ table_with_metadata reader::impl::read(size_type skip_rows, out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr); } - decode_stream_data(chunks, - num_dict_entries, - skip_rows, - tz_table.view(), - row_groups, - _metadata->get_row_index_stride(), - out_buffers[level], - level, - stream); + if (not is_data_empty) { + decode_stream_data(chunks, + num_dict_entries, + skip_rows, + tz_table.view(), + row_groups, + _metadata->get_row_index_stride(), + out_buffers[level], + level, + stream); + } // Extract information to process nested child columns if (nested_col.size()) { - scan_null_counts(chunks, null_count_prefix_sums[level], stream); + if (not is_data_empty) { + scan_null_counts(chunks, null_count_prefix_sums[level], stream); + } row_groups.device_to_host(stream, true); aggregate_child_meta(chunks, row_groups, out_buffers[level], nested_col, level); } diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc new file mode 100644 index 0000000000000000000000000000000000000000..2103e0212fcdcc9a110e0dbe550d2fcb94bb640d GIT binary patch literal 101 zcmeYda%N><_ewK`ddygwP8Y z9x!aOO~_G|w<_diX#vDJf-glFyVRpXLuUW-OJMyhLL1)EO2t6qR(=1odbJ^{h+~)G&L%z`)@k Zz$nq6pv1($qrqUn+4O~(*+0lx3;@GaM5X`$ literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc new file mode 100644 index 
0000000000000000000000000000000000000000..1c6e53a0b92f68b996b4a249b7050eae715331e9 GIT binary patch literal 167 zcmeYda%N{>U}a3=E$=&YnDZ-Sh)PaGSs>4kt-x4PlN{u7zh5rdTp?f|Qb!=@QCW-OgC`Ejpf%R(Vxc42|d(gh3*9A*NH5)BGUObpx_ P3 Date: Tue, 24 Aug 2021 16:21:59 -0500 Subject: [PATCH 12/20] multibyte_split (#8702) Adds `multbyte_split` API, part of #8557. Takes one large text input and splits it in to a single strings column. - Features: - [x] split on multi-byte delimiters - [x] split on multiple delimiters simultaneously - [ ] erase delimiters from output (will implement later) - [ ] replace delimiters with alternate text (will implement later) - Supported input types - [x] `cudf::io::text::data_chunk_source` - [x] `cudf::string_scalar` via `cudf::device_span` - [x] `std::string` via `std::istream` - [x] files via `std::istream` - Supported delimiter type - [x] `std::string` - Performance Goals - [x] ~2G/s from file, ~4G/s on-device. There is room for improvement, but perf is good enough for now. - Additional goals: - [x] add reusable block-level pattern-matching utility. - [ ] add reusable block-level utility to "peek" at "future" scan states (will implement with delimiter erasure). Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - Elias Stehle (https://github.com/elstehle) - Vukasin Milovanovic (https://github.com/vuule) - Devavret Makkar (https://github.com/devavret) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/8702 --- conda/recipes/libcudf/meta.yaml | 10 +- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 5 + cpp/benchmarks/io/cuio_benchmark_common.hpp | 2 + .../io/text/multibyte_split_benchmark.cpp | 164 ++++++++ cpp/include/cudf/column/column_factories.hpp | 20 + .../cudf/io/text/data_chunk_source.hpp | 70 ++++ .../io/text/data_chunk_source_factories.hpp | 231 ++++++++++ .../cudf/io/text/detail/multistate.hpp | 155 +++++++ .../cudf/io/text/detail/tile_state.hpp | 134 ++++++ cpp/include/cudf/io/text/detail/trie.hpp | 264 ++++++++++++ cpp/include/cudf/io/text/multibyte_split.hpp | 37 ++ cpp/src/io/text/multibyte_split.cu | 396 ++++++++++++++++++ cpp/src/strings/strings_column_factories.cu | 42 ++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/text/multibyte_split_test.cpp | 143 +++++++ 16 files changed, 1673 insertions(+), 2 deletions(-) create mode 100644 cpp/benchmarks/io/text/multibyte_split_benchmark.cpp create mode 100644 cpp/include/cudf/io/text/data_chunk_source.hpp create mode 100644 cpp/include/cudf/io/text/data_chunk_source_factories.hpp create mode 100644 cpp/include/cudf/io/text/detail/multistate.hpp create mode 100644 cpp/include/cudf/io/text/detail/tile_state.hpp create mode 100644 cpp/include/cudf/io/text/detail/trie.hpp create mode 100644 cpp/include/cudf/io/text/multibyte_split.hpp create mode 100644 cpp/src/io/text/multibyte_split.cu create mode 100644 cpp/tests/io/text/multibyte_split_test.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 208c21c2dc0..0f05dcb4bb3 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -118,10 +118,9 @@ test: - test -f $PREFIX/include/cudf/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp - test -f $PREFIX/include/cudf/io/avro.hpp + - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/data_sink.hpp - test -f 
$PREFIX/include/cudf/io/datasource.hpp - - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/detail/avro.hpp - test -f $PREFIX/include/cudf/io/detail/csv.hpp - test -f $PREFIX/include/cudf/io/detail/json.hpp @@ -129,8 +128,15 @@ test: - test -f $PREFIX/include/cudf/io/detail/parquet.hpp - test -f $PREFIX/include/cudf/io/detail/utils.hpp - test -f $PREFIX/include/cudf/io/json.hpp + - test -f $PREFIX/include/cudf/io/orc_metadata.hpp - test -f $PREFIX/include/cudf/io/orc.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp + - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp + - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp + - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp + - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp + - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp + - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6b457a94d4..d9a493f57a0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -307,6 +307,7 @@ add_library(cudf src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu + src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 56f17dc7090..b3b92003573 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -245,3 +245,8 @@ ConfigureBench(STRINGS_BENCH # - json benchmark ------------------------------------------------------------------- ConfigureBench(JSON_BENCH string/json_benchmark.cpp) + +################################################################################################### +# - io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK + io/text/multibyte_split_benchmark.cpp) diff --git a/cpp/benchmarks/io/cuio_benchmark_common.hpp b/cpp/benchmarks/io/cuio_benchmark_common.hpp index 2c49386a901..7107585dbcc 100644 --- a/cpp/benchmarks/io/cuio_benchmark_common.hpp +++ b/cpp/benchmarks/io/cuio_benchmark_common.hpp @@ -33,6 +33,8 @@ using cudf::io::io_type; benchmark(name##_buffer_output, type_or_group, static_cast(io_type::HOST_BUFFER)); \ benchmark(name##_void_output, type_or_group, static_cast(io_type::VOID)); +std::string random_file_in_dir(std::string const& dir_path); + /** * @brief Class to create a coupled `source_info` and `sink_info` of given type. */ diff --git a/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp new file mode 100644 index 00000000000..cb8a61caa57 --- /dev/null +++ b/cpp/benchmarks/io/text/multibyte_split_benchmark.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +using cudf::test::fixed_width_column_wrapper; + +temp_directory const temp_dir("cudf_gbench"); + +enum data_chunk_source_type { + device, + file, + host, +}; + +static cudf::string_scalar create_random_input(int32_t num_chars, + double delim_factor, + double deviation, + std::string delim) +{ + auto const num_delims = static_cast((num_chars * delim_factor) / delim.size()); + auto const num_delim_chars = num_delims * delim.size(); + auto const num_value_chars = num_chars - num_delim_chars; + auto const num_rows = num_delims; + auto const value_size_avg = static_cast(num_value_chars / num_rows); + auto const value_size_min = static_cast(value_size_avg * (1 - deviation)); + auto const value_size_max = static_cast(value_size_avg * (1 + deviation)); + + data_profile table_profile; + + table_profile.set_distribution_params( // + cudf::type_id::STRING, + distribution_id::NORMAL, + value_size_min, + value_size_max); + + auto const values_table = create_random_table( // + {cudf::type_id::STRING}, + 1, + row_count{num_rows}, + table_profile); + + auto delim_scalar = cudf::make_string_scalar(delim); + auto delims_column = cudf::make_column_from_scalar(*delim_scalar, num_rows); + auto input_table = cudf::table_view({values_table->get_column(0).view(), delims_column->view()}); + auto input_column = cudf::strings::concatenate(input_table); + + // extract the chars from the returned strings column. + auto input_column_contents = input_column->release(); + auto chars_column_contents = input_column_contents.children[1]->release(); + auto chars_buffer = chars_column_contents.data.release(); + + // turn the chars in to a string scalar. 
+ return cudf::string_scalar(std::move(*chars_buffer)); +} + +static void BM_multibyte_split(benchmark::State& state) +{ + auto source_type = static_cast(state.range(0)); + auto delim_size = state.range(1); + auto delim_percent = state.range(2); + auto file_size_approx = state.range(3); + + CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1"); + CUDF_EXPECTS(delim_percent <= 50, "delimiter percent must be at most 50"); + + auto delim = std::string(":", delim_size); + + auto delim_factor = static_cast(delim_percent) / 100; + auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); + auto host_input = thrust::host_vector(device_input.size()); + auto host_string = std::string(host_input.data(), host_input.size()); + + cudaMemcpyAsync(host_input.data(), + device_input.data(), + device_input.size() * sizeof(char), + cudaMemcpyDeviceToHost, + rmm::cuda_stream_default); + + auto temp_file_name = random_file_in_dir(temp_dir.path()); + + { + auto temp_fostream = std::ofstream(temp_file_name, std::ofstream::out); + temp_fostream.write(host_input.data(), host_input.size()); + } + + cudaDeviceSynchronize(); + + auto source = std::unique_ptr(nullptr); + + switch (source_type) { + case data_chunk_source_type::file: // + source = cudf::io::text::make_source_from_file(temp_file_name); + break; + case data_chunk_source_type::host: // + source = cudf::io::text::make_source(host_string); + break; + case data_chunk_source_type::device: // + source = cudf::io::text::make_source(device_input); + break; + default: CUDF_FAIL(); + } + + for (auto _ : state) { + cuda_event_timer raii(state, true); + auto output = cudf::io::text::multibyte_split(*source, delim); + } + + state.SetBytesProcessed(state.iterations() * device_input.size()); +} + +class MultibyteSplitBenchmark : public cudf::benchmark { +}; + +#define TRANSPOSE_BM_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(MultibyteSplitBenchmark, name)(::benchmark::State & state) \ + { \ + BM_multibyte_split(state); \ + } \ + BENCHMARK_REGISTER_F(MultibyteSplitBenchmark, name) \ + ->ArgsProduct({{data_chunk_source_type::device, \ + data_chunk_source_type::file, \ + data_chunk_source_type::host}, \ + {1, 4, 7}, \ + {1, 25}, \ + {1 << 15, 1 << 30}}) \ + ->UseManualTime() \ + ->Unit(::benchmark::kMillisecond); + +TRANSPOSE_BM_BENCHMARK_DEFINE(multibyte_split_simple); diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index bdb7fd48e60..ebd7f5bbef0 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -442,6 +442,26 @@ std::unique_ptr make_strings_column( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Construct a STRING type column given offsets, columns, and optional null count and null + * mask. + * + * @param[in] num_strings The number of strings the column represents. + * @param[in] offsets The offset values for this column. The number of elements is one more than the + * total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the + * strings vector. + * @param[in] chars The char bytes for all the strings for this column. Individual strings are + * identified by the offsets and the nullmask. + * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for + * nulls is used for interpreting this bitmask. 
+ * @param[in] null_count The number of null string entries. + */ +std::unique_ptr make_strings_column(size_type num_strings, + rmm::device_uvector&& offsets, + rmm::device_uvector&& chars, + rmm::device_buffer&& null_mask = {}, + size_type null_count = cudf::UNKNOWN_NULL_COUNT); + /** * @brief Construct a LIST type column given offsets column, child column, null mask and null * count. diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp new file mode 100644 index 00000000000..6ee1fa033d0 --- /dev/null +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cudf { +namespace io { +namespace text { + +/** + * @brief a reader capable of producing views over device memory. + * + * The data chunk reader API encapsulates the idea of statefully traversing and loading a data + * source. A data source may be a file, a region of device memory, or a region of host memory. + * Reading data from these data sources efficiently requires different strategies dependings on the + * type of data source, type of compression, capabilities of the host and device, the data's + * destination. Whole-file decompression should be hidden behind this interface + * + */ +class data_chunk_reader { + public: + /** + * @brief Get the next chunk of bytes from the data source + * + * Performs any necessary work to read and prepare the underlying data source for consumption as a + * view over device memory. Common implementations may read from a file, copy data from host + * memory, allocate temporary memory, perform iterative decompression, or even launch device + * kernels. + * + * @param size number of bytes to read. + * @param stream stream to associate allocations or perform work required to obtain chunk + * @return a chunk of data up to @param size bytes. May return less than @param size bytes if + * reader reaches end of underlying data source. Returned data must be accessed in stream order + * relative to the specified @param stream. + */ + virtual device_span get_next_chunk(std::size_t size, + rmm::cuda_stream_view stream) = 0; +}; + +/** + * @brief a data source capable of creating a reader which can produce views of the data source in + * device memory. + * + */ +class data_chunk_source { + public: + virtual std::unique_ptr create_reader() const = 0; +}; + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp new file mode 100644 index 00000000000..f6807c1c9a8 --- /dev/null +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
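The new make_strings_column overload above takes ownership of the offsets and chars device_uvectors directly. A rough sketch of how it might be exercised, building a two-row column ("abc", "de") with no null mask; the detail vector_factories helper is the same one the trie code later in this patch uses:

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::column> make_two_strings(rmm::cuda_stream_view stream)
{
  auto offsets = cudf::detail::make_device_uvector_sync(
    std::vector<cudf::size_type>{0, 3, 5}, stream);
  auto chars = cudf::detail::make_device_uvector_sync(
    std::vector<char>{'a', 'b', 'c', 'd', 'e'}, stream);
  // No null mask: null_mask and null_count keep their defaulted values.
  return cudf::make_strings_column(2, std::move(offsets), std::move(chars));
}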
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace io { +namespace text { + +namespace { + +/** + * @brief a reader which produces views of device memory which contain a copy of the data from an + * istream. + * + */ +class istream_data_chunk_reader : public data_chunk_reader { + struct host_ticket { + cudaEvent_t event; + thrust::host_vector> buffer; + }; + + public: + istream_data_chunk_reader(std::unique_ptr datastream) + : _datastream(std::move(datastream)), _buffers(), _tickets(2) + { + // create an event to track the completion of the last device-to-host copy. + for (std::size_t i = 0; i < _tickets.size(); i++) { + CUDA_TRY(cudaEventCreate(&(_tickets[i].event))); + } + } + + ~istream_data_chunk_reader() + { + for (std::size_t i = 0; i < _tickets.size(); i++) { + CUDA_TRY(cudaEventDestroy(_tickets[i].event)); + } + } + + device_span find_or_create_data(std::size_t size, rmm::cuda_stream_view stream) + { + auto search = _buffers.find(stream.value()); + + if (search == _buffers.end() || search->second.size() < size) { + _buffers[stream.value()] = rmm::device_buffer(size, stream); + } + + return device_span(static_cast(_buffers[stream.value()].data()), size); + } + + device_span get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override + { + CUDF_FUNC_RANGE(); + + auto& h_ticket = _tickets[_next_ticket_idx]; + + _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size(); + + // synchronize on the last host-to-device copy, so we don't clobber the host buffer. + CUDA_TRY(cudaEventSynchronize(h_ticket.event)); + + // resize the host buffer as necessary to contain the requested number of bytes + if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + + // read data from the host istream in to the pinned host memory buffer + _datastream->read(h_ticket.buffer.data(), read_size); + + // adjust the read size to reflect how many bytes were actually read from the data stream + read_size = _datastream->gcount(); + + // get a view over some device memory we can use to buffer the read data on to device. + auto chunk_span = find_or_create_data(read_size, stream); + + // copy the host-pinned data on to device + CUDA_TRY(cudaMemcpyAsync( // + chunk_span.data(), + h_ticket.buffer.data(), + read_size, + cudaMemcpyHostToDevice, + stream.value())); + + // record the host-to-device copy. + CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); + + // return the view over device memory so it can be processed. 
+ return chunk_span; + } + + private: + std::size_t _next_ticket_idx = 0; + std::unique_ptr _datastream; + std::unordered_map _buffers; + std::vector _tickets; +}; + +/** + * @brief a reader which produces view of device memory which represent a subset of the input device + * span + * + */ +class device_span_data_chunk_reader : public data_chunk_reader { + public: + device_span_data_chunk_reader(device_span data) : _data(data) {} + + device_span get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override + { + // limit the read size to the number of bytes remaining in the device_span. + if (read_size > _data.size() - _position) { read_size = _data.size() - _position; } + + // create a view over the device span + auto chunk_span = _data.subspan(_position, read_size); + + // increment position + _position += read_size; + + // return the view over device memory so it can be processed. + return chunk_span; + } + + private: + device_span _data; + uint64_t _position = 0; +}; + +/** + * @brief a file data source which creates an istream_data_chunk_reader + * + */ +class file_data_chunk_source : public data_chunk_source { + public: + file_data_chunk_source(std::string filename) : _filename(filename) {} + std::unique_ptr create_reader() const override + { + return std::make_unique( + std::make_unique(_filename, std::ifstream::in)); + } + + private: + std::string _filename; +}; + +/** + * @brief a host string data source which creates an istream_data_chunk_reader + */ +class string_data_chunk_source : public data_chunk_source { + public: + string_data_chunk_source(std::string const& data) : _data(data) {} + std::unique_ptr create_reader() const override + { + return std::make_unique(std::make_unique(_data)); + } + + private: + std::string const& _data; +}; + +/** + * @brief a device span data source which creates an istream_data_chunk_reader + */ +class device_span_data_chunk_source : public data_chunk_source { + public: + device_span_data_chunk_source(device_span data) : _data(data) {} + std::unique_ptr create_reader() const override + { + return std::make_unique(_data); + } + + private: + device_span _data; +}; + +} // namespace + +/** + * @brief Creates a data source capable of producing device-buffered views of the given string. + */ +std::unique_ptr make_source(std::string const& data) +{ + return std::make_unique(data); +} + +/** + * @brief Creates a data source capable of producing device-buffered views of the file + */ +std::unique_ptr make_source_from_file(std::string const& filename) +{ + return std::make_unique(filename); +} + +/** + * @brief Creates a data source capable of producing views of the given device string scalar + */ +std::unique_ptr make_source(cudf::string_scalar& data) +{ + auto data_span = device_span(data.data(), data.size()); + return std::make_unique(data_span); +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp new file mode 100644 index 00000000000..d3c8909ab51 --- /dev/null +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
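Putting the factories above together, a consumer of a data_chunk_source drains it by repeatedly asking its reader for device-resident chunks. A small sketch, with a placeholder file name and chunk size:

#include <cudf/io/text/data_chunk_source_factories.hpp>

#include <rmm/cuda_stream_view.hpp>

void consume_in_chunks(std::size_t chunk_bytes, rmm::cuda_stream_view stream)
{
  auto source = cudf::io::text::make_source_from_file("input.txt");
  auto reader = source->create_reader();

  while (true) {
    auto chunk = reader->get_next_chunk(chunk_bytes, stream);
    if (chunk.size() == 0) { break; }  // underlying data source is exhausted
    // launch work over `chunk` here; the span is only valid in stream order and
    // may be recycled by the next get_next_chunk call on this reader.
  }
}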
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +namespace io { +namespace text { +namespace detail { + +/** + * @brief Represents up to 7 segments + */ +struct multistate { + private: + /** + * @brief represents a (head, tail] segment, stored as a single 8 bit value + */ + struct multistate_segment { + public: + /** + * @brief Creates a segment which represents (0, 0] + */ + + constexpr multistate_segment() : _data(0) {} + /** + * @brief Creates a segment which represents (head, tail] + * + * @param head the (head, ____] value. Undefined behavior for values >= 16 + * @param tail the (____, tail] value. Undefined behavior for values >= 16 + */ + + constexpr multistate_segment(uint8_t head, uint8_t tail) : _data((head & 0b1111) | (tail << 4)) + { + } + + /** + * @brief Get's the (head, ____] value from the segment. + */ + constexpr uint8_t get_head() const { return _data & 0b1111; } + + /** + * @brief Get's the (____, tail] value from the segment. + */ + constexpr uint8_t get_tail() const { return _data >> 4; } + + private: + uint8_t _data; + }; + + public: + /** + * @brief The maximum state (head or tail) this multistate can represent + */ + + static auto constexpr max_segment_value = 15; + /** + * @brief The maximum number of segments this multistate can represent + */ + static auto constexpr max_segment_count = 7; + + /** + * @brief Enqueues a (head, tail] segment to this multistate + * + * @note: The behavior of this function is undefined if size() => max_segment_count + */ + constexpr void enqueue(uint8_t head, uint8_t tail) + { + _segments[_size++] = multistate_segment(head, tail); + } + + /** + * @brief get's the number of segments this multistate represents + */ + constexpr uint8_t size() const { return _size; } + + /** + * @brief get's the highest (____, tail] value this multistate represents + */ + constexpr uint8_t max_tail() const + { + uint8_t maximum = 0; + + for (uint8_t i = 0; i < _size; i++) { + maximum = std::max(maximum, get_tail(i)); + } + + return maximum; + } + + /** + * @brief get's the Nth (head, ____] value state this multistate represents + */ + constexpr uint8_t get_head(uint8_t idx) const { return _segments[idx].get_head(); } + + /** + * @brief get's the Nth (____, tail] value state this multistate represents + */ + constexpr uint8_t get_tail(uint8_t idx) const { return _segments[idx].get_tail(); } + + private: + uint8_t _size = 0; + multistate_segment _segments[max_segment_count]; +}; + +/** + * @brief associatively inner-joins transition histories. + * + * Examples: + * <(0, 5]> + <(5, 9]> = <(0, 9]> + * <(0, 5]> + <(6, 9]> = <> + * <(0, 1], (0, 2]> + <(2, 3], (1, 4]> = <(0, 4], (0, 3]> + * <(0, 1], (0, 2]> + <(1, 3]> = <(0, 3]> + * + * Head and tail value are limited to [0, 1, ..., 16] + * + * @param lhs past segments + * @param rhs future segments + * @return full join of past and future segments + */ +constexpr multistate operator+(multistate const& lhs, multistate const& rhs) +{ + // combine two multistates together by full-joining LHS tails to RHS heads, + // and taking the corresponding LHS heads and RHS tails. 
+ + multistate result; + for (uint8_t lhs_idx = 0; lhs_idx < lhs.size(); lhs_idx++) { + auto tail = lhs.get_tail(lhs_idx); + for (uint8_t rhs_idx = 0; rhs_idx < rhs.size(); rhs_idx++) { + auto head = rhs.get_head(rhs_idx); + if (tail == head) { result.enqueue(lhs.get_head(lhs_idx), rhs.get_tail(rhs_idx)); } + } + } + return result; +} + +} // namespace detail +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp new file mode 100644 index 00000000000..849d857597b --- /dev/null +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -0,0 +1,134 @@ + +#pragma once + +#include + +#include + +namespace cudf { +namespace io { +namespace text { +namespace detail { + +enum class scan_tile_status : uint8_t { + oob, + invalid, + partial, + inclusive, +}; + +template +struct scan_tile_state_view { + uint64_t num_tiles; + cuda::atomic* tile_status; + T* tile_partial; + T* tile_inclusive; + + __device__ inline void set_status(cudf::size_type tile_idx, scan_tile_status status) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + tile_status[offset].store(status, cuda::memory_order_relaxed); + } + + __device__ inline void set_partial_prefix(cudf::size_type tile_idx, T value) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + cub::ThreadStore(tile_partial + offset, value); + tile_status[offset].store(scan_tile_status::partial); + } + + __device__ inline void set_inclusive_prefix(cudf::size_type tile_idx, T value) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + cub::ThreadStore(tile_inclusive + offset, value); + tile_status[offset].store(scan_tile_status::inclusive); + } + + __device__ inline T get_prefix(cudf::size_type tile_idx, scan_tile_status& status) + { + auto const offset = (tile_idx + num_tiles) % num_tiles; + + while ((status = tile_status[offset].load(cuda::memory_order_relaxed)) == + scan_tile_status::invalid) {} + + if (status == scan_tile_status::partial) { + return cub::ThreadLoad(tile_partial + offset); + } else { + return cub::ThreadLoad(tile_inclusive + offset); + } + } +}; + +template +struct scan_tile_state { + rmm::device_uvector> tile_status; + rmm::device_uvector tile_state_partial; + rmm::device_uvector tile_state_inclusive; + + scan_tile_state(cudf::size_type num_tiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + : tile_status(rmm::device_uvector>( + num_tiles, stream, mr)), + tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), + tile_state_inclusive(rmm::device_uvector(num_tiles, stream, mr)) + { + } + + operator scan_tile_state_view() + { + return scan_tile_state_view{tile_status.size(), + tile_status.data(), + tile_state_partial.data(), + tile_state_inclusive.data()}; + } + + inline T get_inclusive_prefix(cudf::size_type tile_idx, rmm::cuda_stream_view stream) const + { + auto const offset = (tile_idx + tile_status.size()) % tile_status.size(); + return tile_state_inclusive.element(offset, stream); + } +}; + +template +struct scan_tile_state_callback { + __device__ inline scan_tile_state_callback(scan_tile_state_view& tile_state, + cudf::size_type tile_idx) + : _tile_state(tile_state), _tile_idx(tile_idx) + { + } + + __device__ inline T operator()(T const& block_aggregate) + { + T exclusive_prefix; + + if (threadIdx.x == 0) { + _tile_state.set_partial_prefix(_tile_idx, block_aggregate); + + auto predecessor_idx = _tile_idx - 1; + auto 
predecessor_status = scan_tile_status::invalid; + + // scan partials to form prefix + + auto window_partial = _tile_state.get_prefix(predecessor_idx, predecessor_status); + while (predecessor_status != scan_tile_status::inclusive) { + predecessor_idx--; + auto predecessor_prefix = _tile_state.get_prefix(predecessor_idx, predecessor_status); + window_partial = predecessor_prefix + window_partial; + } + exclusive_prefix = window_partial; + + _tile_state.set_inclusive_prefix(_tile_idx, exclusive_prefix + block_aggregate); + } + + return exclusive_prefix; + } + + scan_tile_state_view& _tile_state; + cudf::size_type _tile_idx; +}; + +} // namespace detail +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp new file mode 100644 index 00000000000..d14fe15b0a9 --- /dev/null +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace io { +namespace text { +namespace detail { + +struct trie_node { + char token; + uint8_t match_length; + uint8_t child_begin; +}; + +struct trie_device_view { + device_span _nodes; + + /** + * @brief create a multistate which contains all partial path matches for the given token. + */ + constexpr multistate transition_init(char c) + { + auto result = multistate(); + + result.enqueue(0, 0); + + for (uint8_t curr = 0; curr < _nodes.size() - 1; curr++) { + transition_enqueue_all(c, result, curr, curr); + } + return result; + } + + /** + * @brief create a new multistate by transitioning all states in the multistate by the given token + * + * Eliminates any partial matches that cannot transition using the given token. + * + * @note always enqueues (0, 0] as the first state of the returned multistate. + */ + constexpr multistate transition(char c, multistate const& states) + { + auto result = multistate(); + + result.enqueue(0, 0); + + for (uint8_t i = 0; i < states.size(); i++) { + transition_enqueue_all(c, result, states.get_head(i), states.get_tail(i)); + } + + return result; + } + + /** + * @brief returns true if the given index is associated with a matching state. + */ + constexpr bool is_match(uint16_t idx) { return static_cast(get_match_length(idx)); } + + /** + * @brief returns the match length if the given index is associated with a matching state, + * otherwise zero. + */ + constexpr uint8_t get_match_length(uint16_t idx) { return _nodes[idx].match_length; } + + /** + * @brief returns the longest matching state of any state in the multistate. 
+ */ + template + constexpr uint8_t get_match_length(multistate const& states) + { + int8_t val = 0; + for (uint8_t i = 0; i < states.size(); i++) { + auto match_length = get_match_length(states.get_tail(i)); + if (match_length > val) { val = match_length; } + } + return val; + } + + private: + constexpr void transition_enqueue_all( // + char c, + multistate& states, + uint8_t head, + uint8_t curr) + { + for (uint32_t tail = _nodes[curr].child_begin; tail < _nodes[curr + 1].child_begin; tail++) { + if (_nodes[tail].token == c) { // + states.enqueue(head, tail); + } + } + } +}; + +/** + * @brief A flat trie contained in device memory. + */ +struct trie { + private: + cudf::size_type _max_duplicate_tokens; + rmm::device_uvector _nodes; + + trie(cudf::size_type max_duplicate_tokens, rmm::device_uvector&& nodes) + : _max_duplicate_tokens(max_duplicate_tokens), _nodes(std::move(nodes)) + { + } + + /** + * @brief Used to build a hierarchical trie which can then be flattened. + */ + struct trie_builder_node { + uint8_t match_length; + std::unordered_map> children; + + /** + * @brief Insert the string in to the trie tree, growing the trie as necessary + */ + void insert(std::string s) { insert(s.c_str(), s.size(), 0); } + + private: + trie_builder_node& insert(char const* s, uint16_t size, uint8_t depth) + { + if (size == 0) { + match_length = depth; + return *this; + } + + if (children[*s] == nullptr) { children[*s] = std::make_unique(); } + + return children[*s]->insert(s + 1, size - 1, depth + 1); + } + }; + + public: + /** + * @brief Gets the number of nodes contained in this trie. + */ + cudf::size_type size() const { return _nodes.size(); } + + /** + * @brief A pessimistic count of duplicate tokens in the trie. Used to determine the maximum + * possible stack size required to compute matches of this trie in parallel. + */ + cudf::size_type max_duplicate_tokens() const { return _max_duplicate_tokens; } + + /** + * @brief Create a trie which represents the given pattern. + * + * @param pattern The pattern to store in the trie + * @param stream The stream to use for allocation and copy + * @param mr Memory resource to use for the device memory allocation + * @return The trie. + */ + static trie create(std::string const& pattern, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + + { + return create(std::vector{pattern}, stream, mr); + } + + /** + * @brief Create a trie which represents the given pattern. + * + * @param pattern The patterns to store in the trie + * @param stream The stream to use for allocation and copy + * @param mr Memory resource to use for the device memory allocation + * @return The trie. 
+ */ + static trie create(std::vector const& patterns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + { + std::vector tokens; + std::vector transitions; + std::vector match_length; + + // create the trie tree + auto root = std::make_unique(); + for (auto& pattern : patterns) { + root->insert(pattern); + } + + // flatten + auto sum = 1; + transitions.emplace_back(sum); + match_length.emplace_back(root->match_length); + + auto builder_nodes = std::queue>(); + builder_nodes.push(std::move(root)); + + tokens.emplace_back(0); + + while (builder_nodes.size()) { + auto layer_size = builder_nodes.size(); + for (uint32_t i = 0; i < layer_size; i++) { + auto node = std::move(builder_nodes.front()); + builder_nodes.pop(); + sum += node->children.size(); + transitions.emplace_back(sum); + for (auto& item : node->children) { + match_length.emplace_back(item.second->match_length); + tokens.emplace_back(item.first); + builder_nodes.push(std::move(item.second)); + } + } + } + + tokens.emplace_back(0); + + match_length.emplace_back(0); + + std::vector trie_nodes; + auto token_counts = std::unordered_map(); + + for (uint32_t i = 0; i < tokens.size(); i++) { + trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + token_counts[tokens[i]]++; + } + + auto most_common_token = + std::max_element(token_counts.begin(), token_counts.end(), [](auto const& a, auto const& b) { + return a.second < b.second; + }); + + auto max_duplicate_tokens = most_common_token->second; + + return trie{max_duplicate_tokens, + cudf::detail::make_device_uvector_sync(trie_nodes, stream, mr)}; + } + + trie_device_view view() const { return trie_device_view{_nodes}; } +}; + +} // namespace detail +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp new file mode 100644 index 00000000000..d42ee9f510e --- /dev/null +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cudf { +namespace io { +namespace text { + +std::unique_ptr multibyte_split( + data_chunk_source const& source, + std::string const& delimiter, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu new file mode 100644 index 00000000000..662ec744680 --- /dev/null +++ b/cpp/src/io/text/multibyte_split.cu @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
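End to end, the public entry point declared just above in multibyte_split.hpp pairs with the data_chunk_source factories: pick a source, pass a delimiter, and get back a single strings column with one row per field. As the PR description notes, delimiter erasure is a follow-up, so at this stage each row still ends with the delimiter. A small sketch over a host string:

#include <cudf/column/column.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>

#include <memory>
#include <string>

std::unique_ptr<cudf::column> split_on_double_slash()
{
  // make_source keeps a reference to the string, so `data` must outlive the source.
  std::string data = "abc//def//ghi";
  auto source = cudf::io::text::make_source(data);
  return cudf::io::text::multibyte_split(*source, "//");
}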
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+
+namespace {
+
+using cudf::io::text::detail::multistate;
+
+int32_t constexpr ITEMS_PER_THREAD = 32;
+int32_t constexpr THREADS_PER_TILE = 128;
+int32_t constexpr ITEMS_PER_TILE   = ITEMS_PER_THREAD * THREADS_PER_TILE;
+int32_t constexpr TILES_PER_CHUNK  = 1024;
+int32_t constexpr ITEMS_PER_CHUNK  = ITEMS_PER_TILE * TILES_PER_CHUNK;
+
+struct PatternScan {
+  using BlockScan         = cub::BlockScan;
+  using BlockScanCallback = cudf::io::text::detail::scan_tile_state_callback;
+
+  struct _TempStorage {
+    typename BlockScan::TempStorage scan;
+  };
+
+  _TempStorage& _temp_storage;
+
+  using TempStorage = cub::Uninitialized<_TempStorage>;
+
+  __device__ inline PatternScan(TempStorage& temp_storage) : _temp_storage(temp_storage.Alias()) {}
+
+  __device__ inline void Scan(cudf::size_type tile_idx,
+                              cudf::io::text::detail::scan_tile_state_view tile_state,
+                              cudf::io::text::detail::trie_device_view trie,
+                              char (&thread_data)[ITEMS_PER_THREAD],
+                              uint32_t (&thread_state)[ITEMS_PER_THREAD])
+  {
+    auto thread_multistate = trie.transition_init(thread_data[0]);
+
+    for (uint32_t i = 1; i < ITEMS_PER_THREAD; i++) {
+      thread_multistate = trie.transition(thread_data[i], thread_multistate);
+    }
+
+    auto prefix_callback = BlockScanCallback(tile_state, tile_idx);
+
+    BlockScan(_temp_storage.scan)
+      .ExclusiveSum(thread_multistate, thread_multistate, prefix_callback);
+
+    for (uint32_t i = 0; i < ITEMS_PER_THREAD; i++) {
+      thread_multistate = trie.transition(thread_data[i], thread_multistate);
+
+      thread_state[i] = thread_multistate.max_tail();
+    }
+  }
+};
+
+// multibyte_split works by splitting up inputs into 32 inputs (bytes) per thread, and transforming
+// them into data structures called "multistates". These multistates are created by searching a
+// trie, but instead of a traditional trie where the search begins at a single node at the beginning,
+// we allow our search to begin anywhere within the trie tree. The position within the trie tree is
+// stored as a "partial match path", which indicates "we can get from here to there by a set of
+// specific transitions". By scanning together multistates, we effectively know "we can get here
+// from the beginning by following the inputs". By doing this, each thread knows exactly what state
+// it begins in. From there, each thread can then take deterministic action. In this case, the
+// deterministic action is counting and outputting delimiter offsets when a delimiter is found.
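+//
+// For example, with the delimiter "abac", a thread whose slice of the input begins with "ac"
+// cannot tell on its own whether those two bytes complete a match; that depends on whether the
+// preceding input ended part-way through "ab". The multistate records, for every trie position the
+// slice could have been entered in, the position it would end in, so composing multistates during
+// the scan resolves the ambiguity without any thread re-reading earlier input.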
+ +__global__ void multibyte_split_init_kernel( + cudf::size_type base_tile_idx, + cudf::size_type num_tiles, + cudf::io::text::detail::scan_tile_state_view tile_multistates, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_status status = + cudf::io::text::detail::scan_tile_status::invalid) +{ + auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx < num_tiles) { + auto const tile_idx = base_tile_idx + thread_idx; + tile_multistates.set_status(tile_idx, status); + tile_output_offsets.set_status(tile_idx, status); + } +} + +__global__ void multibyte_split_seed_kernel( + cudf::io::text::detail::scan_tile_state_view tile_multistates, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + multistate tile_multistate_seed, + uint32_t tile_output_offset) +{ + auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_idx == 0) { + tile_multistates.set_inclusive_prefix(-1, tile_multistate_seed); + tile_output_offsets.set_inclusive_prefix(-1, tile_output_offset); + } +} + +__global__ void multibyte_split_kernel( + cudf::size_type base_tile_idx, + cudf::io::text::detail::scan_tile_state_view tile_multistates, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::trie_device_view trie, + int32_t chunk_input_offset, + cudf::device_span chunk_input_chars, + cudf::device_span abs_output_delimiter_offsets, + cudf::device_span abs_output_chars) +{ + using InputLoad = + cub::BlockLoad; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + + __shared__ union { + typename InputLoad::TempStorage input_load; + typename PatternScan::TempStorage pattern_scan; + typename OffsetScan::TempStorage offset_scan; + } temp_storage; + + int32_t const tile_idx = base_tile_idx + blockIdx.x; + int32_t const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; + int32_t const thread_input_offset = tile_input_offset + threadIdx.x * ITEMS_PER_THREAD; + int32_t const thread_input_size = chunk_input_chars.size() - thread_input_offset; + + // STEP 1: Load inputs + + char thread_chars[ITEMS_PER_THREAD]; + + InputLoad(temp_storage.input_load) + .Load(chunk_input_chars.data() + tile_input_offset, + thread_chars, + chunk_input_chars.size() - tile_input_offset); + + // STEP 2: Scan inputs to determine absolute thread states + + uint32_t thread_states[ITEMS_PER_THREAD]; + + __syncthreads(); // required before temp_memory re-use + PatternScan(temp_storage.pattern_scan) + .Scan(tile_idx, tile_multistates, trie, thread_chars, thread_states); + + // STEP 3: Flag matches + + uint32_t thread_offsets[ITEMS_PER_THREAD]; + + for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { + thread_offsets[i] = i < thread_input_size and trie.is_match(thread_states[i]); + } + + // STEP 4: Scan flags to determine absolute thread output offset + + auto prefix_callback = OffsetScanCallback(tile_output_offsets, tile_idx); + + __syncthreads(); // required before temp_memory re-use + OffsetScan(temp_storage.offset_scan) + .ExclusiveSum(thread_offsets, thread_offsets, prefix_callback); + + // Step 5: Assign outputs from each thread using match offsets. 
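+  // (After the exclusive scan above, thread_offsets[i] holds the number of delimiters found
+  // earlier in the scan, which is exactly the output slot to use when item i itself completes a
+  // delimiter.)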
+ + if (abs_output_chars.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { + abs_output_chars[chunk_input_offset + thread_input_offset + i] = thread_chars[i]; + } + } + + if (abs_output_delimiter_offsets.size() > 0) { + for (int32_t i = 0; i < ITEMS_PER_THREAD and i < thread_input_size; i++) { + if (trie.is_match(thread_states[i])) { + auto const match_end = base_tile_idx * ITEMS_PER_TILE + thread_input_offset + i + 1; + abs_output_delimiter_offsets[thread_offsets[i]] = match_end; + } + } + } +} + +} // namespace + +namespace cudf { +namespace io { +namespace text { +namespace detail { + +void fork_stream(std::vector streams, rmm::cuda_stream_view stream) +{ + cudaEvent_t event; + cudaEventCreate(&event); + cudaEventRecord(event, stream); + for (uint32_t i = 0; i < streams.size(); i++) { + cudaStreamWaitEvent(streams[i], event, 0); + } + cudaEventDestroy(event); +} + +void join_stream(std::vector streams, rmm::cuda_stream_view stream) +{ + cudaEvent_t event; + cudaEventCreate(&event); + for (uint32_t i = 0; i < streams.size(); i++) { + cudaEventRecord(event, streams[i]); + cudaStreamWaitEvent(stream, event, 0); + } + cudaEventDestroy(event); +} + +std::vector get_streams(int32_t count, rmm::cuda_stream_pool& stream_pool) +{ + auto streams = std::vector(); + for (int32_t i = 0; i < count; i++) { + streams.emplace_back(stream_pool.get_stream()); + } + return streams; +} + +cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_source const& source, + cudf::io::text::detail::trie const& trie, + scan_tile_state& tile_multistates, + scan_tile_state& tile_offsets, + device_span output_buffer, + device_span output_char_buffer, + rmm::cuda_stream_view stream, + std::vector const& streams) +{ + CUDF_FUNC_RANGE(); + cudf::size_type chunk_offset = 0; + + multibyte_split_init_kernel<<>>( // + -TILES_PER_CHUNK, + TILES_PER_CHUNK, + tile_multistates, + tile_offsets, + cudf::io::text::detail::scan_tile_status::oob); + + auto multistate_seed = multistate(); + multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. + + // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as + // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block + // would have to follow seperate logic. 
+ multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // + tile_multistates, + tile_offsets, + multistate_seed, + 0); + + fork_stream(streams, stream); + + auto reader = source.create_reader(); + + cudaEvent_t last_launch_event; + cudaEventCreate(&last_launch_event); + + for (int32_t i = 0; true; i++) { + auto base_tile_idx = i * TILES_PER_CHUNK; + auto chunk_stream = streams[i % streams.size()]; + auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, chunk_stream); + + if (chunk.size() == 0) { break; } + + auto tiles_in_launch = + cudf::util::div_rounding_up_safe(chunk.size(), static_cast(ITEMS_PER_TILE)); + + // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // + base_tile_idx, + tiles_in_launch, + tile_multistates, + tile_offsets); + + cudaStreamWaitEvent(chunk_stream, last_launch_event, 0); + + multibyte_split_kernel<<>>( // + base_tile_idx, + tile_multistates, + tile_offsets, + trie.view(), + chunk_offset, + chunk, + output_buffer, + output_char_buffer); + + cudaEventRecord(last_launch_event, chunk_stream); + + chunk_offset += chunk.size(); + } + + cudaEventDestroy(last_launch_event); + + join_stream(streams, stream); + + return chunk_offset; +} + +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + rmm::cuda_stream_pool& stream_pool) +{ + CUDF_FUNC_RANGE(); + auto const trie = cudf::io::text::detail::trie::create({delimiter}, stream); + + CUDF_EXPECTS(trie.max_duplicate_tokens() < multistate::max_segment_count, + "delimiter contains too many duplicate tokens to produce a deterministic result."); + + CUDF_EXPECTS(trie.size() < multistate::max_segment_value, + "delimiter contains too many total tokens to produce a deterministic result."); + + auto concurrency = 2; + // must be at least 32 when using warp-reduce on partials + // must be at least 1 more than max possible concurrent tiles + // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s + auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); + auto tile_multistates = scan_tile_state(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); + + auto streams = get_streams(concurrency, stream_pool); + + auto bytes_total = + multibyte_split_scan_full_source(source, + trie, + tile_multistates, + tile_offsets, + cudf::device_span(static_cast(nullptr), 0), + cudf::device_span(static_cast(nullptr), 0), + stream, + streams); + + // allocate results + auto num_tiles = cudf::util::div_rounding_up_safe(bytes_total, ITEMS_PER_TILE); + auto num_results = tile_offsets.get_inclusive_prefix(num_tiles - 1, stream); + auto string_offsets = rmm::device_uvector(num_results + 2, stream, mr); + auto string_chars = rmm::device_uvector(bytes_total, stream, mr); + + // first and last element are set manually to zero and size of input, respectively. 
+ // kernel is only responsible for determining delimiter offsets + auto string_count = static_cast(string_offsets.size() - 1); + string_offsets.set_element_to_zero_async(0, stream); + string_offsets.set_element_async(string_count, bytes_total, stream); + + multibyte_split_scan_full_source( + source, + trie, + tile_multistates, + tile_offsets, + cudf::device_span(string_offsets).subspan(1, num_results), + string_chars, + stream, + streams); + + return cudf::make_strings_column( + string_count, std::move(string_offsets), std::move(string_chars)); +} + +} // namespace detail + +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + rmm::mr::device_memory_resource* mr) +{ + auto stream = rmm::cuda_stream_default; + auto stream_pool = rmm::cuda_stream_pool(2); + auto result = detail::multibyte_split(source, delimiter, stream, mr, stream_pool); + + stream.synchronize(); + + return result; +} + +} // namespace text +} // namespace io +} // namespace cudf diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index abf1f9599dc..c89f1b756d6 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -137,4 +137,46 @@ std::unique_ptr make_strings_column(size_type num_strings, std::move(children)); } +std::unique_ptr make_strings_column(size_type num_strings, + rmm::device_uvector&& offsets, + rmm::device_uvector&& chars, + rmm::device_buffer&& null_mask, + size_type null_count) +{ + CUDF_FUNC_RANGE(); + + auto const offsets_size = static_cast(offsets.size()); + auto const chars_size = static_cast(chars.size()); + + if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); + + CUDF_EXPECTS(num_strings == offsets_size - 1, "Invalid offsets column size for strings column."); + + auto offsets_column = std::make_unique( // + data_type{type_id::INT32}, + offsets_size, + offsets.release(), + rmm::device_buffer(), + 0); + + auto chars_column = std::make_unique( // + data_type{type_id::INT8}, + chars_size, + chars.release(), + rmm::device_buffer(), + 0); + + auto children = std::vector>(); + + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique(data_type{type_id::STRING}, + num_strings, + rmm::device_buffer{}, + std::move(null_mask), + null_count, + std::move(children)); +} + } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 19421e3115d..edfbba74eb1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -195,6 +195,7 @@ ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) +ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp new file mode 100644 index 00000000000..d1fa787e000 --- /dev/null +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace cudf; +using namespace test; + +// 😀 | F0 9F 98 80 | 11110000 10011111 10011000 10000000 +// 😎 | F0 9F 98 8E | 11110000 10011111 10011000 10001110 + +struct MultibyteSplitTest : public BaseFixture { +}; + +TEST_F(MultibyteSplitTest, NondeterministicMatching) +{ + auto delimiter = std::string("abac"); + auto host_input = std::string("ababacabacab"); + + auto expected = strings_column_wrapper{"ababac", "abac", "ab"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + +TEST_F(MultibyteSplitTest, DelimiterAtEnd) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abcdefg:"); + + auto expected = strings_column_wrapper{"abcdefg:", ""}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + +TEST_F(MultibyteSplitTest, LargeInput) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { + host_input += "...:|"; + host_expected.emplace_back(std::string("...:|")); + } + + host_expected.emplace_back(std::string("")); + + auto expected = strings_column_wrapper{host_expected.begin(), host_expected.end()}; + + auto delimiter = std::string("...:|"); + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); +} + +TEST_F(MultibyteSplitTest, OverlappingMatchErasure) +{ + auto delimiter = "::"; + + auto host_input = std::string( + ":::::" + ":::::"); + auto expected = strings_column_wrapper{":::::", ":::::"}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiter); + + // CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); // this use case it not yet supported. 
+} + +TEST_F(MultibyteSplitTest, HandpickedInput) +{ + auto delimiters = "::|"; + auto host_input = std::string( + "aaa::|" + "bbb::|" + "ccc::|" + "ddd::|" + "eee::|" + "fff::|" + "ggg::|" + "hhh::|" + "___::|" + "here::|" + "is::|" + "another::|" + "simple::|" + "text::|" + "seperated::|" + "by::|" + "emojis::|" + "which::|" + "are::|" + "multiple::|" + "bytes::|" + "and::|" + "used::|" + "as::|" + "delimiters.::|" + "::|" + "::|" + "::|"); + + auto expected = strings_column_wrapper{ + "aaa::|", "bbb::|", "ccc::|", "ddd::|", "eee::|", "fff::|", + "ggg::|", "hhh::|", "___::|", "here::|", "is::|", "another::|", + "simple::|", "text::|", "seperated::|", "by::|", "emojis::|", "which::|", + "are::|", "multiple::|", "bytes::|", "and::|", "used::|", "as::|", + "delimiters.::|", "::|", "::|", "::|", ""}; + + auto source = cudf::io::text::make_source(host_input); + auto out = cudf::io::text::multibyte_split(*source, delimiters); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, debug_output_level::ALL_ERRORS); +} From 5647b535b2a1546c56ddfd12e7cbd2fb198e64e8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 24 Aug 2021 17:52:20 -0500 Subject: [PATCH 13/20] Add support for BaseIndexer in Rolling APIs (#9085) Fixes: #9085 This PR adds support for `BaseIndexer` subclass support in `Rolling` APIs. This also contains a fix related to `fillna` - testcase added. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9085 --- python/cudf/cudf/core/column/numerical.py | 3 + python/cudf/cudf/core/window/rolling.py | 67 ++++++++++++++++------- python/cudf/cudf/tests/test_replace.py | 2 + python/cudf/cudf/tests/test_rolling.py | 48 ++++++++++++++++ 4 files changed, 99 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index bc12b42a3fa..db1829d5f38 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -375,6 +375,9 @@ def fillna( else: col = self + if col.null_count == 0: + return col + if method is not None: return super(NumericalColumn, col).fillna(fill_value, method) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index e3ed15ba2a6..317ce29d00e 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -4,6 +4,7 @@ import numba import pandas as pd +from pandas.api.indexers import BaseIndexer import cudf from cudf import _lib as libcudf @@ -20,7 +21,7 @@ class Rolling(GetAttrGetItemMixin): Parameters ---------- - window : int or offset + window : int, offset or a BaseIndexer subclass Size of the window, i.e., the number of observations used to calculate the statistic. For datetime indexes, an offset can be provided instead @@ -28,6 +29,8 @@ class Rolling(GetAttrGetItemMixin): As opposed to a fixed window size, each window will be sized to accommodate observations within the time period specified by the offset. + If a BaseIndexer subclass is passed, calculates the window + boundaries based on the defined ``get_window_bounds`` method. min_periods : int, optional The minimum number of observations in the window that are required to be non-null, so that the result is non-null. 
@@ -195,26 +198,46 @@ def __getitem__(self, arg): ) def _apply_agg_series(self, sr, agg_name): + source_column = sr._column + min_periods = self.min_periods or 1 if isinstance(self.window, int): - result_col = libcudf.rolling.rolling( - sr._column, - None, - None, - self.window, - self.min_periods, - self.center, - agg_name, + preceding_window = None + following_window = None + window = self.window + elif isinstance(self.window, BaseIndexer): + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, ) + start = as_column(start, dtype="int32") + end = as_column(end, dtype="int32") + + idx = cudf.core.column.arange(len(start)) + preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( + "int32" + ) + following_window = (end - idx - cudf.Scalar(1, "int32")).astype( + "int32" + ) + window = None else: - result_col = libcudf.rolling.rolling( - sr._column, - as_column(self.window), - column.full(self.window.size, 0, dtype=self.window.dtype), - None, - self.min_periods, - self.center, - agg_name, + preceding_window = as_column(self.window) + following_window = column.full( + self.window.size, 0, dtype=self.window.dtype ) + window = None + + result_col = libcudf.rolling.rolling( + source_column=source_column, + pre_column_window=preceding_window, + fwd_column_window=following_window, + window=window, + min_periods=min_periods, + center=self.center, + op=agg_name, + ) return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): @@ -305,15 +328,17 @@ def _normalize(self): if self.min_periods is None: min_periods = window else: - if isinstance(window, numba.cuda.devicearray.DeviceNDArray): - # window is a device_array of window sizes + if isinstance( + window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer) + ): + # window is a device_array of window sizes or BaseIndexer self.window = window self.min_periods = min_periods return if not isinstance(self.obj.index, cudf.core.index.DatetimeIndex): raise ValueError( - "window must be an integer for " "non datetime index" + "window must be an integer for non datetime index" ) self._time_window = True @@ -326,7 +351,7 @@ def _normalize(self): window = window.to_timedelta64() except ValueError as e: raise ValueError( - "window must be integer or " "convertible to a timedelta" + "window must be integer or convertible to a timedelta" ) from e if self.min_periods is None: min_periods = 1 diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 33bef2c677b..f60baec746f 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -657,6 +657,7 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace): pd.DataFrame( {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"] ), + pd.DataFrame({"a": [1, 2, 3]}), ], ) @pytest.mark.parametrize( @@ -671,6 +672,7 @@ def test_fillna_method_fixed_width_non_num(data, container, method, inplace): {"b": pd.Series([11, 22, 33], index=["a", "p", "z"])}, {"a": 5, "b": pd.Series([3, 4, 5], index=["a", "p", "z"])}, {"c": 100}, + np.nan, ], ) @pytest.mark.parametrize("inplace", [True, False]) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 07e7f43c992..8a8293cd090 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -369,3 +369,51 @@ def test_rolling_groupby_offset(agg, window_size): ) got = 
getattr(gdf.groupby("group").rolling(window_size), agg)().fillna(-1) assert_eq(expect, got, check_dtype=False) + + +def test_rolling_custom_index_support(): + from pandas.api.indexers import BaseIndexer + + class CustomIndexer(BaseIndexer): + def get_window_bounds(self, num_values, min_periods, center, closed): + start = np.empty(num_values, dtype=np.int64) + end = np.empty(num_values, dtype=np.int64) + + for i in range(num_values): + if self.use_expanding[i]: + start[i] = 0 + end[i] = i + 1 + else: + start[i] = i + end[i] = i + self.window_size + + return start, end + + use_expanding = [True, False, True, False, True] + indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + + df = pd.DataFrame({"values": range(5)}) + gdf = cudf.from_pandas(df) + + expected = df.rolling(window=indexer).sum() + actual = gdf.rolling(window=indexer).sum() + + assert_eq(expected, actual, check_dtype=False) + + +@pytest.mark.parametrize( + "indexer", + [ + pd.api.indexers.FixedForwardWindowIndexer(window_size=2), + pd.core.window.indexers.ExpandingIndexer(), + pd.core.window.indexers.FixedWindowIndexer(window_size=3), + ], +) +def test_rolling_indexer_support(indexer): + df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) + gdf = cudf.from_pandas(df) + + expected = df.rolling(window=indexer, min_periods=2).sum() + actual = gdf.rolling(window=indexer, min_periods=2).sum() + + assert_eq(expected, actual) From 359be0a24f702926c276d70c1e3f7f533ab63551 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 24 Aug 2021 20:02:45 -0400 Subject: [PATCH 14/20] Import fix (#9104) Closes https://github.com/rapidsai/cudf/issues/9084 Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9104 --- python/cudf/cudf/utils/cudautils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index fb7163c52e3..727bbb1c345 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -240,6 +240,7 @@ def compile_udf(udf, type_signature): An numpy type """ + import cudf.core.udf # Check if we've already compiled a similar (but possibly distinct) # function before From 44bf4baead3950a1db5c9aad3c3d4d49a3273673 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 24 Aug 2021 20:52:48 -0400 Subject: [PATCH 15/20] Fix compile warnings found using nvcc 11.4 (#9101) While looking into a `compute-sanitizer` issue, I found the problem with the tool had been fixed in 11.4. Building libcudf in 11.4 uncovered some new compile warnings which are fixed in this PR. All the warnings were identifying unused variables. 
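For readers unfamiliar with the pattern: an unused-parameter warning is silenced in C++ by dropping
(or commenting out) the parameter name, while unused locals are simply deleted. A minimal sketch of
the idea, using a hypothetical helper rather than code from this patch:

    #include <rmm/mr/device/device_memory_resource.hpp>

    // The memory resource is part of the required call signature but is not used in this
    // overload, so the parameter is left unnamed and the compiler no longer flags it.
    int count_rows(int num_rows, rmm::mr::device_memory_resource* /*mr*/)
    {
      return num_rows;
    }

The same treatment shows up in the diff below: parameters such as `mr` in `binary_ops.cu` lose
their names, and unused locals such as the `col_ref_left_1`/`col_ref_right_1` constants in the
conditional join tests are removed.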
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9101 --- cpp/src/binaryop/compiled/binary_ops.cu | 3 +-- cpp/src/io/orc/reader_impl.cu | 2 -- cpp/src/merge/merge.cu | 9 ++++----- cpp/tests/join/conditional_join_tests.cu | 4 +--- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 2b38224864a..7b0139a0082 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -43,7 +43,7 @@ struct scalar_as_column_device_view { template ())>* = nullptr> return_type operator()(scalar const& s, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource*) { auto& h_scalar_type_view = static_cast&>(const_cast(s)); auto col_v = @@ -201,7 +201,6 @@ struct null_considering_binop { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - std::unique_ptr out; // Create device views for inputs auto const lhs_dev_view = get_device_view(lhs); auto const rhs_dev_view = get_device_view(rhs); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 1b78d8b8585..83be58f5e56 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -176,7 +176,6 @@ size_t gather_stream_info(const size_t stripe_index, const orc::StripeInformation* stripeinfo, const orc::StripeFooter* stripefooter, const std::vector& orc2gdf, - const std::vector& gdf2orc, const std::vector types, bool use_index, size_t* num_dictionary_entries, @@ -1203,7 +1202,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, stripe_info, stripe_footer, _col_meta.orc_col_map[level], - selected_columns, _metadata->get_types(), use_index, &num_dict_entries, diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 38025a8a0ed..147db2fdfe7 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -225,11 +225,10 @@ struct column_merger { explicit column_merger(index_vector const& row_order) : row_order_(row_order) {} template ())> - std::unique_ptr operator()( - column_view const& lcol, - column_view const& rcol, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const + std::unique_ptr operator()(column_view const&, + column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Unsupported type for merge."); } diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 8018d613e05..d566d2086bb 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,8 +44,6 @@ constexpr cudf::size_type JoinNoneValue = // Common column references. 
const auto col_ref_left_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); -const auto col_ref_left_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::LEFT); -const auto col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); // Common expressions. auto left_zero_eq_right_zero = From f0fa255add77daf6fd14b714286d01d5c2b4d082 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 25 Aug 2021 10:52:24 -0500 Subject: [PATCH 16/20] Fix cudf::hash_join output size for struct joins (#9107) Fixes #9095. This adds calls to `flatten_nested_columns` in the `cudf::hash_join` join output size APIs along with tests for joins on struct columns using `cudf::hash_join`. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/9107 --- cpp/src/join/hash_join.cu | 31 +++++--- cpp/tests/join/join_tests.cpp | 136 ++++++++++++++++++++-------------- 2 files changed, 104 insertions(+), 63 deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 50cc479fcf4..ee1eaeaed47 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -349,11 +349,15 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p CUDF_FUNC_RANGE(); CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); - auto build_table = cudf::table_device_view::create(_build, stream); - auto probe_table = cudf::table_device_view::create(probe, stream); + auto flattened_probe = structs::detail::flatten_nested_columns( + probe, {}, {}, structs::detail::column_nullability::FORCE); + auto const flattened_probe_table = std::get<0>(flattened_probe); + + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); return cudf::detail::compute_join_output_size( - *build_table, *probe_table, *_hash_table, compare_nulls, stream); + *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream); } std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, @@ -365,11 +369,15 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr // Trivial left join case - exit early if (!_hash_table) { return probe.num_rows(); } - auto build_table = cudf::table_device_view::create(_build, stream); - auto probe_table = cudf::table_device_view::create(probe, stream); + auto flattened_probe = structs::detail::flatten_nested_columns( + probe, {}, {}, structs::detail::column_nullability::FORCE); + auto const flattened_probe_table = std::get<0>(flattened_probe); + + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); return cudf::detail::compute_join_output_size( - *build_table, *probe_table, *_hash_table, compare_nulls, stream); + *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream); } std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, @@ -382,10 +390,15 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr // Trivial left join case - exit early if (!_hash_table) { return probe.num_rows(); } - auto 
build_table = cudf::table_device_view::create(_build, stream); - auto probe_table = cudf::table_device_view::create(probe, stream); + auto flattened_probe = structs::detail::flatten_nested_columns( + probe, {}, {}, structs::detail::column_nullability::FORCE); + auto const flattened_probe_table = std::get<0>(flattened_probe); + + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); - return get_full_join_size(*build_table, *probe_table, *_hash_table, compare_nulls, stream, mr); + return get_full_join_size( + *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream, mr); } template diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index e468368842a..af998e366e9 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -44,6 +44,28 @@ constexpr cudf::size_type NoneValue = std::numeric_limits::min(); // TODO: how to test if this isn't public? struct JoinTest : public cudf::test::BaseFixture { + std::pair, std::unique_ptr> gather_maps_as_tables( + cudf::column_view const& expected_left_map, + cudf::column_view const& expected_right_map, + std::pair>, + std::unique_ptr>> const& result) + { + auto result_table = + cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.first->size()), + result.first->data()}, + cudf::column_view{cudf::data_type{cudf::type_id::INT32}, + static_cast(result.second->size()), + result.second->data()}}); + auto result_sort_order = cudf::sorted_order(result_table); + auto sorted_result = cudf::gather(result_table, *result_sort_order); + + cudf::table_view gold({expected_left_map, expected_right_map}); + auto gold_sort_order = cudf::sorted_order(gold); + auto sorted_gold = cudf::gather(gold, *gold_sort_order); + + return std::make_pair(std::move(sorted_gold), std::move(sorted_result)); + } }; TEST_F(JoinTest, EmptySentinelRepro) @@ -1232,27 +1254,9 @@ TEST_F(JoinTest, HashJoinSequentialProbes) EXPECT_EQ(output_size, size_gold); auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); - auto result_table = - cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.first->size()), - result.first->data()}, - cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.second->size()), - result.second->data()}}); - auto result_sort_order = cudf::sorted_order(result_table); - auto sorted_result = cudf::gather(result_table, *result_sort_order); - column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}}; column_wrapper col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}}; - - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - - Table gold(std::move(cols_gold)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - + auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } @@ -1270,27 +1274,9 @@ TEST_F(JoinTest, HashJoinSequentialProbes) EXPECT_EQ(output_size, size_gold); auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); - auto result_table = - cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.first->size()), - 
result.first->data()}, - cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.second->size()), - result.second->data()}}); - auto result_sort_order = cudf::sorted_order(result_table); - auto sorted_result = cudf::gather(result_table, *result_sort_order); - column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; - - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); - - Table gold(std::move(cols_gold)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); - + auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } @@ -1308,27 +1294,69 @@ TEST_F(JoinTest, HashJoinSequentialProbes) EXPECT_EQ(output_size, size_gold); auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); - auto result_table = - cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.first->size()), - result.first->data()}, - cudf::column_view{cudf::data_type{cudf::type_id::INT32}, - static_cast(result.second->size()), - result.second->data()}}); - auto result_sort_order = cudf::sorted_order(result_table); - auto sorted_result = cudf::gather(result_table, *result_sort_order); - column_wrapper col_gold_0{{2, 4, 0}}; column_wrapper col_gold_1{{1, 1, 4}}; + auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + } +} - CVector cols_gold; - cols_gold.push_back(col_gold_0.release()); - cols_gold.push_back(col_gold_1.release()); +TEST_F(JoinTest, HashJoinWithStructsAndNulls) +{ + auto col0_names_col = strcol_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"}; + auto col0_ages_col = column_wrapper{{48, 27, 351, 31, 25}}; + + auto col0_is_human_col = column_wrapper{{true, true, false, false, false}, {1, 1, 0, 1, 0}}; - Table gold(std::move(cols_gold)); - auto gold_sort_order = cudf::sorted_order(gold.view()); - auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + auto col0 = + cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}}; + + auto col1_names_col = strcol_wrapper{ + "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"}; + auto col1_ages_col = column_wrapper{{48, 35, 351, 22, 25}}; + auto col1_is_human_col = column_wrapper{{true, true, false, false, true}, {1, 1, 0, 1, 1}}; + + auto col1 = + cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}}; + + CVector cols0, cols1; + cols0.push_back(col0.release()); + cols1.push_back(col1.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto hash_join = cudf::hash_join(t1, cudf::null_equality::EQUAL); + + { + auto output_size = hash_join.left_join_size(t0); + EXPECT_EQ(5, output_size); + auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, output_size); + column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; + column_wrapper col_gold_1{{0, NoneValue, 2, NoneValue, NoneValue}}; + auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + } + + { + auto output_size = 
hash_join.inner_join_size(t0); + EXPECT_EQ(2, output_size); + auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, output_size); + column_wrapper col_gold_0{{0, 2}}; + column_wrapper col_gold_1{{0, 2}}; + auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); + } + + { + auto output_size = hash_join.full_join_size(t0); + EXPECT_EQ(8, output_size); + auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, output_size); + column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; + column_wrapper col_gold_1{{1, 3, 4, 0, NoneValue, 2, NoneValue, NoneValue}}; + auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } } From 2a566dd3a9bd81277640c71ca7d350a65ca78fc3 Mon Sep 17 00:00:00 2001 From: shaneding Date: Wed, 25 Aug 2021 16:49:45 -0400 Subject: [PATCH 17/20] Implement timestamp ceil (#8942) Partly addresses #8682 This adds a `ceil` function for timestamp columns in libcudf. It is applied on fixed resolutions only. Authors: - https://github.com/shaneding Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/8942 --- cpp/include/cudf/datetime.hpp | 85 ++++++++++++++ cpp/include/cudf/wrappers/durations.hpp | 8 ++ cpp/src/datetime/datetime_ops.cu | 135 +++++++++++++++++++++++ cpp/tests/datetime/datetime_ops_test.cpp | 56 ++++++++++ 4 files changed, 284 insertions(+) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 2e4ac870969..52b21c98f75 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -237,5 +237,90 @@ std::unique_ptr extract_quarter( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group + +/** + * @brief Round up to the nearest day + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_day( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest hour + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_hour( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest minute + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_minute( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest second + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_second( + 
cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest millisecond + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_millisecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest microsecond + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round up to the nearest nanosecond + * + * @param cudf::column_view of the input datetime values + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr ceil_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace datetime } // namespace cudf diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 07bcc1976a8..8bc8b7a7e6e 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -33,6 +33,14 @@ namespace cudf { * @brief Type alias representing an int32_t duration of days. */ using duration_D = cuda::std::chrono::duration; +/** + * @brief Type alias representing an int32_t duration of hours. + */ +using duration_h = cuda::std::chrono::duration; +/** + * @brief Type alias representing an int32_t duration of minutes. + */ +using duration_m = cuda::std::chrono::duration; /** * @brief Type alias representing an int64_t duration of seconds. 
*/ diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 9879a6c5423..df013be717f 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -24,7 +24,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -41,6 +44,9 @@ enum class datetime_component { HOUR, MINUTE, SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND }; template @@ -77,6 +83,35 @@ struct extract_component_operator { } }; +template +struct ceil_timestamp { + template + CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + // want to use this with D, H, T (minute), S, L (millisecond), U + switch (COMPONENT) { + case datetime_component::DAY: + return time_point_cast(ceil(ts)); + case datetime_component::HOUR: + return time_point_cast(ceil(ts)); + case datetime_component::MINUTE: + return time_point_cast(ceil(ts)); + case datetime_component::SECOND: + return time_point_cast(ceil(ts)); + case datetime_component::MILLISECOND: + return time_point_cast(ceil(ts)); + case datetime_component::MICROSECOND: + return time_point_cast(ceil(ts)); + case datetime_component::NANOSECOND: + return time_point_cast(ceil(ts)); + default: cudf_assert(false && "Unexpected resolution"); + } + + return {}; + } +}; + // Number of days until month indexed by leap year and month (0-based index) static __device__ int16_t const days_until_month[2][13] = { {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, // For non leap years @@ -155,6 +190,45 @@ struct is_leap_year_op { } }; +// Specific function for applying ceil/floor date ops +template +struct dispatch_ceil { + template + std::enable_if_t(), std::unique_ptr> operator()( + cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto size = column.size(); + auto output_col_type = data_type{cudf::type_to_id()}; + + // Return an empty column if source column is empty + if (size == 0) return make_empty_column(output_col_type); + + auto output = make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, stream, mr), + column.null_count(), + stream, + mr); + + thrust::transform(rmm::exec_policy(stream), + column.begin(), + column.end(), + output->mutable_view().begin(), + TransformFunctor{}); + + return output; + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + Args&&...) 
+ { + CUDF_FAIL("Must be cudf::timestamp"); + } +}; + // Apply the functor for every element/row in the input column to create the output column template struct launch_functor { @@ -286,6 +360,15 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu return output; } +template +std::unique_ptr ceil_general(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return cudf::type_dispatcher( + column.type(), dispatch_ceil>{}, column, stream, mr); +} + std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -388,6 +471,58 @@ std::unique_ptr extract_quarter(column_view const& column, } // namespace detail +std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr ceil_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr ceil_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + +std::unique_ptr ceil_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::ceil_general( + column, rmm::cuda_stream_default, mr); +} + std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 39ad5f556d4..4a1c0512643 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -348,6 +348,62 @@ TEST_F(BasicDatetimeOpsTest, TestLastDayOfMonthWithDate) verbosity); } +TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) +{ + using T = TypeParam; + using namespace cudf::test; + using namespace cudf::datetime; + using namespace cuda::std::chrono; + + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + + auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + + auto host_val = to_host(input); + thrust::host_vector timestamps = host_val.first; + + thrust::host_vector ceiled_day(timestamps.size()); + thrust::transform(timestamps.begin(), timestamps.end(), ceiled_day.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_day = + fixed_width_column_wrapper(ceiled_day.begin(), ceiled_day.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_day(input), expected_day); + + thrust::host_vector ceiled_hour(timestamps.size()); + thrust::transform(timestamps.begin(), timestamps.end(), ceiled_hour.begin(), 
[](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_hour = fixed_width_column_wrapper(ceiled_hour.begin(), + ceiled_hour.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_hour(input), expected_hour); + + std::vector ceiled_minute(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_minute.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_minute = fixed_width_column_wrapper( + ceiled_minute.begin(), ceiled_minute.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_minute(input), expected_minute); + + std::vector ceiled_second(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_second.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_second = fixed_width_column_wrapper( + ceiled_second.begin(), ceiled_second.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_second(input), expected_second); + + std::vector ceiled_millisecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_millisecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_millisecond = fixed_width_column_wrapper( + ceiled_millisecond.begin(), ceiled_millisecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_millisecond(input), expected_millisecond); +} + TEST_F(BasicDatetimeOpsTest, TestDayOfYearWithDate) { using namespace cudf::test; From d29c441607d6d546c57a9a7ffcaf40247861398a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 25 Aug 2021 16:54:28 -0400 Subject: [PATCH 18/20] Fix cudf::strings::is_fixed_point checking of overflow for decimal32 (#9093) While working on `decimal128` support, @codereport found a bug in the `cudf::strings::is_fixed_point` logic where a large integer (in a strings column) could return true/valid even though it overflows the `Rep` type for `decimal32 type`. The gtest values did not include a value that would have shown this error. This PR adds the test string and fixes the logic properly check the overflow condition. The current logic was relying on storing intermediate values into `uint64_t` types so any number that would fit in `uint64_t` would not be detected as overflow for `decimal32`. This PR fixes functions to use the input type storage type more to help identify the overflow correctly and to help with specializing for `decimal128`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Conor Hoekstra (https://github.com/codereport) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9093 --- .../strings/detail/convert/fixed_point.cuh | 28 ++++++++++++------- .../strings/convert/convert_fixed_point.cu | 12 ++++---- cpp/tests/strings/fixed_point_tests.cpp | 28 ++++++++++++------- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 53774ed948d..56205c161b1 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -17,6 +17,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -24,20 +26,25 @@ namespace detail { /** * @brief Return the integer component of a decimal string. * - * This is reads everything up to the exponent 'e' notation. + * This reads everything up to the exponent 'e' notation. 
* The return includes the integer digits and any exponent offset. * + * @tparam UnsignedDecimalType The unsigned version of the desired decimal type. + * Use the `std::make_unsigned_t` to create the + * unsigned type from the storage type. + * * @param[in,out] iter Start of characters to parse * @param[in] end End of characters to parse * @return Integer component and exponent offset. */ -__device__ inline thrust::pair parse_integer(char const*& iter, - char const* iter_end, - const char decimal_pt_char = '.') +template +__device__ inline thrust::pair parse_integer( + char const*& iter, char const* iter_end, const char decimal_pt_char = '.') { // highest value where another decimal digit cannot be appended without an overflow; - // this preserves the most digits when scaling the final result - constexpr uint64_t decimal_max = (std::numeric_limits::max() - 9L) / 10L; + // this preserves the most digits when scaling the final result for this type + constexpr UnsignedDecimalType decimal_max = + (std::numeric_limits::max() - 9L) / 10L; uint64_t value = 0; // for checking overflow int32_t exp_offset = 0; @@ -56,7 +63,7 @@ __device__ inline thrust::pair parse_integer(char const*& ite if (value > decimal_max) { exp_offset += static_cast(!decimal_found); } else { - value = (value * 10) + static_cast(ch - '0'); + value = (value * 10) + static_cast(ch - '0'); exp_offset -= static_cast(decimal_found); } } @@ -130,7 +137,8 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int // if string begins with a sign, continue with next character if (sign != 0) ++iter; - auto [value, exp_offset] = parse_integer(iter, iter_end); + using UnsignedDecimalType = std::make_unsigned_t; + auto [value, exp_offset] = parse_integer(iter, iter_end); if (value == 0) { return DecimalType{0}; } // check for exponent @@ -143,9 +151,9 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int // shift the output value based on the exp_ten and the scale values if (exp_ten < scale) { - value = value / static_cast(exp10(static_cast(scale - exp_ten))); + value = value / static_cast(exp10(static_cast(scale - exp_ten))); } else { - value = value * static_cast(exp10(static_cast(exp_ten - scale))); + value = value * static_cast(exp10(static_cast(exp_ten - scale))); } return static_cast(value) * (sign == 0 ? 1 : sign); diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 2f57b38249f..6f7076422c4 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -97,7 +97,8 @@ struct string_to_decimal_check_fn { auto const iter_end = d_str.data() + d_str.size_bytes(); - auto [value, exp_offset] = parse_integer(iter, iter_end); + using UnsignedDecimalType = std::make_unsigned_t; + auto [value, exp_offset] = parse_integer(iter, iter_end); // only exponent notation is expected here if ((iter < iter_end) && (*iter != 'e' && *iter != 'E')) { return false; } @@ -112,11 +113,10 @@ struct string_to_decimal_check_fn { exp_ten += exp_offset; // finally, check for overflow based on the exp_ten and scale values - return (exp_ten < scale) - ? 
true - : value <= static_cast( - std::numeric_limits::max() / - static_cast(exp10(static_cast(exp_ten - scale)))); + return (exp_ten < scale) or + value <= static_cast( + std::numeric_limits::max() / + static_cast(exp10(static_cast(exp_ten - scale)))); } }; diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index d8b570cee8b..820bf5ec216 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -189,31 +189,39 @@ TEST_F(StringsConvertTest, IsFixedPoint) "9223372036854775807", "-9223372036854775807", "9223372036854775808", + "9223372036854775808000", "100E2147483648", }); - results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers), + results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers), cudf::data_type{cudf::type_id::DECIMAL32}); - auto const expected32 = - cudf::test::fixed_width_column_wrapper({true, true, false, false, false, false, false}); + auto const expected32 = cudf::test::fixed_width_column_wrapper( + {true, true, false, false, false, false, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32); - results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers), + results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers), cudf::data_type{cudf::type_id::DECIMAL64}); - auto const expected64 = - cudf::test::fixed_width_column_wrapper({true, true, true, true, true, false, false}); + auto const expected64 = cudf::test::fixed_width_column_wrapper( + {true, true, true, true, true, false, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64); results = cudf::strings::is_fixed_point( cudf::strings_column_view(big_numbers), cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{10}}); - auto const expected32_scaled = - cudf::test::fixed_width_column_wrapper({true, true, true, true, true, true, false}); + auto const expected32_scaled = cudf::test::fixed_width_column_wrapper( + {true, true, true, true, true, true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32_scaled); + results = cudf::strings::is_fixed_point( + cudf::strings_column_view(big_numbers), + cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{10}}); + auto const expected64_scaled_positive = + cudf::test::fixed_width_column_wrapper({true, true, true, true, true, true, true, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled_positive); + results = cudf::strings::is_fixed_point( cudf::strings_column_view(big_numbers), cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-5}}); - auto const expected64_scaled = - cudf::test::fixed_width_column_wrapper({true, true, true, false, false, false, false}); + auto const expected64_scaled = cudf::test::fixed_width_column_wrapper( + {true, true, true, false, false, false, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled); } From 40cad3868ae6902cb0fe3dcf2fea16cc5a52fab2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 26 Aug 2021 04:19:31 -0700 Subject: [PATCH 19/20] Refactor implementation of column setitem (#9110) This small PR reworks the behavior of `ColumnBase.__setitem__` when it is provided something other than a slice as input, for instance an array. This code path requires scattering the new values into the column, which previously involved converting the Column to a Frame in order to call Frame._scatter. 
Since that method was only used for this one purpose, the underlying libcudf scatter implementation has been rewritten to accept and return Columns, allowing us to inline the call and also get rid of a round trip from Column to Frame and back. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Marlene (https://github.com/marlenezw) URL: https://github.com/rapidsai/cudf/pull/9110 --- python/cudf/cudf/_lib/copying.pyx | 105 +++++++++---------------- python/cudf/cudf/core/column/column.py | 18 ++--- python/cudf/cudf/core/frame.py | 8 -- 3 files changed, 43 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ed31574b4a5..88f54632000 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -19,7 +19,7 @@ from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.table cimport Table +from cudf._lib.table cimport Table, make_table_view from cudf._lib.reduce import minmax from cudf.core.abc import Serializable @@ -192,92 +192,59 @@ def gather( ) -def _scatter_table(Table source_table, Column scatter_map, - Table target_table, bool bounds_check=True): +def scatter(object source, Column scatter_map, Column target_column, + bool bounds_check=True): + """ + Scattering input into target as per the scatter map, + input can be a list of scalars or can be a table + """ - cdef table_view source_table_view = source_table.data_view() cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = target_table.data_view() + cdef table_view target_table_view = make_table_view((target_column,)) cdef bool c_bounds_check = bounds_check - cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_table_view, - scatter_map_view, - target_table_view, - c_bounds_check - ) - ) - - data, _ = data_from_unique_ptr( - move(c_result), - column_names=target_table._column_names, - index_names=None - ) - - return data, ( - None if target_table._index is None else target_table._index.copy( - deep=False) - ) - - -def _scatter_scalar(scalars, Column scatter_map, - Table target_table, bool bounds_check=True): + # Needed for the table branch + cdef table_view source_table_view + # Needed for the scalar branch cdef vector[reference_wrapper[constscalar]] source_scalars - source_scalars.reserve(len(scalars)) - cdef bool c_bounds_check = bounds_check cdef DeviceScalar slr - for val, col in zip(scalars, target_table._columns): - slr = as_device_scalar(val, col.dtype) + + if isinstance(source, Column): + source_table_view = make_table_view(( source,)) + + with nogil: + c_result = move( + cpp_copying.scatter( + source_table_view, + scatter_map_view, + target_table_view, + c_bounds_check + ) + ) + else: + slr = as_device_scalar(source, target_column.dtype) source_scalars.push_back(reference_wrapper[constscalar]( slr.get_raw_ptr()[0])) - cdef column_view scatter_map_view = scatter_map.view() - cdef table_view target_table_view = target_table.data_view() - - cdef unique_ptr[table] c_result - with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map_view, - target_table_view, - c_bounds_check + with nogil: + c_result = move( + cpp_copying.scatter( + source_scalars, + scatter_map_view, + target_table_view, + c_bounds_check + ) ) - ) data, _ = data_from_unique_ptr( 
move(c_result), - column_names=target_table._column_names, + column_names=(None,), index_names=None ) - return data, ( - None if target_table._index is None else target_table._index.copy( - deep=False) - ) - - -def scatter(object input, object scatter_map, Table target, - bool bounds_check=True): - """ - Scattering input into target as per the scatter map, - input can be a list of scalars or can be a table - """ - - from cudf.core.column.column import as_column - - if not isinstance(scatter_map, Column): - scatter_map = as_column(scatter_map) - - if isinstance(input, Table): - return _scatter_table(input, scatter_map, target, bounds_check) - else: - return _scatter_scalar(input, scatter_map, target, bounds_check) + return next(iter(data.values())) def _reverse_column(Column source_column): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d52f63a79f5..c834efec9fb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -599,17 +599,13 @@ def __setitem__(self, key: Any, value: Any): ) else: try: - if is_scalar(value): - input = self - out = input.as_frame()._scatter(key, [value])._as_column() - else: - if not isinstance(value, Column): - value = as_column(value) - out = ( - self.as_frame() - ._scatter(key, value.as_frame()) - ._as_column() - ) + if not isinstance(key, Column): + key = as_column(key) + if not is_scalar(value) and not isinstance(value, Column): + value = as_column(value) + out = libcudf.copying.scatter( + value, key, self + )._with_type_metadata(self.dtype) except RuntimeError as e: if "out of bounds" in str(e): raise IndexError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9f743cd8c85..4f46794aa3f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -692,14 +692,6 @@ def _as_column(self): return self._data[None].copy(deep=False) - def _scatter(self, key, value): - result = self.__class__._from_data( - *libcudf.copying.scatter(value, key, self) - ) - - result._copy_type_metadata(self) - return result - def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) From 0ad36ff30fd693e9778ec7af206d5e5460d2bfa5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 26 Aug 2021 11:38:04 -0500 Subject: [PATCH 20/20] Add backend for `percentile_lookup` dispatch (#9118) This PR adds backend for `percentile_lookup` dispatch in `dask_cudf`, related dask upstream changes were done in https://github.com/dask/dask/pull/8083/ Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/9118 --- python/dask_cudf/dask_cudf/backends.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index c0204190957..299d6f7b119 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -259,9 +259,14 @@ def is_categorical_dtype_cudf(obj): try: - from dask.dataframe.dispatch import percentile_dispatch + try: + from dask.array.dispatch import percentile_lookup + except ImportError: + from dask.dataframe.dispatch import ( + percentile_dispatch as percentile_lookup, + ) - @percentile_dispatch.register((cudf.Series, cp.ndarray, cudf.Index)) + @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.Index)) def 
percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html
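
A minimal usage sketch of the path exercised by this dispatch (assuming a GPU
environment with compatible cudf, dask, and dask_cudf versions installed):
set_index computes approximate partition divisions via dask's percentile
dispatch, which now routes cudf-backed data through percentile_cudf above.

    import cudf
    import dask_cudf

    gdf = cudf.DataFrame({"x": range(100), "y": range(100)})
    ddf = dask_cudf.from_cudf(gdf, npartitions=4)

    # Computing divisions for the new index goes through the percentile
    # dispatch registered above for cudf.Series / cudf.Index / cupy arrays.
    ddf = ddf.set_index("y")
    print(ddf.divisions)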
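
For the __setitem__ refactor in PR #9110 above, a rough end-to-end sketch of
the behavior it exercises (fancy-indexing assignment on a Series dispatches to
Column.__setitem__ with a non-slice key and hence to the columnar scatter);
exact indexing semantics follow pandas and may differ slightly across cudf
versions.

    import cudf

    s = cudf.Series([10, 20, 30, 40])
    s[[0, 3]] = [100, 400]   # column value -> table-style scatter branch
    s[[1, 2]] = -1           # scalar value -> scalar scatter branch
    print(s.values_host)     # expected: [100  -1  -1 400]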
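
And for the is_fixed_point fix in PR #9093 earlier above, a conceptual sketch
in plain Python (fits_rep is a hypothetical helper, not a cudf API) of the
corrected bound check: the parsed value is validated against the maximum of the
target Rep type rather than against uint64_t, so an integer that fits in 64
bits no longer passes for DECIMAL32.

    def fits_rep(value, rep_bits, exp_ten=0, scale=0):
        # Mirrors the return expression in string_to_decimal_check_fn:
        # valid if the exponent keeps the value below the scale, or if the
        # value still fits the signed Rep max after scaling by 10**(exp_ten - scale).
        rep_max = 2 ** (rep_bits - 1) - 1
        return exp_ten < scale or value <= rep_max // 10 ** (exp_ten - scale)

    print(fits_rep(9223372036854775807, 32))  # False: too big for decimal32
    print(fits_rep(9223372036854775807, 64))  # True: still fits decimal64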