From ad5452d7eb417527ad6bd0b6a29a544466b38429 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 26 Mar 2021 10:24:00 -0400
Subject: [PATCH 01/20] Add gbenchmark for nvtext replace-tokens function
 (#7708)

Reference #5696
Creates gbenchmarks for `nvtext::replace_tokens()` function.
The benchmarks measure various string lengths and numbers of rows, using the default whitespace delimiter and 4 hardcoded tokens.

This API already uses the `make_strings_children` utility.

Authors:
  - David (@davidwendt)

Approvers:
  - Karthikeyan (@karthikeyann)
  - Nghia Truong (@ttnghia)
  - @nvdbaranec
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7708
---
 cpp/benchmarks/CMakeLists.txt             |  5 +-
 cpp/benchmarks/text/replace_benchmark.cpp | 85 +++++++++++++++++++++++
 2 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 cpp/benchmarks/text/replace_benchmark.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 7fd84b508ac..43ca6de11b4 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -177,8 +177,9 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
 ConfigureBench(TEXT_BENCH
   text/normalize_benchmark.cpp
   text/normalize_spaces_benchmark.cpp
-  text/tokenize_benchmark.cpp
-  text/subword_benchmark.cpp)
+  text/replace_benchmark.cpp
+  text/subword_benchmark.cpp
+  text/tokenize_benchmark.cpp)
 
 ###################################################################################################
 # - strings benchmark -------------------------------------------------------------------
diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace_benchmark.cpp
new file mode 100644
index 00000000000..f5428aee225
--- /dev/null
+++ b/cpp/benchmarks/text/replace_benchmark.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <nvtext/replace.hpp>
+
+class TextReplace : public cudf::benchmark {  // fixture for the nvtext::replace_tokens benchmark
+};
+
+static void BM_replace(benchmark::State& state)  // times nvtext::replace_tokens() on random rows
+{
+  auto const n_rows   = static_cast<cudf::size_type>(state.range(0));  // benchmark arg 0
+  auto const n_length = static_cast<cudf::size_type>(state.range(1));  // benchmark arg 1
+
+  std::vector<std::string> words{" ",        "one  ",    "two ",       "three ",     "four ",
+                                 "five ",    "six  ",    "sevén  ",    "eight ",     "nine ",
+                                 "ten   ",   "eleven ",  "twelve ",    "thirteen  ", "fourteen ",
+                                 "fifteen ", "sixteen ", "seventeen ", "eighteen ",  "nineteen "};
+
+  std::default_random_engine generator;  // default-constructed: same sequence on every run
+  std::uniform_int_distribution<int> tokens_dist(0, words.size() - 1);
+  std::string row;  // append random words until the row holds at least n_length bytes
+  while (static_cast<int>(row.size()) < n_length) row += words[tokens_dist(generator)];
+
+  std::uniform_int_distribution<int> position_dist(0, 16);  // byte offset where each row starts
+  // NOTE(review): a byte offset may begin a row inside the multi-byte 'é' -- confirm acceptable
+  auto elements = cudf::detail::make_counting_transform_iterator(
+    0, [&](auto idx) { return row.c_str() + position_dist(generator); });
+  cudf::test::strings_column_wrapper input(elements, elements + n_rows);
+  cudf::strings_column_view view(input);
+
+  cudf::test::strings_column_wrapper targets({"one", "two", "sevén", "zero"});
+  cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"});
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);  // presumably (state, flush L2, stream) -- confirm
+    nvtext::replace_tokens(
+      view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements));
+  }
+
+  state.SetBytesProcessed(state.iterations() * view.chars_size());  // based on input size only
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)  // used via ->Apply() below
+{
+  int const min_rows          = 1 << 12;  // 4,096
+  int const max_rows          = 1 << 24;  // 16,777,216
+  int const row_multiplier    = 8;
+  int const min_row_length    = 1 << 5;   // 32
+  int const max_row_length    = 1 << 13;  // 8,192
+  int const length_multiplier = 4;
+  generate_string_bench_args(
+    b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier);
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(name)           \
+  BENCHMARK_DEFINE_F(TextReplace, name)         \
+  (::benchmark::State & st) { BM_replace(st); } \
+  BENCHMARK_REGISTER_F(TextReplace, name)       \
+    ->Apply(generate_bench_args)                \
+    ->UseManualTime()                           \
+    ->Unit(benchmark::kMillisecond);
+
+NVTEXT_BENCHMARK_DEFINE(replace)

From bf2e96c70c9c7097ecf64ad413550be2f75374b8 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Fri, 26 Mar 2021 11:42:37 -0400
Subject: [PATCH 02/20] Add support for `unique` groupby aggregation (#7726)

Adds support for `SeriesGroupBy.unique()`. Also adds support for `DataFrameGroupBy.unique()` but that's not tested, as Pandas doesn't support it (yet?).

Resolves https://github.com/rapidsai/cudf/issues/2973

Authors:
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7726
---
 docs/cudf/source/groupby.md            |  1 +
 python/cudf/cudf/_lib/aggregation.pyx  |  7 +++---
 python/cudf/cudf/_lib/groupby.pyx      | 31 +++++++++++++++++++-------
 python/cudf/cudf/tests/test_groupby.py | 31 +++++++++++++++++++++++++-
 4 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md
index 7e96d4fe38c..5376df261e7 100644
--- a/docs/cudf/source/groupby.md
+++ b/docs/cudf/source/groupby.md
@@ -137,6 +137,7 @@ The following table summarizes the available aggregations and the types that sup
 | nunique             | ✅       | ✅       | ✅       | ✅          |      |        |
 | nth                 | ✅       | ✅       | ✅       |             |      |        |
 | collect             | ✅       | ✅       | ✅       |             | ✅   |        |
+| unique              | ✅       | ✅       | ✅       | ✅          |      |        |
 
 ## GroupBy apply
 
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 840f0c98987..7138bb49743 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -41,7 +41,7 @@ class AggregationKind(Enum):
     ALL = libcudf_aggregation.aggregation.Kind.ALL
     SUM_OF_SQUARES = libcudf_aggregation.aggregation.Kind.SUM_OF_SQUARES
     MEAN = libcudf_aggregation.aggregation.Kind.MEAN
-    VARIANCE = libcudf_aggregation.aggregation.Kind.VARIANCE
+    VAR = libcudf_aggregation.aggregation.Kind.VARIANCE
     STD = libcudf_aggregation.aggregation.Kind.STD
     MEDIAN = libcudf_aggregation.aggregation.Kind.MEDIAN
     QUANTILE = libcudf_aggregation.aggregation.Kind.QUANTILE
@@ -50,13 +50,12 @@ class AggregationKind(Enum):
     NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE
     NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT
     COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT
-    COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET
+    UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET
     PTX = libcudf_aggregation.aggregation.Kind.PTX
     CUDA = libcudf_aggregation.aggregation.Kind.CUDA
 
 
 cdef class Aggregation:
-
     def __init__(self, op, **kwargs):
         self.c_obj = move(make_aggregation(op, kwargs))
 
@@ -246,7 +245,7 @@ cdef class _AggregationFactory:
         return agg
 
     @classmethod
-    def collect_set(cls):
+    def unique(cls):
         cdef Aggregation agg = Aggregation.__new__(Aggregation)
         agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation())
         return agg
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 0f5cdc73d3b..713a2274a77 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -35,13 +35,15 @@ _GROUPBY_AGGS = {
     "median",
     "nunique",
     "nth",
-    "collect"
+    "collect",
+    "unique",
 }
 
 _CATEGORICAL_AGGS = {
     "count",
     "size",
     "nunique",
+    "unique",
 }
 
 _STRING_AGGS = {
@@ -51,13 +53,15 @@ _STRING_AGGS = {
     "min",
     "nunique",
     "nth",
-    "collect"
+    "collect",
+    "unique",
 }
 
 _LIST_AGGS = {
-    "collect"
+    "collect",
 }
 
+
 cdef class GroupBy:
     cdef unique_ptr[libcudf_groupby.groupby] c_obj
     cdef dict __dict__
@@ -145,12 +149,23 @@ cdef class GroupBy:
             vector[libcudf_groupby.aggregation_result]
         ] c_result
 
-        with nogil:
-            c_result = move(
-                self.c_obj.get()[0].aggregate(
-                    c_agg_requests
+        try:
+            with nogil:
+                c_result = move(
+                    self.c_obj.get()[0].aggregate(
+                        c_agg_requests
+                    )
                 )
-            )
+        except RuntimeError as e:
+            # TODO: remove this try..except after
+            # https://github.com/rapidsai/cudf/issues/7611
+            # is resolved
+            if ("make_empty_column") in str(e):
+                raise NotImplementedError(
+                    "Aggregation not supported for empty columns"
+                ) from e
+            else:
+                raise
 
         grouped_keys = Table.from_unique_ptr(
             move(c_result.first),
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 8011510d340..a96db59dee3 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -12,7 +12,13 @@
 import cudf
 from cudf.core import DataFrame, Series
 from cudf.core._compat import PANDAS_GE_110
-from cudf.tests.utils import assert_eq, assert_exceptions_equal
+from cudf.tests.utils import (
+    DATETIME_TYPES,
+    SIGNED_TYPES,
+    TIMEDELTA_TYPES,
+    assert_eq,
+    assert_exceptions_equal,
+)
 
 _now = np.datetime64("now")
 _tomorrow = _now + np.timedelta64(1, "D")
@@ -1532,3 +1538,26 @@ def test_groupby_nonempty_no_keys(pdf):
         lambda: gdf.groupby([]),
         compare_error_message=False,
     )
+
+
+@pytest.mark.parametrize(
+    "by,data",
+    [
+        # ([], []),  # error?
+        ([1, 1, 2, 2], [0, 0, 1, 1]),
+        ([1, 2, 3, 4], [0, 0, 0, 0]),
+        ([1, 2, 1, 2], [0, 1, 1, 1]),
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype",
+    SIGNED_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["string", "category"],
+)
+def test_groupby_unique(by, data, dtype):
+    pdf = pd.DataFrame({"by": by, "data": data})
+    pdf["data"] = pdf["data"].astype(dtype)
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.groupby("by")["data"].unique()
+    got = gdf.groupby("by")["data"].unique()
+    assert_eq(expect, got)

From b0586c4e8988b836d8bcdeddfd5d384b7011af6f Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Fri, 26 Mar 2021 14:23:06 -0500
Subject: [PATCH 03/20] Added JNI support for new is_integer (#7739)

Adds JNI bindings for improved is_integer with bounds checks

Authors:
  - Robert (Bobby) Evans (@revans2)

Approvers:
  - Jason Lowe (@jlowe)

URL: https://github.com/rapidsai/cudf/pull/7739
---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 21 ++++++-
 java/src/main/native/src/ColumnViewJni.cpp    | 17 +++++
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 63 +++++++++++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index e50a9e86ead..b29b873092d 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -288,19 +288,34 @@ public final ColumnVector isNull() {
   /**
    * Returns a Boolean vector with the same number of rows as this instance, that has
    * TRUE for any entry that is an integer, and FALSE if its not an integer. A null will be returned
-   * for null entries
+   * for null entries.
    *
    * NOTE: Integer doesn't mean a 32-bit integer. It means a number that is not a fraction.
    * i.e. If this method returns true for a value it could still result in an overflow or underflow
    * if you convert it to a Java integral type
    *
-   * @return - Boolean vector
+   * @return Boolean vector
    */
   public final ColumnVector isInteger() {
     assert type.equals(DType.STRING);
     return new ColumnVector(isInteger(getNativeView()));
   }
 
+  /**
+   * Returns a Boolean vector with the same number of rows as this instance, that has
+   * TRUE for any entry that is an integer within the bounds of intType, and FALSE if it's not.
+   * A null will be returned for null entries.
+   *
+   * @param intType the data type that should be used for bounds checking. Note that only
+   *                integer types are allowed.
+   * @return Boolean vector
+   */
+  public final ColumnVector isInteger(DType intType) {
+    assert type.equals(DType.STRING);
+    return new ColumnVector(isIntegerWithType(getNativeView(),
+        intType.getTypeId().getNativeId(), intType.getScale()));
+  }
+
   /**
    * Returns a Boolean vector with the same number of rows as this instance, that has
    * TRUE for any entry that is a float, and FALSE if its not a float. A null will be returned
@@ -2845,6 +2860,8 @@ private static native long rollingWindow(
 
   private static native long isInteger(long viewHandle);
 
+  private static native long isIntegerWithType(long viewHandle, int typeId, int typeScale);
+
   private static native long isNotNanNative(long viewHandle);
 
   private static native long isNotNullNative(long viewHandle);
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 4132016d85c..3928794b55c 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1788,6 +1788,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jo
   CATCH_STD(env, 0)
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv *env, jobject,
+                                                                         jlong handle,
+                                                                         jint j_dtype,
+                                                                         jint scale) {
+
+  JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::data_type int_dtype = cudf::jni::make_data_type(j_dtype, scale);
+    std::unique_ptr<cudf::column> result = cudf::strings::is_integer(*view, int_dtype);
+    return reinterpret_cast<jlong>(result.release());
+  }
+  CATCH_STD(env, 0)
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv *env, jobject j_object,
                                                                           jlong handle) {
 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 02fbe56431b..5a9404f5760 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3339,6 +3339,69 @@ void testNansToNulls() {
     }
   }
 
+  @Test
+  void testIsIntegerWithBounds() {
+    String[] intStrings = {"A", "nan", "Inf", "-Inf", "3.5",
+        String.valueOf(Byte.MIN_VALUE),
+        String.valueOf(Byte.MIN_VALUE + 1L),
+        String.valueOf(Byte.MIN_VALUE - 1L),
+        String.valueOf(Byte.MAX_VALUE),
+        String.valueOf(Byte.MAX_VALUE + 1L),
+        String.valueOf(Byte.MAX_VALUE - 1L),
+        String.valueOf(Short.MIN_VALUE),
+        String.valueOf(Short.MIN_VALUE + 1L),
+        String.valueOf(Short.MIN_VALUE - 1L),
+        String.valueOf(Short.MAX_VALUE),
+        String.valueOf(Short.MAX_VALUE + 1L),
+        String.valueOf(Short.MAX_VALUE - 1L),
+        String.valueOf(Integer.MIN_VALUE),
+        String.valueOf(Integer.MIN_VALUE + 1L),
+        String.valueOf(Integer.MIN_VALUE - 1L),
+        String.valueOf(Integer.MAX_VALUE),
+        String.valueOf(Integer.MAX_VALUE + 1L),
+        String.valueOf(Integer.MAX_VALUE - 1L),
+        String.valueOf(Long.MIN_VALUE),
+        String.valueOf(Long.MIN_VALUE + 1L),
+        "-9223372036854775809",
+        String.valueOf(Long.MAX_VALUE),
+        "9223372036854775808",
+        String.valueOf(Long.MAX_VALUE - 1L)};
+    try (ColumnVector intStringCV = ColumnVector.fromStrings(intStrings);
+         ColumnVector isByte = intStringCV.isInteger(DType.INT8);
+         ColumnVector expectedByte = ColumnVector.fromBoxedBooleans(
+             false, false, false, false, false,
+             true, true, false, true, false, true,
+             false, false, false, false, false, false,
+             false, false, false, false, false, false,
+             false, false, false, false, false, false);
+         ColumnVector isShort = intStringCV.isInteger(DType.INT16);
+         ColumnVector expectedShort = ColumnVector.fromBoxedBooleans(
+             false, false, false, false, false,
+             true, true, true, true, true, true,
+             true, true, false, true, false, true,
+             false, false, false, false, false, false,
+             false, false, false, false, false, false);
+         ColumnVector isInt = intStringCV.isInteger(DType.INT32);
+         ColumnVector expectedInt = ColumnVector.fromBoxedBooleans(
+             false, false, false, false, false,
+             true, true, true, true, true, true,
+             true, true, true, true, true, true,
+             true, true, false, true, false, true,
+             false, false, false, false, false, false);
+         ColumnVector isLong = intStringCV.isInteger(DType.INT64);
+         ColumnVector expectedLong = ColumnVector.fromBoxedBooleans(
+             false, false, false, false, false,
+             true, true, true, true, true, true,
+             true, true, true, true, true, true,
+             true, true, true, true, true, true,
+             true, true, false, true, false, true)) {
+      assertColumnsAreEqual(expectedByte, isByte);
+      assertColumnsAreEqual(expectedShort, isShort);
+      assertColumnsAreEqual(expectedInt, isInt);
+      assertColumnsAreEqual(expectedLong, isLong);
+    }
+  }
+
   @Test
   void testIsInteger() {
     String[] intStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "2147483647",

From add4b4535999dcc200b7fdf83298b90d0495af96 Mon Sep 17 00:00:00 2001
From: Kumar Aatish <kaatish@nvidia.com>
Date: Fri, 26 Mar 2021 22:26:02 -0400
Subject: [PATCH 04/20] Fix string length in stripe dictionary building (#7744)

In PR #7676 the length of the current string being referred to while building stripe dictionaries was always set to 0 while incrementing the dictionary character count of a StripeDictionary. This led to corrupted strings when the dictionary encoding was used as noted in issue #7741. This has been fixed in this PR.

Fixes #7741

Authors:
  - Kumar Aatish (@kaatish)

Approvers:
  - Vukasin Milovanovic (@vuule)
  - Nghia Truong (@ttnghia)

URL: https://github.com/rapidsai/cudf/pull/7744
---
 cpp/src/io/orc/dict_enc.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 5695e882a95..e69a61bde66 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -396,7 +396,10 @@ __global__ void __launch_bounds__(block_size)
     uint32_t cur     = (i + t < num_strings) ? dict_data[i + t] : 0;
     uint32_t cur_len = 0;
     bool is_dupe     = false;
-    if (i + t < num_strings) { current_string = s->stripe.leaf_column->element<string_view>(cur); }
+    if (i + t < num_strings) {
+      current_string = s->stripe.leaf_column->element<string_view>(cur);
+      cur_len        = current_string.size_bytes();
+    }
     if (i + t != 0 && i + t < num_strings) {
       uint32_t prev = dict_data[i + t - 1];
       is_dupe       = (current_string == (s->stripe.leaf_column->element<string_view>(prev)));

From 44adf97fc49e5569b83b31ad5c7f05f6b64c20bd Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 26 Mar 2021 22:48:53 -0700
Subject: [PATCH 05/20] Fix dictionary size computation in ORC writer (#7737)

Fixes #7661

Corrects the field order in `std::accumulate` that computes the string column size w.r.t encoding.

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - Kumar Aatish (@kaatish)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: https://github.com/rapidsai/cudf/pull/7737
---
 cpp/src/io/orc/writer_impl.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index cb75698fd8d..10050806552 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -321,8 +321,8 @@ void writer::impl::build_dictionaries(orc_column_view *columns,
                         string_column_cost{},
                         [&](auto cost, auto rg_idx) -> string_column_cost {
                           const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx];
-                          return {cost.dictionary + dt.dict_char_count + dt.num_dict_strings,
-                                  cost.direct + dt.string_char_count};
+                          return {cost.direct + dt.string_char_count,
+                                  cost.dictionary + dt.dict_char_count + dt.num_dict_strings};
                         });
       // Disable dictionary if it does not reduce the output size
       if (col_cost.dictionary >= col_cost.direct) {

From ccc28d55202f6f6bb14718ed9022881ef0176b6e Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Sat, 27 Mar 2021 13:56:20 +0530
Subject: [PATCH 06/20] Use stream in groupby calls (#7705)

**sort_groupby_helper::**
- [x] sorted_values()
- [x] grouped_values()
-  unique_keys()
-  sorted_keys()
- [x] num_groups()
-  num_keys()
- [x] key_sort_order()
- [x] group_offsets()
- [x] group_labels()
- [x] unsorted_keys_labels()
- [x] keys_bitmask_column()

**groupby::**
- [x] - dispatch_aggregation()

Authors:
  - Karthikeyan (@karthikeyann)

Approvers:
  - David (@davidwendt)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: https://github.com/rapidsai/cudf/pull/7705
---
 .../cudf/detail/groupby/sort_helper.hpp       | 26 ++++----
 cpp/src/groupby/groupby.cu                    |  4 +-
 cpp/src/groupby/sort/aggregate.cpp            | 60 ++++++++++---------
 cpp/src/groupby/sort/functors.hpp             |  4 +-
 cpp/src/groupby/sort/scan.cpp                 | 13 ++--
 cpp/src/groupby/sort/sort_helper.cu           | 26 ++++----
 cpp/src/rolling/grouped_rolling.cu            |  4 +-
 7 files changed, 72 insertions(+), 65 deletions(-)

diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index a68d649b8c8..bfc9673d3cb 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -93,7 +93,7 @@ struct sort_groupby_helper {
    */
   std::unique_ptr<column> sorted_values(
     column_view const& values,
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+    rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
@@ -108,7 +108,7 @@ struct sort_groupby_helper {
    */
   std::unique_ptr<column> grouped_values(
     column_view const& values,
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+    rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
@@ -117,7 +117,7 @@ struct sort_groupby_helper {
    * @return a new table in which each row is a unique row in the sorted key table.
    */
   std::unique_ptr<table> unique_keys(
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+    rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
@@ -126,13 +126,13 @@ struct sort_groupby_helper {
    * @return a new table containing the sorted keys.
    */
   std::unique_ptr<table> sorted_keys(
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+    rmm::cuda_stream_view stream,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Get the number of groups in `keys`
    */
-  size_type num_groups() { return group_offsets().size() - 1; }
+  size_type num_groups(rmm::cuda_stream_view stream) { return group_offsets(stream).size() - 1; }
 
   /**
    * @brief Return the effective number of keys
@@ -141,7 +141,7 @@ struct sort_groupby_helper {
    * When include_null_keys = NO, returned value is the number of rows in `keys`
    *  in which no element is null
    */
-  size_type num_keys(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  size_type num_keys(rmm::cuda_stream_view stream);
 
   /**
    * @brief Get the sorted order of `keys`.
@@ -156,7 +156,7 @@ struct sort_groupby_helper {
    *
    * @return the sort order indices for `keys`.
    */
-  column_view key_sort_order(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  column_view key_sort_order(rmm::cuda_stream_view stream);
 
   /**
    * @brief Get each group's offset into the sorted order of `keys`.
@@ -169,13 +169,13 @@ struct sort_groupby_helper {
    * @return vector of offsets of the starting point of each group in the sorted
    * key table
    */
-  index_vector const& group_offsets(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  index_vector const& group_offsets(rmm::cuda_stream_view stream);
 
   /**
    * @brief Get the group labels corresponding to the sorted order of `keys`.
    *
    * Each group is assigned a unique numerical "label" in
-   * `[0, 1, 2, ... , num_groups() - 1, num_groups())`.
+   * `[0, 1, 2, ... , num_groups(stream) - 1, num_groups(stream))`.
    * For a row in sorted `keys`, its corresponding group label indicates which
    * group it belongs to.
    *
@@ -184,7 +184,7 @@ struct sort_groupby_helper {
    *
    * @return vector of group labels for each row in the sorted key column
    */
-  index_vector const& group_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  index_vector const& group_labels(rmm::cuda_stream_view stream);
 
  private:
   /**
@@ -192,7 +192,7 @@ struct sort_groupby_helper {
    *
    * Returns the group label for every row in the original `keys` table. For a
    * given unique key row, its group label is equivalent to what is returned by
-   * `group_labels()`. However, if a row contains a null value, and
+   * `group_labels(stream)`. However, if a row contains a null value, and
    * `include_null_keys == NO`, then its label is NULL.
    *
    * Computes and stores unsorted labels on first invocation and returns stored
@@ -201,7 +201,7 @@ struct sort_groupby_helper {
    * @return A nullable column of `INT32` containing group labels in the order
    *         of the unsorted key table
    */
-  column_view unsorted_keys_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  column_view unsorted_keys_labels(rmm::cuda_stream_view stream);
 
   /**
    * @brief Get the column representing the row bitmask for the `keys`
@@ -215,7 +215,7 @@ struct sort_groupby_helper {
    * Computes and stores bitmask on first invocation and returns stored column
    * on subsequent calls.
    */
-  column_view keys_bitmask_column(rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+  column_view keys_bitmask_column(rmm::cuda_stream_view stream);
 
  private:
   column_ptr _key_sorted_order;      ///< Indices to produce _keys in sorted order
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 0312d17a37c..34c57996af3 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -156,7 +156,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
 
   if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); }
 
-  return dispatch_aggregation(requests, 0, mr);
+  return dispatch_aggregation(requests, rmm::cuda_stream_default, mr);
 }
 
 // Compute scan requests
@@ -190,7 +190,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re
 
   if (values.num_columns()) {
     auto grouped_values = cudf::detail::gather(values,
-                                               helper().key_sort_order(),
+                                               helper().key_sort_order(rmm::cuda_stream_default),
                                                cudf::out_of_bounds_policy::DONT_CHECK,
                                                cudf::detail::negative_index_policy::NOT_ALLOWED,
                                                rmm::cuda_stream_default,
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 86e2837967e..4e2303c8b9b 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -70,8 +70,9 @@ void aggregrate_result_functor::operator()<aggregation::COUNT_VALID>(aggregation
     agg,
     get_grouped_values().nullable()
       ? detail::group_count_valid(
-          get_grouped_values(), helper.group_labels(), helper.num_groups(), stream, mr)
-      : detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr));
+          get_grouped_values(), helper.group_labels(stream), helper.num_groups(stream), stream, mr)
+      : detail::group_count_all(
+          helper.group_offsets(stream), helper.num_groups(stream), stream, mr));
 }
 
 template <>
@@ -80,7 +81,9 @@ void aggregrate_result_functor::operator()<aggregation::COUNT_ALL>(aggregation c
   if (cache.has_result(col_idx, agg)) return;
 
   cache.add_result(
-    col_idx, agg, detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr));
+    col_idx,
+    agg,
+    detail::group_count_all(helper.group_offsets(stream), helper.num_groups(stream), stream, mr));
 }
 
 template <>
@@ -88,10 +91,11 @@ void aggregrate_result_functor::operator()<aggregation::SUM>(aggregation const&
 {
   if (cache.has_result(col_idx, agg)) return;
 
-  cache.add_result(col_idx,
-                   agg,
-                   detail::group_sum(
-                     get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+  cache.add_result(
+    col_idx,
+    agg,
+    detail::group_sum(
+      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
 };
 
 template <>
@@ -102,9 +106,9 @@ void aggregrate_result_functor::operator()<aggregation::ARGMAX>(aggregation cons
   cache.add_result(col_idx,
                    agg,
                    detail::group_argmax(get_grouped_values(),
-                                        helper.num_groups(),
-                                        helper.group_labels(),
-                                        helper.key_sort_order(),
+                                        helper.num_groups(stream),
+                                        helper.group_labels(stream),
+                                        helper.key_sort_order(stream),
                                         stream,
                                         mr));
 };
@@ -117,9 +121,9 @@ void aggregrate_result_functor::operator()<aggregation::ARGMIN>(aggregation cons
   cache.add_result(col_idx,
                    agg,
                    detail::group_argmin(get_grouped_values(),
-                                        helper.num_groups(),
-                                        helper.group_labels(),
-                                        helper.key_sort_order(),
+                                        helper.num_groups(stream),
+                                        helper.group_labels(stream),
+                                        helper.key_sort_order(stream),
                                         stream,
                                         mr));
 };
@@ -132,7 +136,7 @@ void aggregrate_result_functor::operator()<aggregation::MIN>(aggregation const&
   auto result = [&]() {
     if (cudf::is_fixed_width(values.type())) {
       return detail::group_min(
-        get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr);
+        get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr);
     } else {
       auto argmin_agg = make_argmin_aggregation();
       operator()<aggregation::ARGMIN>(*argmin_agg);
@@ -169,7 +173,7 @@ void aggregrate_result_functor::operator()<aggregation::MAX>(aggregation const&
   auto result = [&]() {
     if (cudf::is_fixed_width(values.type())) {
       return detail::group_max(
-        get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr);
+        get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr);
     } else {
       auto argmax_agg = make_argmax_aggregation();
       operator()<aggregation::ARGMAX>(*argmax_agg);
@@ -238,7 +242,7 @@ void aggregrate_result_functor::operator()<aggregation::VARIANCE>(aggregation co
   auto result = detail::group_var(get_grouped_values(),
                                   mean_result,
                                   group_sizes,
-                                  helper.group_labels(),
+                                  helper.group_labels(stream),
                                   var_agg._ddof,
                                   stream,
                                   mr);
@@ -271,8 +275,8 @@ void aggregrate_result_functor::operator()<aggregation::QUANTILE>(aggregation co
 
   auto result = detail::group_quantiles(get_sorted_values(),
                                         group_sizes,
-                                        helper.group_offsets(),
-                                        helper.num_groups(),
+                                        helper.group_offsets(stream),
+                                        helper.num_groups(stream),
                                         quantile_agg._quantiles,
                                         quantile_agg._interpolation,
                                         stream,
@@ -291,8 +295,8 @@ void aggregrate_result_functor::operator()<aggregation::MEDIAN>(aggregation cons
 
   auto result = detail::group_quantiles(get_sorted_values(),
                                         group_sizes,
-                                        helper.group_offsets(),
-                                        helper.num_groups(),
+                                        helper.group_offsets(stream),
+                                        helper.num_groups(stream),
                                         {0.5},
                                         interpolation::LINEAR,
                                         stream,
@@ -308,9 +312,9 @@ void aggregrate_result_functor::operator()<aggregation::NUNIQUE>(aggregation con
   auto nunique_agg = static_cast<cudf::detail::nunique_aggregation const&>(agg);
 
   auto result = detail::group_nunique(get_sorted_values(),
-                                      helper.group_labels(),
-                                      helper.num_groups(),
-                                      helper.group_offsets(),
+                                      helper.group_labels(stream),
+                                      helper.num_groups(stream),
+                                      helper.group_offsets(stream),
                                       nunique_agg._null_handling,
                                       stream,
                                       mr);
@@ -337,9 +341,9 @@ void aggregrate_result_functor::operator()<aggregation::NTH_ELEMENT>(aggregation
                    agg,
                    detail::group_nth_element(get_grouped_values(),
                                              group_sizes,
-                                             helper.group_labels(),
-                                             helper.group_offsets(),
-                                             helper.num_groups(),
+                                             helper.group_labels(stream),
+                                             helper.group_offsets(stream),
+                                             helper.num_groups(stream),
                                              nth_element_agg._n,
                                              nth_element_agg._null_handling,
                                              stream,
@@ -357,7 +361,7 @@ void aggregrate_result_functor::operator()<aggregation::COLLECT_LIST>(aggregatio
   if (cache.has_result(col_idx, agg)) return;
 
   auto result = detail::group_collect(
-    get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr);
+    get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr);
 
   cache.add_result(col_idx, agg, std::move(result));
 };
@@ -373,7 +377,7 @@ void aggregrate_result_functor::operator()<aggregation::COLLECT_SET>(aggregation
   if (cache.has_result(col_idx, agg)) { return; }
 
   auto const collect_result = detail::group_collect(
-    get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr);
+    get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr);
   auto const nulls_equal =
     static_cast<cudf::detail::collect_set_aggregation const&>(agg)._null_equal;
   cache.add_result(col_idx,
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
index 565320fbe80..afb92f8e141 100644
--- a/cpp/src/groupby/sort/functors.hpp
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -64,7 +64,7 @@ struct store_result_functor {
       // It's overridden in scan implementation.
       return sorted_values->view();
     else
-      return (grouped_values = helper.grouped_values(values))->view();
+      return (grouped_values = helper.grouped_values(values, stream))->view();
   };
 
   /**
@@ -76,7 +76,7 @@ struct store_result_functor {
   column_view get_sorted_values()
   {
     return sorted_values ? sorted_values->view()
-                         : (sorted_values = helper.sorted_values(values))->view();
+                         : (sorted_values = helper.sorted_values(values, stream))->view();
   };
 
  protected:
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index 3d7ccf18242..336a6777ffa 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -59,7 +59,7 @@ struct scan_result_functor final : store_result_functor {
     if (grouped_values)
       return grouped_values->view();
     else
-      return (grouped_values = helper.grouped_values(values))->view();
+      return (grouped_values = helper.grouped_values(values, stream))->view();
   };
 };
 
@@ -71,7 +71,8 @@ void scan_result_functor::operator()<aggregation::SUM>(aggregation const& agg)
   cache.add_result(
     col_idx,
     agg,
-    detail::sum_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+    detail::sum_scan(
+      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
 }
 
 template <>
@@ -82,7 +83,8 @@ void scan_result_functor::operator()<aggregation::MIN>(aggregation const& agg)
   cache.add_result(
     col_idx,
     agg,
-    detail::min_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+    detail::min_scan(
+      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
 }
 
 template <>
@@ -93,7 +95,8 @@ void scan_result_functor::operator()<aggregation::MAX>(aggregation const& agg)
   cache.add_result(
     col_idx,
     agg,
-    detail::max_scan(get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr));
+    detail::max_scan(
+      get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr));
 }
 
 template <>
@@ -101,7 +104,7 @@ void scan_result_functor::operator()<aggregation::COUNT_ALL>(aggregation const&
 {
   if (cache.has_result(col_idx, agg)) return;
 
-  cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(), stream, mr));
+  cache.add_result(col_idx, agg, detail::count_scan(helper.group_labels(stream), stream, mr));
 }
 }  // namespace detail
 
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
index 6a9da36e21b..5e944f75712 100644
--- a/cpp/src/groupby/sort/sort_helper.cu
+++ b/cpp/src/groupby/sort/sort_helper.cu
@@ -141,7 +141,7 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream)
     // presence of a null value within a row. This allows moving all rows that
     // contain a null value to the end of the sorted order.
 
-    auto augmented_keys = table_view({table_view({keys_bitmask_column()}), _keys});
+    auto augmented_keys = table_view({table_view({keys_bitmask_column(stream)}), _keys});
 
     _key_sorted_order = cudf::detail::stable_sorted_order(
       augmented_keys,
@@ -164,7 +164,7 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(
   _group_offsets = std::make_unique<index_vector>(num_keys(stream) + 1, stream);
 
   auto device_input_table = table_device_view::create(_keys, stream);
-  auto sorted_order       = key_sort_order().data<size_type>();
+  auto sorted_order       = key_sort_order(stream).data<size_type>();
   decltype(_group_offsets->begin()) result_end;
 
   if (has_nulls(_keys)) {
@@ -207,9 +207,9 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(
                              group_labels.end(),
                              index_vector::value_type{0});
   thrust::scatter(rmm::exec_policy(stream),
-                  thrust::make_constant_iterator(1, decltype(num_groups())(1)),
-                  thrust::make_constant_iterator(1, num_groups()),
-                  group_offsets().begin() + 1,
+                  thrust::make_constant_iterator(1, decltype(num_groups(stream))(1)),
+                  thrust::make_constant_iterator(1, num_groups(stream)),
+                  group_offsets(stream).begin() + 1,
                   group_labels.begin());
 
   thrust::inclusive_scan(
@@ -226,9 +226,9 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre
     data_type(type_to_id<size_type>()), _keys.num_rows(), mask_state::ALL_NULL, stream);
 
   auto group_labels_view = cudf::column_view(
-    data_type(type_to_id<size_type>()), group_labels().size(), group_labels().data());
+    data_type(type_to_id<size_type>()), group_labels(stream).size(), group_labels(stream).data());
 
-  auto scatter_map = key_sort_order();
+  auto scatter_map = key_sort_order(stream);
 
   std::unique_ptr<table> t_unsorted_keys_labels =
     cudf::detail::scatter(table_view({group_labels_view}),
@@ -267,7 +267,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values(
   column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
 {
   column_ptr values_sort_order =
-    cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(), values}),
+    cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(stream), values}),
                                       {},
                                       std::vector<null_order>(2, null_order::AFTER),
                                       stream,
@@ -289,7 +289,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values(
 sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values(
   column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
 {
-  auto gather_map = key_sort_order();
+  auto gather_map = key_sort_order(stream);
 
   auto grouped_values_table = cudf::detail::gather(table_view({values}),
                                                    gather_map,
@@ -304,14 +304,14 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values(
 std::unique_ptr<table> sort_groupby_helper::unique_keys(rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
-  auto idx_data = key_sort_order().data<size_type>();
+  auto idx_data = key_sort_order(stream).data<size_type>();
 
   auto gather_map_it = thrust::make_transform_iterator(
-    group_offsets().begin(), [idx_data] __device__(size_type i) { return idx_data[i]; });
+    group_offsets(stream).begin(), [idx_data] __device__(size_type i) { return idx_data[i]; });
 
   return cudf::detail::gather(_keys,
                               gather_map_it,
-                              gather_map_it + num_groups(),
+                              gather_map_it + num_groups(stream),
                               out_of_bounds_policy::DONT_CHECK,
                               stream,
                               mr);
@@ -321,7 +321,7 @@ std::unique_ptr<table> sort_groupby_helper::sorted_keys(rmm::cuda_stream_view st
                                                         rmm::mr::device_memory_resource* mr)
 {
   return cudf::detail::gather(_keys,
-                              key_sort_order(),
+                              key_sort_order(stream),
                               cudf::out_of_bounds_policy::DONT_CHECK,
                               cudf::detail::negative_index_policy::NOT_ALLOWED,
                               stream,
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index b8cb5e45fec..34d6d5fa194 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -838,8 +838,8 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
   index_vector group_offsets(0, stream), group_labels(0, stream);
   if (group_keys.num_columns() > 0) {
     sort_groupby_helper helper{group_keys, cudf::null_policy::INCLUDE, cudf::sorted::YES};
-    group_offsets = index_vector(helper.group_offsets(), stream);
-    group_labels  = index_vector(helper.group_labels(), stream);
+    group_offsets = index_vector(helper.group_offsets(stream), stream);
+    group_labels  = index_vector(helper.group_labels(stream), stream);
   }
 
   // Assumes that `timestamp_column` is actually of a timestamp type.

From fe7ec857c01a410521cffbb215527742510c642c Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Mon, 29 Mar 2021 05:35:28 -0400
Subject: [PATCH 07/20] Fix `cudf::cast` overflow for `decimal64` to `int32_t`
 or smaller in certain cases (#7733)

@galipremsagar found an issue with `cudf::cast` for `decimal64`. His test case had a value that is not representable in `int32_t`. The cast operation would cast too early and therefore overflow. This PR fixes that issue.

Resolves https://github.com/rapidsai/cudf/issues/7689

Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - Mike Wilson (@hyperbolic2346)
  - Ram (Ramakrishna Prabhu) (@rgsl888prabhu)

URL: https://github.com/rapidsai/cudf/pull/7733
---
 cpp/include/cudf/fixed_point/fixed_point.hpp | 51 ++++++++++++++++----
 cpp/tests/unary/cast_tests.cpp               | 27 +++++++++++
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index eb752a8a0ea..952075b1703 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -218,14 +218,15 @@ class fixed_point {
   using rep = Rep;
 
   /**
-   * @brief Constructor that will perform shifting to store value appropriately
+   * @brief Constructor that will perform shifting to store value appropriately (from floating point
+   * types)
    *
-   * @tparam T The type that you are constructing from (integral or floating)
+   * @tparam T The floating point type that you are constructing from
    * @param value The value that will be constructed from
    * @param scale The exponent that is applied to Rad to perform shifting
    */
   template <typename T,
-            typename cuda::std::enable_if_t<is_supported_construction_value_type<T>() &&
+            typename cuda::std::enable_if_t<cuda::std::is_floating_point<T>() &&
                                             is_supported_representation_type<Rep>()>* = nullptr>
   CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale)
     : _value{static_cast<Rep>(detail::shift<Rep, Rad>(value, scale))}, _scale{scale}
@@ -233,8 +234,25 @@ class fixed_point {
   }
 
   /**
-   * @brief Constructor that will not perform shifting (assumes value already
-   * shifted)
+   * @brief Constructor that will perform shifting to store value appropriately (from integral
+   * types)
+   *
+   * @tparam T The integral type that you are constructing from
+   * @param value The value that will be constructed from
+   * @param scale The exponent that is applied to Rad to perform shifting
+   */
+  template <typename T,
+            typename cuda::std::enable_if_t<cuda::std::is_integral<T>() &&
+                                            is_supported_representation_type<Rep>()>* = nullptr>
+  CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale)
+    // `value` is cast to `Rep` to avoid overflow in cases where
+    // constructing to `Rep` that is wider than `T`
+    : _value{detail::shift<Rep, Rad>(static_cast<Rep>(value), scale)}, _scale{scale}
+  {
+  }
+
+  /**
+   * @brief Constructor that will not perform shifting (assumes value already shifted)
    *
    * @param s scaled_integer that contains scale and already shifted value
    */
@@ -260,18 +278,33 @@ class fixed_point {
   fixed_point() : _value{0}, _scale{scale_type{0}} {}
 
   /**
-   * @brief Explicit conversion operator
+   * @brief Explicit conversion operator for casting to floating point types
    *
-   * @tparam U The type that is being explicitly converted to (integral or floating)
+   * @tparam U The floating point type that is being explicitly converted to
    * @return The `fixed_point` number in base 10 (aka human readable format)
    */
   template <typename U,
-            typename cuda::std::enable_if_t<is_supported_construction_value_type<U>()>* = nullptr>
-  CUDA_HOST_DEVICE_CALLABLE explicit constexpr operator U() const
+            typename cuda::std::enable_if_t<cuda::std::is_floating_point<U>::value>* = nullptr>
+  explicit constexpr operator U() const
   {
     return detail::shift<Rep, Rad>(static_cast<U>(_value), detail::negate(_scale));
   }
 
+  /**
+   * @brief Explicit conversion operator for casting to integral types
+   *
+   * @tparam U The integral type that is being explicitly converted to
+   * @return The `fixed_point` number in base 10 (aka human readable format)
+   */
+  template <typename U,
+            typename cuda::std::enable_if_t<cuda::std::is_integral<U>::value>* = nullptr>
+  explicit constexpr operator U() const
+  {
+    // Don't cast to U until converting to Rep because in certain cases casting to U before shifting
+    // will result in integer overflow (i.e. if U = int32_t, Rep = int64_t and _value > 2 billion)
+    return static_cast<U>(detail::shift<Rep, Rad>(_value, detail::negate(_scale)));
+  }
+
   CUDA_HOST_DEVICE_CALLABLE operator scaled_integer<Rep>() const
   {
     return scaled_integer<Rep>{_value, _scale};
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index e8953ab9a30..15d014f9d9c 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -537,6 +537,9 @@ inline auto make_fixed_point_data_type(int32_t scale)
   return cudf::data_type{cudf::type_to_id<T>(), scale};
 }
 
+struct FixedPointTestSingleType : public cudf::test::BaseFixture {
+};
+
 template <typename T>
 struct FixedPointTests : public cudf::test::BaseFixture {
 };
@@ -592,6 +595,18 @@ TYPED_TEST(FixedPointTests, CastToInt32)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TEST_F(FixedPointTestSingleType, CastDecimal64ToInt32)
+{
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
+  using fw_wrapper = cudf::test::fixed_width_column_wrapper<int32_t>;
+
+  auto const input    = fp_wrapper{{7246212000}, numeric::scale_type{-5}};
+  auto const expected = fw_wrapper{72462};
+  auto const result   = cudf::cast(input, make_data_type<int32_t>());
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointTests, CastToIntLarge)
 {
   using namespace numeric;
@@ -659,6 +674,18 @@ TYPED_TEST(FixedPointTests, CastFromInt)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TEST_F(FixedPointTestSingleType, CastInt32ToDecimal64)
+{
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<int64_t>;
+  using fw_wrapper = cudf::test::fixed_width_column_wrapper<int32_t>;
+
+  auto const input    = fw_wrapper{-48938};
+  auto const expected = fp_wrapper{{-4893800000LL}, numeric::scale_type{-5}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<numeric::decimal64>(-5));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointTests, CastFromIntLarge)
 {
   using namespace numeric;

From d9103c4b7998610abc05aa9d85a5a89f3b347251 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 29 Mar 2021 11:50:06 -0400
Subject: [PATCH 08/20] Add gbenchmark for nvtext ngrams functions (#7693)

Reference #5696
Creates a gbenchmark for `nvtext::generate_ngrams()` and `nvtext::generate_character_ngrams()` functions.
The benchmarks measure various string lengths and numbers of rows.
`nvtext::generate_ngrams()` was refactored to use the more efficient `make_strings_children` utility, which improved its performance by about 50%.

Authors:
  - David (@davidwendt)

Approvers:
  - Nghia Truong (@ttnghia)
  - Mark Harris (@harrism)

URL: https://github.com/rapidsai/cudf/pull/7693
---
 cpp/benchmarks/CMakeLists.txt            |  1 +
 cpp/benchmarks/text/ngrams_benchmark.cpp | 76 ++++++++++++++++++++++++
 cpp/src/text/generate_ngrams.cu          | 37 ++++--------
 3 files changed, 87 insertions(+), 27 deletions(-)
 create mode 100644 cpp/benchmarks/text/ngrams_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 43ca6de11b4..5aa7e0132f8 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -175,6 +175,7 @@ ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu)
 ###################################################################################################
 # - nvtext benchmark -------------------------------------------------------------------
 ConfigureBench(TEXT_BENCH
+  text/ngrams_benchmark.cpp
   text/normalize_benchmark.cpp
   text/normalize_spaces_benchmark.cpp
   text/replace_benchmark.cpp
diff --git a/cpp/benchmarks/text/ngrams_benchmark.cpp b/cpp/benchmarks/text/ngrams_benchmark.cpp
new file mode 100644
index 00000000000..1fe8e3b7f2e
--- /dev/null
+++ b/cpp/benchmarks/text/ngrams_benchmark.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/string/string_bench_args.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf_test/base_fixture.hpp>
+
+#include <nvtext/generate_ngrams.hpp>
+
+class TextNGrams : public cudf::benchmark {
+};
+
+enum class ngrams_type { tokens, characters };
+
+static void BM_ngrams(benchmark::State& state, ngrams_type nt)
+{
+  auto const n_rows         = static_cast<cudf::size_type>(state.range(0));
+  auto const max_str_length = static_cast<cudf::size_type>(state.range(1));
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (nt) {
+      case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
+      case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 5;
+  int const max_rowlen = 40;
+  int const len_mult   = 2;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define NVTEXT_BENCHMARK_DEFINE(name)                             \
+  BENCHMARK_DEFINE_F(TextNGrams, name)                            \
+  (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \
+  BENCHMARK_REGISTER_F(TextNGrams, name)                          \
+    ->Apply(generate_bench_args)                                  \
+    ->UseManualTime()                                             \
+    ->Unit(benchmark::kMillisecond);
+
+NVTEXT_BENCHMARK_DEFINE(tokens)
+NVTEXT_BENCHMARK_DEFINE(characters)
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 3c583622ed8..4a41dacbd30 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -50,7 +50,7 @@ struct ngram_generator_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
   cudf::string_view const d_separator;
-  int32_t const* d_offsets{};
+  int32_t* d_offsets{};
   char* d_chars{};
 
   /**
@@ -62,7 +62,7 @@ struct ngram_generator_fn {
    * @param idx Index of the kernel thread.
    * @return Number of bytes required for the string for this thread.
    */
-  __device__ cudf::size_type operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx)
   {
     char* out_ptr         = d_chars ? d_chars + d_offsets[idx] : nullptr;
     cudf::size_type bytes = 0;
@@ -74,7 +74,7 @@ struct ngram_generator_fn {
       bytes += d_separator.size_bytes();
       if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
 
@@ -109,11 +109,11 @@ std::unique_ptr<cudf::column> generate_ngrams(
                              if (d_strings.is_null(idx)) return false;
                              return !d_strings.element<cudf::string_view>(idx).empty();
                            },
-                           stream,
-                           mr)
+                           stream)
                            ->release();
     strings_count = table_offsets.front()->size() - 1;
-    return std::move(table_offsets.front());
+    auto result   = std::move(table_offsets.front());
+    return result;
   }();  // this allows freeing the temporary table_offsets
 
   CUDF_EXPECTS(strings_count >= ngrams, "Insufficient number of strings to generate ngrams");
@@ -131,30 +131,13 @@ std::unique_ptr<cudf::column> generate_ngrams(
   // compute the number of strings of ngrams
   auto const ngrams_count = strings_count - ngrams + 1;
 
-  // build output offsets by computing the output bytes for each generated ngram
-  auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(
-    0, ngram_generator_fn{d_strings, ngrams, d_separator});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + ngrams_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
-
-  // build the chars column
-  // generate the ngrams from the input strings and copy them into the chars data buffer
-  cudf::size_type const total_bytes = thrust::device_pointer_cast(d_offsets)[ngrams_count];
-  auto chars_column =
-    cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, stream, mr);
-  char* const d_chars = chars_column->mutable_view().data<char>();
-
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<cudf::size_type>(0),
-                     ngrams_count,
-                     ngram_generator_fn{d_strings, ngrams, d_separator, d_offsets, d_chars});
-  chars_column->set_null_count(0);
+  auto children = cudf::strings::detail::make_strings_children(
+    ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, 0, stream, mr);
 
   // make the output strings column from the offsets and chars column
   return cudf::make_strings_column(ngrams_count,
-                                   std::move(offsets_column),
-                                   std::move(chars_column),
+                                   std::move(children.first),
+                                   std::move(children.second),
                                    0,
                                    rmm::device_buffer{0, stream, mr},
                                    stream,

From 54dfaaa9e99a15e6e8f76106adba842f424fb160 Mon Sep 17 00:00:00 2001
From: Paul Taylor <paul.e.taylor@me.com>
Date: Mon, 29 Mar 2021 12:13:04 -0500
Subject: [PATCH 09/20] Create and promote library aliases in libcudf
 installations (#7734)

This PR ensures all `cudf::*` library aliases are created and promoted to `IMPORTED_GLOBAL` when `find_package(cudf)` finds cudf in a local build directory.

~This PR shouldn't affect CI or the targets you'd see when `libcudf` is installed (e.g. by conda), only local source builds.~

edit: This now fixes `cudf::*` alias targets for the `libcudf` installations too, needed by https://github.com/rapidsai/cuspatial/pull/365.

Validation method:
```shell
$ docker run --rm -it \
  -w /tmp/findpackagecudf \
  -v "/tmp/findpackagecudf:/tmp/findpackagecudf" \
  gpuci/miniconda-cuda:10.2-devel-ubuntu18.04 bash

# Set up mamba environment
conda install -y -n base -c conda-forge mamba
mamba update -y -n base -c defaults conda && mamba update -y -n base -c conda-forge mamba
mamba install -y -n base -c conda-forge -c rapidsai-nightly \
    git gtest gmock ninja cmake=3.18 gdal=3.0.2 boost-cpp=1.72.0 cudatoolkit=10.2 libcudf=0.19

# Copy changes in this PR (from the host) to container's /opt/conda/lib/cmake/cudf
# cmake --install $CUDF_ROOT --prefix $CUDF_ROOT/local-install
# docker cp $CUDF_ROOT/local-install/lib/cmake/cudf frosty_agnesi:/opt/conda/lib/cmake/

# Clone cuspatial
git clone https://github.com/trxcllnt/cuspatial.git && cd cuspatial && git checkout fix/cmake-exports

# Configure cuspatial
rm -rf cpp/build && mkdir -p cpp/build \
 && cmake -GNinja -B cpp/build -S cpp \
    -DBUILD_TESTS=ON -DBUILD_BENCHMARKS=ON -DCMAKE_CUDA_ARCHITECTURES=

```

Authors:
  - Paul Taylor (@trxcllnt)
  - Robert Maynard (@robertmaynard)

Approvers:
  - Robert Maynard (@robertmaynard)
  - Keith Kraus (@kkraus14)
  - Ray Douglass (@raydouglass)

URL: https://github.com/rapidsai/cudf/pull/7734
---
 conda/recipes/libcudf/meta.yaml               |  2 +-
 cpp/CMakeLists.txt                            | 15 ++++---
 cpp/cmake/cudf-build-config.cmake.in          | 44 ++++++++++++++++---
 cpp/cmake/cudf-config.cmake.in                | 28 ++++++------
 cpp/cmake/thirdparty/CUDF_GetGTest.cmake      | 10 +----
 cpp/cmake/thirdparty/CUDF_GetRMM.cmake        |  5 ---
 .../cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake |  6 ---
 cpp/libcudf_kafka/tests/CMakeLists.txt        |  6 +--
 8 files changed, 62 insertions(+), 54 deletions(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 1be8a6b450a..39587b4bd05 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -33,7 +33,7 @@ build:
 
 requirements:
   build:
-    - cmake >=3.17.0
+    - cmake >=3.18
   host:
     - librmm {{ minor_version }}.*
     - cudatoolkit {{ cuda_version }}.*
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fc439ebfa7f..48562476070 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -554,12 +554,6 @@ if(CUDF_BUILD_BENCHMARKS)
         GIT_SHALLOW     TRUE
         OPTIONS         "BENCHMARK_ENABLE_TESTING OFF"
                         "BENCHMARK_ENABLE_INSTALL OFF")
-    if(benchmark_ADDED)
-        install(TARGETS benchmark
-                        benchmark_main
-            DESTINATION lib
-            EXPORT cudf-targets)
-    endif()
     add_subdirectory(benchmarks)
 endif()
 
@@ -636,6 +630,15 @@ elseif(TARGET arrow_static)
     endif()
 endif()
 
+if(TARGET gtest)
+    get_target_property(gtest_is_imported gtest IMPORTED)
+    if(NOT gtest_is_imported)
+        export(TARGETS gtest gmock gtest_main gmock_main
+            FILE ${CUDF_BINARY_DIR}/cudf-gtesting-targets.cmake
+            NAMESPACE   GTest::)
+    endif()
+endif()
+
 export(EXPORT cudf-targets
     FILE ${CUDF_BINARY_DIR}/cudf-targets.cmake
     NAMESPACE   cudf::)
diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in
index d0c5a608e45..ed1926f20f0 100644
--- a/cpp/cmake/cudf-build-config.cmake.in
+++ b/cpp/cmake/cudf-build-config.cmake.in
@@ -2,6 +2,22 @@
 
 cmake_minimum_required(VERSION 3.18)
 
+set(_possible_targets_to_promote
+      cudf::cudf
+      GTest::gmock
+      GTest::gmock_main
+      GTest::gtest
+      GTest::gtest_main
+      cudf::cudftestutil
+      rmm::rmm
+      arrow_shared
+      arrow_cuda_shared )
+foreach(target IN LISTS _possible_targets_to_promote)
+  if(NOT TARGET ${target})
+    list(APPEND _targets_to_promote ${target})
+  endif()
+endforeach()
+
 set(CUDF_VERSION @CUDF_VERSION@)
 set(CUDF_VERSION_MAJOR @CUDF_VERSION_MAJOR@)
 set(CUDF_VERSION_MINOR @CUDF_VERSION_MINOR@)
@@ -36,21 +52,29 @@ include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetThrust.cmake)
 # find rmm
 set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}")
 include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetRMM.cmake)
-# find gtest
-include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake)
 
 # find arrow
-if(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
+  include("${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
+else()
+  if(NOT DEFINED CUDF_USE_ARROW_STATIC)
+    set(CUDF_USE_ARROW_STATIC OFF)
+  endif()
   include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetArrow.cmake)
 endif()
 
+# find GTest
+if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-gtesting-targets.cmake")
+  include("${CMAKE_CURRENT_LIST_DIR}/cudf-gtesting-targets.cmake")
+else()
+  # find gtest
+  include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetGTest.cmake)
+endif()
+
 list(POP_FRONT CMAKE_MODULE_PATH)
 
-if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
-  include("${CMAKE_CURRENT_LIST_DIR}/cudf-arrow-targets.cmake")
-endif()
-include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake")
 
+include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake")
 if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
   include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
 endif()
@@ -59,6 +83,12 @@ include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake")
 
 check_required_components(cudf)
 
+foreach(target IN LISTS _targets_to_promote)
+  if(TARGET ${target})
+    fix_cmake_global_defaults(${target})
+  endif()
+endforeach()
+
 set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
 
 include(FindPackageHandleStandardArgs)
diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in
index 6a280264d3c..66c669851fa 100644
--- a/cpp/cmake/cudf-config.cmake.in
+++ b/cpp/cmake/cudf-config.cmake.in
@@ -26,11 +26,6 @@ This module offers an optional testing component which defines the
 following IMPORTED GLOBAL  targets:
 
  cudf::cudftestutil     - The main cudf testing library
- cudf::gmock
- cudf::gmock_main
- cudf::gtest
- cudf::gtest_main
-
 
 Result Variables
 ^^^^^^^^^^^^^^^^
@@ -49,13 +44,11 @@ cmake_minimum_required(VERSION 3.18)
 
 set(_possible_targets_to_promote
       cudf::cudf
-      cudf::benchmark
-      cudf::benchmark_main
-      cudf::gmock
-      cudf::gtest
-      cudf::gmock_main
-      cudf::gtest_main
       cudf::cudftestutil
+      GTest::gmock
+      GTest::gmock_main
+      GTest::gtest
+      GTest::gtest_main
       rmm::rmm
       arrow_shared
       arrow_cuda_shared )
@@ -101,17 +94,22 @@ include("${CMAKE_CURRENT_LIST_DIR}/cudf-targets.cmake")
 if(testing IN_LIST cudf_FIND_COMPONENTS)
   enable_language(CUDA)
 
-  find_dependency(GTest @CUDF_MIN_VERSION_GTest@)
+  find_dependency(GTest @CUDF_MIN_VERSION_GTest@ CONFIG)
+
   include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
+
 endif()
 
 include("${CMAKE_CURRENT_LIST_DIR}/cudf-config-version.cmake")
 
 check_required_components(cudf)
 
-foreach(t IN LISTS _targets_to_promote)
-  if(TARGET ${t})
-    set_target_properties(${t} PROPERTIES IMPORTED_GLOBAL TRUE)
+foreach(target IN LISTS _targets_to_promote)
+  if(TARGET ${target})
+    get_target_property(_already_global ${target} IMPORTED_GLOBAL)
+    if(NOT _already_global)
+      set_target_properties(${target} PROPERTIES IMPORTED_GLOBAL TRUE)
+    endif()
   endif()
 endforeach()
 set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
diff --git a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
index 666ba0fbb2c..9e4f3c137b1 100644
--- a/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetGTest.cmake
@@ -26,7 +26,7 @@ function(find_and_configure_gtest VERSION)
         GIT_REPOSITORY  https://github.com/google/googletest.git
         GIT_TAG         release-${VERSION}
         GIT_SHALLOW     TRUE
-        OPTIONS         "INSTALL_GTEST OFF"
+        OPTIONS         "INSTALL_GTEST ON"
         # googletest >= 1.10.0 provides a cmake config file -- use it if it exists
         FIND_PACKAGE_ARGUMENTS "CONFIG")
     # Add GTest aliases if they don't already exist.
@@ -43,14 +43,6 @@ function(find_and_configure_gtest VERSION)
     fix_cmake_global_defaults(GTest::gmock)
     fix_cmake_global_defaults(GTest::gtest_main)
     fix_cmake_global_defaults(GTest::gmock_main)
-    if(GTest_ADDED)
-        install(TARGETS gmock
-                        gtest
-                        gmock_main
-                        gtest_main
-            DESTINATION lib
-            EXPORT cudf-testing-targets)
-    endif()
 endfunction()
 
 set(CUDF_MIN_VERSION_GTest 1.10.0)
diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
index e5d1f2f07a9..136947674f9 100644
--- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
+++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake
@@ -55,11 +55,6 @@ function(find_and_configure_rmm VERSION)
 
     # Make sure consumers of cudf can also see rmm::rmm
     fix_cmake_global_defaults(rmm::rmm)
-
-    if(NOT rmm_BINARY_DIR IN_LIST CMAKE_PREFIX_PATH)
-        list(APPEND CMAKE_PREFIX_PATH "${rmm_BINARY_DIR}")
-        set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} PARENT_SCOPE)
-    endif()
 endfunction()
 
 set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}")
diff --git a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake
index 4796495413e..1f7c15d4f75 100644
--- a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake
+++ b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake
@@ -40,12 +40,6 @@ function(find_and_configure_cudf VERSION)
                         "BUILD_BENCHMARKS OFF")
     cudfkafka_restore_if_enabled(BUILD_TESTS)
     cudfkafka_restore_if_enabled(BUILD_BENCHMARKS)
-
-    if(NOT cudf_BINARY_DIR IN_LIST CMAKE_PREFIX_PATH)
-        list(APPEND CMAKE_PREFIX_PATH "${cudf_BINARY_DIR}")
-        set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} PARENT_SCOPE)
-    endif()
-
 endfunction()
 
 set(CUDF_KAFKA_MIN_VERSION_cudf 0.19)
diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt
index f556d36d9d2..e813ed5439e 100644
--- a/cpp/libcudf_kafka/tests/CMakeLists.txt
+++ b/cpp/libcudf_kafka/tests/CMakeLists.txt
@@ -21,11 +21,7 @@ function(ConfigureTest CMAKE_TEST_NAME )
     add_executable(${CMAKE_TEST_NAME} ${ARGN})
     set_target_properties(${CMAKE_TEST_NAME}
         PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>")
-    if(TARGET cudf::gmock_main)
-		target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudf::gmock_main cudf::gtest_main cudf_kafka)
-    else()
-		target_link_libraries(${CMAKE_TEST_NAME} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka)
-    endif()
+    target_link_libraries(${CMAKE_TEST_NAME} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka)
     target_include_directories(${CMAKE_TEST_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include)
     add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
 endfunction()

From cddafd9b1dd3ab815020a513626a611cd8a50de0 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Mon, 29 Mar 2021 12:35:27 -0500
Subject: [PATCH 10/20] Add replacements column support for Java replaceNulls
 (#7750)

Adds Java bindings for `cudf::replace_nulls` with a columnar replacement parameter.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Robert (Bobby) Evans (@revans2)

URL: https://github.com/rapidsai/cudf/pull/7750
---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 18 ++++++-
 java/src/main/native/src/ColumnViewJni.cpp    | 20 +++++++-
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 50 ++++++++++++++++---
 3 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index b29b873092d..90fe3553abc 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -388,7 +388,19 @@ public final ColumnVector findAndReplaceAll(ColumnView oldValues, ColumnView new
    * @return - ColumnVector with nulls replaced by scalar
    */
   public final ColumnVector replaceNulls(Scalar scalar) {
-    return new ColumnVector(replaceNulls(getNativeView(), scalar.getScalarHandle()));
+    return new ColumnVector(replaceNullsScalar(getNativeView(), scalar.getScalarHandle()));
+  }
+
+  /**
+   * Returns a ColumnVector with any null values replaced with the corresponding row in the
+   * specified replacement column.
+   * This column and the replacement column must have the same type and number of rows.
+   *
+   * @param replacements column of replacement values
+   * @return column with nulls replaced by corresponding row of replacements column
+   */
+  public final ColumnVector replaceNulls(ColumnView replacements) {
+    return new ColumnVector(replaceNullsColumn(getNativeView(), replacements.getNativeView()));
   }
 
   /**
@@ -2840,7 +2852,9 @@ private static native long rollingWindow(
 
   private static native long charLengths(long viewHandle) throws CudfException;
 
-  private static native long replaceNulls(long viewHandle, long scalarHandle) throws CudfException;
+  private static native long replaceNullsScalar(long viewHandle, long scalarHandle) throws CudfException;
+
+  private static native long replaceNullsColumn(long viewHandle, long replaceViewHandle) throws CudfException;
 
   private static native long ifElseVV(long predVec, long trueVec, long falseVec) throws CudfException;
 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 3928794b55c..dc1acc50b5f 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -121,8 +121,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lowerStrings(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNulls(JNIEnv *env, jclass,
-                                                                    jlong j_col, jlong j_scalar) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsScalar(JNIEnv *env, jclass,
+                                                                          jlong j_col,
+                                                                          jlong j_scalar) {
   JNI_NULL_CHECK(env, j_col, "column is null", 0);
   JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0);
   try {
@@ -135,6 +136,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNulls(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsColumn(JNIEnv *env, jclass,
+                                                                          jlong j_col,
+                                                                          jlong j_replace_col) {  // native side of ColumnView.replaceNullsColumn
+  JNI_NULL_CHECK(env, j_col, "column is null", 0);  // both handles are checked for null up front
+  JNI_NULL_CHECK(env, j_replace_col, "replacement column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto col = reinterpret_cast<cudf::column_view *>(j_col);  // handles are column_view pointers carried as jlong
+    auto replacements = reinterpret_cast<cudf::column_view *>(j_replace_col);
+    std::unique_ptr<cudf::column> result = cudf::replace_nulls(*col, *replacements);
+    return reinterpret_cast<jlong>(result.release());  // release ownership; the raw pointer is returned as the handle
+  }
+  CATCH_STD(env, 0);  // NOTE(review): presumably maps C++ exceptions to Java exceptions and returns 0 - confirm
+}
+
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVV(JNIEnv *env, jclass,
                                                                 jlong j_pred_vec,
                                                                 jlong j_true_vec,
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 5a9404f5760..fe1cba5ceb1 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -1368,7 +1368,7 @@ void testFromScalarNullByte() {
   }
 
   @Test
-  void testReplaceEmptyColumn() {
+  void testReplaceNullsScalarEmptyColumn() {
     try (ColumnVector input = ColumnVector.fromBoxedBooleans();
          ColumnVector expected = ColumnVector.fromBoxedBooleans();
          Scalar s = Scalar.fromBool(false);
@@ -1378,7 +1378,7 @@ void testReplaceEmptyColumn() {
   }
 
   @Test
-  void testReplaceNullBoolsWithAllNulls() {
+  void testReplaceNullsScalarBoolsWithAllNulls() {
     try (ColumnVector input = ColumnVector.fromBoxedBooleans(null, null, null, null);
          ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, false, false);
          Scalar s = Scalar.fromBool(false);
@@ -1388,7 +1388,7 @@ void testReplaceNullBoolsWithAllNulls() {
   }
 
   @Test
-  void testReplaceSomeNullBools() {
+  void testReplaceNullsScalarSomeNullBools() {
     try (ColumnVector input = ColumnVector.fromBoxedBooleans(false, null, null, false);
          ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, false);
          Scalar s = Scalar.fromBool(true);
@@ -1398,7 +1398,7 @@ void testReplaceSomeNullBools() {
   }
 
   @Test
-  void testReplaceNullIntegersWithAllNulls() {
+  void testReplaceNullsScalarIntegersWithAllNulls() {
     try (ColumnVector input = ColumnVector.fromBoxedInts(null, null, null, null);
          ColumnVector expected = ColumnVector.fromBoxedInts(0, 0, 0, 0);
          Scalar s = Scalar.fromInt(0);
@@ -1408,7 +1408,7 @@ void testReplaceNullIntegersWithAllNulls() {
   }
 
   @Test
-  void testReplaceSomeNullIntegers() {
+  void testReplaceNullsScalarSomeNullIntegers() {
     try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null);
          ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 999, 4, 999);
          Scalar s = Scalar.fromInt(999);
@@ -1418,7 +1418,7 @@ void testReplaceSomeNullIntegers() {
   }
 
   @Test
-  void testReplaceNullsFailsOnTypeMismatch() {
+  void testReplaceNullsScalarFailsOnTypeMismatch() {
     try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null);
          Scalar s = Scalar.fromBool(true)) {
       assertThrows(CudfException.class, () -> input.replaceNulls(s).close());
@@ -1434,6 +1434,44 @@ void testReplaceNullsWithNullScalar() {
     }
   }
 
+  @Test
+  void testReplaceNullsColumnEmptyColumn() {
+    try (ColumnVector input = ColumnVector.fromBoxedBooleans();
+         ColumnVector r = ColumnVector.fromBoxedBooleans();
+         ColumnVector expected = ColumnVector.fromBoxedBooleans();
+         ColumnVector result = input.replaceNulls(r)) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testReplaceNullsColumnBools() {
+    try (ColumnVector input = ColumnVector.fromBoxedBooleans(null, true, null, false);
+         ColumnVector r = ColumnVector.fromBoxedBooleans(false, null, true, true);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, false);
+         ColumnVector result = input.replaceNulls(r)) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testReplaceNullsColumnIntegers() {
+    try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null);
+         ColumnVector r = ColumnVector.fromBoxedInts(996, 997, 998, 909, null);
+         ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 998, 4, null);
+         ColumnVector result = input.replaceNulls(r)) {
+      assertColumnsAreEqual(expected, result);
+    }
+  }
+
+  @Test
+  void testReplaceNullsColumnFailsOnTypeMismatch() {
+    try (ColumnVector input = ColumnVector.fromBoxedInts(1, 2, null, 4, null);
+         ColumnVector r = ColumnVector.fromBoxedBooleans(true)) {
+      assertThrows(CudfException.class, () -> input.replaceNulls(r).close());
+    }
+  }
+
   static QuantileMethod[] methods = {LINEAR, LOWER, HIGHER, MIDPOINT, NEAREST};
   static double[] quantiles = {0.0, 0.25, 0.33, 0.5, 1.0};
 

From 213b7f5d100d188d90e07e615da43e2e3baad10c Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Mon, 29 Mar 2021 13:29:24 -0500
Subject: [PATCH 11/20] Remove unused JVM array creation (#7748)

The JNI method to build a column from Arrow creates a Java array but then doesn't use it.  This removes the unnecessary JVM callback and object creation.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Robert (Bobby) Evans (@revans2)

URL: https://github.com/rapidsai/cudf/pull/7748
---
 java/src/main/native/src/ColumnVectorJni.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index 737abea6f13..ba0e4f05714 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -85,7 +85,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv *env,
     auto null_buffer = arrow::Buffer::Wrap(static_cast<const char *>(validity_address), static_cast<int>(validity_length));
     auto offsets_buffer = arrow::Buffer::Wrap(static_cast<const char *>(offsets_address), static_cast<int>(offsets_length));
 
-    cudf::jni::native_jlongArray outcol_handles(env, 1);
     std::shared_ptr<arrow::Array> arrow_array;
     switch (n_type) {
       case cudf::type_id::DECIMAL32:

From 42c3bf9b73b51f78d89d2e2d2616f992699fa144 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 29 Mar 2021 13:35:05 -0500
Subject: [PATCH 12/20] Fix data corruption in string columns (#7746)

Fixes: #7735

Minimal repro of the above issue is:


```python
>>> import cudf
>>> s = cudf.Series(['hi', 'hello', None])
>>> s
0       hi
1    hello
2     <NA>
dtype: string
>>> h = s[0:3]
0       hi
1    hello
2     <NA>
dtype: string
>>> s._column.null_count
1
>>> h._column.null_count
1
```


The mask is computed incorrectly in `Column.from_column_view` because `StringColumn.base_size` is itself computed incorrectly:
```python
>>> s._column.mask.to_host_array()
array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=uint8)
>>> h._column.mask.to_host_array()
array([], dtype=uint8) # Should have a mask similar to above one.

>>> s._column.base_size
0 # Should be 3
>>> h._column.base_size
0 # Should be 3
```

This PR fixes the calculation of `StringColumn.base_size` and adds tests that check it.

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7746
---
 python/cudf/cudf/core/column/string.py        | 13 +++++-------
 python/cudf/cudf/tests/test_serialize.py      | 21 +++++++++++++++++++
 python/cudf/cudf/tests/test_string.py         | 11 ++++++++++
 .../dask_cudf/tests/test_distributed.py       | 14 +++++++++++--
 4 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 11dd7556812..de2df9b50d7 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -75,10 +75,6 @@
     is_space as cpp_isspace,
     is_upper as cpp_is_upper,
 )
-from cudf._lib.strings.convert.convert_integers import (
-    is_integer as cpp_is_integer,
-)
-from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
 from cudf._lib.strings.combine import (
     concatenate as cpp_concatenate,
     join as cpp_join,
@@ -91,6 +87,10 @@
 from cudf._lib.strings.convert.convert_fixed_point import (
     to_decimal as cpp_to_decimal,
 )
+from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float
+from cudf._lib.strings.convert.convert_integers import (
+    is_integer as cpp_is_integer,
+)
 from cudf._lib.strings.convert.convert_urls import (
     url_decode as cpp_url_decode,
     url_encode as cpp_url_encode,
@@ -4760,10 +4760,7 @@ def base_size(self) -> int:
         if len(self.base_children) == 0:
             return 0
         else:
-            return int(
-                (self.base_children[0].size - 1)
-                / self.base_children[0].dtype.itemsize
-            )
+            return self.base_children[0].size - 1
 
     @property
     def data_array_view(self) -> cuda.devicearray.DeviceNDArray:
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index 656b66bf793..0e9c61b634d 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -296,3 +296,24 @@ def test_deserialize_cudf_0_16(datadir):
     actual = pickle.load(open(fname, "rb"))
 
     assert_eq(expected, actual)
+
+
+def test_serialize_sliced_string():
+    # https://github.com/rapidsai/cudf/issues/7735
+    data = ["hi", "hello", None]
+    pd_series = pd.Series(data, dtype=pd.StringDtype())
+    gd_series = cudf.Series(data, dtype="str")
+    sliced = gd_series[0:3]
+    serialized_gd_series = gd_series.serialize()
+    serialized_sliced = sliced.serialize()
+
+    # The slice [0:3] spans the whole series, so the first three
+    # serialized frames of both objects are expected to be identical.
+    for i in range(3):
+        assert_eq(
+            serialized_gd_series[1][i].to_host_array(),
+            serialized_sliced[1][i].to_host_array(),
+        )
+
+    recreated = cudf.Series.deserialize(*sliced.serialize())
+    assert_eq(recreated.to_pandas(nullable=True), pd_series)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 98b8bfb870d..8b1ad696f04 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -2922,3 +2922,14 @@ def test_string_std():
     assert_exceptions_equal(
         lfunc=psr.std, rfunc=sr.std, compare_error_message=False
     )
+
+
+def test_string_slice_with_mask():
+    actual = cudf.Series(["hi", "hello", None])
+    expected = actual[0:3]
+
+    assert actual._column.base_size == 3
+    assert_eq(actual._column.base_size, expected._column.base_size)
+    assert_eq(actual._column.null_count, expected._column.null_count)
+
+    assert_eq(actual, expected)
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index cb3c696adc3..85354704902 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -6,11 +6,11 @@
 from dask.distributed import Client
 from distributed.utils_test import loop  # noqa: F401
 
-import dask_cudf
-
 import cudf
 from cudf.tests.utils import assert_eq
 
+import dask_cudf
+
 dask_cuda = pytest.importorskip("dask_cuda")
 
 
@@ -65,3 +65,13 @@ def test_ucx_seriesgroupby():
             dask_df_g = dask_df.groupby(["a"]).b.sum().compute()
 
             assert dask_df_g.name == "b"
+
+
+def test_str_series_roundtrip():
+    with dask_cuda.LocalCUDACluster(n_workers=1) as cluster:
+        with Client(cluster):
+            expected = cudf.Series(["hi", "hello", None])
+            dask_series = dask_cudf.from_cudf(expected, npartitions=2)
+
+            actual = dask_series.compute()
+            assert_eq(actual, expected)

From 4dd75c4b96612d459d26194ab6c1129a0cd0fb95 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 29 Mar 2021 15:48:51 -0700
Subject: [PATCH 13/20] Memory map the input file only when GDS compatibility
 mode is not used (#7717)

`mmap` is expensive on some systems and we can expect better performance with file reads when GDS is used, especially with compatibility mode.
This PR adds a source type that does not use `mmap` for host reads. This type is used when GDS and its compatibility mode are enabled.
`file_source` is now a base class for file-based input and only implements the device_read functions.
`memory_mapped_source` class implements the host reads through the memory mapped file.
`direct_read_source` is a newly implemented class that uses read for host reads, no `mmap`.
Selection is done in `datasource::create` based on `cufile_config`.

Authors:
  - Vukasin Milovanovic (@vuule)

Approvers:
  - Devavret Makkar (@devavret)
  - David (@davidwendt)

URL: https://github.com/rapidsai/cudf/pull/7717
---
 cpp/include/cudf/io/datasource.hpp         |   2 +-
 cpp/src/io/utilities/datasource.cpp        | 160 +++++++++++++--------
 cpp/src/io/utilities/file_io_utilities.cpp | 109 ++++++--------
 cpp/src/io/utilities/file_io_utilities.hpp |  33 ++++-
 4 files changed, 174 insertions(+), 130 deletions(-)

diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index 8fcc045e6d2..ab7a3a6fa9b 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -123,7 +123,7 @@ class datasource {
    * @param[in] offset Bytes from the start
    * @param[in] size Bytes to read
    *
-   * @return The data buffer
+   * @return The data buffer (can be smaller than size)
    */
   virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
 
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 3f2884d5b7d..8f2a5389b4d 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -25,32 +25,69 @@
 
 namespace cudf {
 namespace io {
+namespace {
 
 /**
- * @brief Implementation class for reading from a file or memory source using
- * memory mapped access.
- *
- * Unlike Arrow's memory mapped IO class, this implementation allows memory
- * mapping a subset of the file where the starting offset may not be zero.
+ * @brief Base class for file input. Only implements direct device reads.
  */
-class memory_mapped_source : public datasource {
-  class memory_mapped_buffer : public buffer {
-    size_t _size   = 0;
-    uint8_t *_data = nullptr;
+class file_source : public datasource {
+ public:
+  explicit file_source(const char *filepath)
+    : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath))
+  {
+  }
+
+  virtual ~file_source() = default;
+
+  bool supports_device_read() const override { return _cufile_in != nullptr; }
+
+  bool is_device_read_preferred(size_t size) const
+  {
+    return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size);
+  }
+
+  std::unique_ptr<datasource::buffer> device_read(size_t offset,
+                                                  size_t size,
+                                                  rmm::cuda_stream_view stream) override
+  {
+    CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file.");
+
+    auto const read_size = std::min(size, _file.size() - offset);
+    return _cufile_in->read(offset, read_size, stream);
+  }
+
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t *dst,
+                     rmm::cuda_stream_view stream) override
+  {
+    CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file.");
 
-   public:
-    memory_mapped_buffer(uint8_t *data, size_t size) : _size(size), _data(data) {}
-    size_t size() const override { return _size; }
-    const uint8_t *data() const override { return _data; }
-  };
+    auto const read_size = std::min(size, _file.size() - offset);
+    return _cufile_in->read(offset, read_size, dst, stream);
+  }
+
+  size_t size() const override { return _file.size(); }
+
+ protected:
+  detail::file_wrapper _file;
+
+ private:
+  std::unique_ptr<detail::cufile_input_impl> _cufile_in;
+};
 
+/**
+ * @brief Implementation class for reading from a file using memory mapped access.
+ *
+ * Unlike Arrow's memory mapped IO class, this implementation allows memory mapping a subset of the
+ * file where the starting offset may not be zero.
+ */
+class memory_mapped_source : public file_source {
  public:
   explicit memory_mapped_source(const char *filepath, size_t offset, size_t size)
-    : _cufile_in(detail::make_cufile_input(filepath))
+    : file_source(filepath)
   {
-    auto const file = detail::file_wrapper(filepath, O_RDONLY);
-    _file_size      = file.size();
-    if (_file_size != 0) { map(file.desc(), offset, size); }
+    if (_file.size() != 0) map(_file.desc(), offset, size);
   }
 
   virtual ~memory_mapped_source()
@@ -65,7 +102,7 @@ class memory_mapped_source : public datasource {
     // Clamp length to available data in the mapped region
     auto const read_size = std::min(size, _map_size - (offset - _map_offset));
 
-    return std::make_unique<memory_mapped_buffer>(
+    return std::make_unique<non_owning_buffer>(
       static_cast<uint8_t *>(_map_addr) + (offset - _map_offset), read_size);
   }
 
@@ -81,49 +118,15 @@ class memory_mapped_source : public datasource {
     return read_size;
   }
 
-  bool supports_device_read() const override { return _cufile_in != nullptr; }
-
-  bool is_device_read_preferred(size_t size) const
-  {
-    return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size);
-  }
-
-  std::unique_ptr<datasource::buffer> device_read(size_t offset,
-                                                  size_t size,
-                                                  rmm::cuda_stream_view stream) override
-  {
-    if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file.");
-
-    auto const read_size = std::min(size, _map_size - (offset - _map_offset));
-    return _cufile_in->read(offset, read_size, stream);
-  }
-
-  size_t device_read(size_t offset,
-                     size_t size,
-                     uint8_t *dst,
-                     rmm::cuda_stream_view stream) override
-  {
-    if (!supports_device_read()) CUDF_FAIL("Device reads are not supported for this file.");
-    auto const read_size = std::min(size, _map_size - (offset - _map_offset));
-    return _cufile_in->read(offset, read_size, dst, stream);
-  }
-
-  size_t size() const override { return _file_size; }
-
  private:
   void map(int fd, size_t offset, size_t size)
   {
-    CUDF_EXPECTS(offset < _file_size, "Offset is past end of file");
+    CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file");
 
     // Offset for `mmap()` must be page aligned
     _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1);
 
-    // Clamp length to available data in the file
-    if (size == 0) {
-      size = _file_size - offset;
-    } else {
-      if ((offset + size) > _file_size) { size = _file_size - offset; }
-    }
+    if (size == 0 || (offset + size) > _file.size()) { size = _file.size() - offset; }
 
     // Size for `mmap()` needs to include the page padding
     _map_size = size + (offset - _map_offset);
@@ -134,11 +137,44 @@ class memory_mapped_source : public datasource {
   }
 
  private:
-  size_t _file_size  = 0;
-  void *_map_addr    = nullptr;
   size_t _map_size   = 0;
   size_t _map_offset = 0;
-  std::unique_ptr<detail::cufile_input_impl> _cufile_in;
+  void *_map_addr    = nullptr;
+};
+
+/**
+ * @brief Implementation class for reading from a file using `read` calls
+ *
+ * Potentially faster than `memory_mapped_source` when only a small portion of the file is read
+ * through the host.
+ */
+class direct_read_source : public file_source {
+ public:
+  explicit direct_read_source(const char *filepath) : file_source(filepath) {}
+
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  {
+    lseek(_file.desc(), offset, SEEK_SET);
+
+    // Clamp length to available data
+    ssize_t const read_size = std::min(size, _file.size() - offset);
+
+    std::vector<uint8_t> v(read_size);
+    CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed");
+    return buffer::create(std::move(v));
+  }
+
+  size_t host_read(size_t offset, size_t size, uint8_t *dst) override
+  {
+    lseek(_file.desc(), offset, SEEK_SET);
+
+    // Clamp length to available data
+    auto const read_size = std::min(size, _file.size() - offset);
+
+    CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast<ssize_t>(read_size),
+                 "read failed");
+    return read_size;
+  }
 };
 
 /**
@@ -185,10 +221,18 @@ class user_datasource_wrapper : public datasource {
   datasource *const source;  ///< A non-owning pointer to the user-implemented datasource
 };
 
+}  // namespace
+
 std::unique_ptr<datasource> datasource::create(const std::string &filepath,
                                                size_t offset,
                                                size_t size)
 {
+#ifdef CUFILE_FOUND
+  if (detail::cufile_config::instance()->is_required()) {
+    // avoid mmap as GDS is expected to be used for most reads
+    return std::make_unique<direct_read_source>(filepath.c_str());
+  }
+#endif
   // Use our own memory mapping implementation for direct file reads
   return std::make_unique<memory_mapped_source>(filepath.c_str(), offset, size);
 }
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 22ff057cbc1..322296715fc 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cudf_test/file_utilities.hpp>
 #include <io/utilities/file_io_utilities.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -26,93 +25,67 @@ namespace cudf {
 namespace io {
 namespace detail {
 
+size_t get_file_size(int file_descriptor)
+{
+  struct stat st;
+  CUDF_EXPECTS(fstat(file_descriptor, &st) != -1, "Cannot query file size");
+  return static_cast<size_t>(st.st_size);
+}
+
 file_wrapper::file_wrapper(std::string const &filepath, int flags)
-  : fd(open(filepath.c_str(), flags))
+  : fd(open(filepath.c_str(), flags)), _size{get_file_size(fd)}
 {
   CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath);
 }
 
 file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode)
-  : fd(open(filepath.c_str(), flags, mode))
+  : fd(open(filepath.c_str(), flags, mode)), _size{get_file_size(fd)}
 {
   CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath);
 }
 
 file_wrapper::~file_wrapper() { close(fd); }
 
-long file_wrapper::size() const
+std::string getenv_or(std::string const &env_var_name, std::string const &default_val)
 {
-  if (_size < 0) {
-    struct stat st;
-    CUDF_EXPECTS(fstat(fd, &st) != -1, "Cannot query file size");
-    _size = static_cast<size_t>(st.st_size);
-  }
-  return _size;
+  auto const env_val = std::getenv(env_var_name.c_str());
+  return (env_val == nullptr) ? default_val : std::string(env_val);
 }
 
 #ifdef CUFILE_FOUND
 
-/**
- * @brief Class that manages cuFile configuration.
- */
-class cufile_config {
-  std::string const default_policy    = "OFF";
-  std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON";
-
-  std::string const policy = default_policy;
-  temp_directory tmp_config_dir{"cudf_cufile_config"};
-
-  std::string getenv_or(std::string const &env_var_name, std::string const &default_val)
-  {
-    auto const env_val = std::getenv(env_var_name.c_str());
-    return (env_val == nullptr) ? default_val : std::string(env_val);
-  }
-
-  cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)}
-  {
-    if (is_enabled()) {
-      // Modify the config file based on the policy
-      auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json");
-      std::ifstream user_config_file(config_file_path);
-      // Modified config file is stored in a temporary directory
-      auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json";
-      std::ofstream cudf_config_file(cudf_config_path);
-
-      std::string line;
-      while (std::getline(user_config_file, line)) {
-        std::string const tag = "\"allow_compat_mode\"";
-        if (line.find(tag) != std::string::npos) {
-          // TODO: only replace the true/false value
-          // Enable compatiblity mode when cuDF does not fall back to host path
-          cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n";
-        } else {
-          cudf_config_file << line << '\n';
-        }
-
-        // Point libcufile to the modified config file
-        CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0,
-                     "Failed to set the cuFile config file environment variable.");
+cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)}
+{
+  if (is_enabled()) {
+    // Modify the config file based on the policy
+    auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json");
+    std::ifstream user_config_file(config_file_path);
+    // Modified config file is stored in a temporary directory
+    auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json";
+    std::ofstream cudf_config_file(cudf_config_path);
+
+    std::string line;
+    while (std::getline(user_config_file, line)) {
+      std::string const tag = "\"allow_compat_mode\"";
+      if (line.find(tag) != std::string::npos) {
+        // TODO: only replace the true/false value
+        // Enable compatibility mode when cuDF does not fall back to host path
+        cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n";
+      } else {
+        cudf_config_file << line << '\n';
       }
-    }
-  }
-
- public:
-  /**
-   * @brief Returns true when cuFile use is enabled.
-   */
-  bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; }
-
-  /**
-   * @brief Returns true when cuDF should not fall back to host IO.
-   */
-  bool is_required() const { return policy == "ALWAYS"; }
 
-  static cufile_config const *instance()
-  {
-    static cufile_config _instance;
-    return &_instance;
+      // Point libcufile to the modified config file
+      CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0,
+                   "Failed to set the cuFile config file environment variable.");
+    }
   }
-};
+}
+cufile_config const *cufile_config::instance()
+{
+  static cufile_config _instance;
+  return &_instance;
+}
 
 /**
  * @brief Class that dynamically loads the cuFile library and manages the cuFile driver.
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 85399bdd44d..0119484aee5 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -24,6 +24,7 @@
 
 #include <cudf/io/datasource.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf_test/file_utilities.hpp>
 
 #include <string>
 
@@ -35,14 +36,14 @@ namespace detail {
  * @brief Class that provides RAII for file handling.
  */
 class file_wrapper {
-  int const fd       = -1;
-  long mutable _size = -1;
+  int fd = -1;
+  size_t _size;
 
  public:
   explicit file_wrapper(std::string const &filepath, int flags);
   explicit file_wrapper(std::string const &filepath, int flags, mode_t mode);
   ~file_wrapper();
-  long size() const;
+  auto size() const { return _size; }
   auto desc() const { return fd; }
 };
 
@@ -128,6 +129,32 @@ class cufile_output : public cufile_io_base {
 
 class cufile_shim;
 
+/**
+ * @brief Class that manages cuFile configuration.
+ */
+class cufile_config {
+  std::string const default_policy    = "OFF";
+  std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON";
+
+  std::string const policy = default_policy;
+  temp_directory tmp_config_dir{"cudf_cufile_config"};
+
+  cufile_config();
+
+ public:
+  /**
+   * @brief Returns true when cuFile use is enabled.
+   */
+  bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; }
+
+  /**
+   * @brief Returns true when cuDF should not fall back to host IO.
+   */
+  bool is_required() const { return policy == "ALWAYS"; }
+
+  static cufile_config const *instance();
+};
+
 /**
  * @brief Class that provides RAII for cuFile file registration.
  */

From 56976fa8ca47392299ff8bb6710d25894f741ec6 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Mon, 29 Mar 2021 19:03:40 -0500
Subject: [PATCH 14/20] cudf::row_bit_count() support. (#7534)

Closes https://github.com/rapidsai/cudf/issues/7408

Some notes:
- There are some limitations on what this computes, specifically regarding lists or strings embedded inside structs that have null masks.  I've added some documentation for this.  @jlowe @revans2    This could be made to handle that case properly but it would incur a fairly significant performance cost, and likely would require a large amount of temporary memory.

- I made some modifications to the `test::print()` code for lists and structs to be a little more clear when displaying null masks.

- The structure of `flatten_functor` and `flatten_hierarchy` will probably raise some eyebrows.  These functions return 3 separate pieces of data and rather than trying to cram them awkwardly through as actual return values, they are passed by reference.

Authors:
  - @nvdbaranec

Approvers:
  - Jake Hemstad (@jrhemstad)
  - David (@davidwendt)
  - Jason Lowe (@jlowe)
  - Mark Harris (@harrism)

URL: https://github.com/rapidsai/cudf/pull/7534
---
 cpp/CMakeLists.txt                           |   1 +
 cpp/include/cudf/detail/transform.hpp        |  12 +-
 cpp/include/cudf/lists/lists_column_view.hpp |   1 -
 cpp/include/cudf/transform.hpp               |  31 +-
 cpp/include/cudf/types.hpp                   |   1 +
 cpp/src/jit/type.cpp                         |   1 +
 cpp/src/lists/drop_list_duplicates.cu        |   2 +-
 cpp/src/transform/row_bit_count.cu           | 542 +++++++++++++++++
 cpp/tests/CMakeLists.txt                     |   3 +-
 cpp/tests/transform/row_bit_count_test.cu    | 596 +++++++++++++++++++
 cpp/tests/utilities/column_utilities.cu      |  34 +-
 11 files changed, 1206 insertions(+), 18 deletions(-)
 create mode 100644 cpp/src/transform/row_bit_count.cu
 create mode 100644 cpp/tests/transform/row_bit_count_test.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 48562476070..5cd82e52180 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -389,6 +389,7 @@ add_library(cudf
     src/transform/jit/code/kernel.cpp
     src/transform/mask_to_bools.cu
     src/transform/nans_to_nulls.cu
+    src/transform/row_bit_count.cu
     src/transform/transform.cpp
     src/transpose/transpose.cu
     src/unary/cast_ops.cu
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
index bea480d85cd..b94223cdabe 100644
--- a/cpp/include/cudf/detail/transform.hpp
+++ b/cpp/include/cudf/detail/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -77,5 +77,15 @@ std::unique_ptr<column> mask_to_bools(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @copydoc cudf::row_bit_count
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> row_bit_count(
+  table_view const& t,
+  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index f8facb83975..768dde2c280 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -56,7 +56,6 @@ class lists_column_view : private column_view {
   using column_view::null_mask;
   using column_view::offset;
   using column_view::size;
-  using offset_type = int32_t;
   static_assert(std::is_same<offset_type, size_type>::value,
                 "offset_type is expected to be the same as size_type.");
   using offset_iterator = offset_type const*;
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 9b740d207e1..e99e0db21c5 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -142,5 +142,34 @@ std::unique_ptr<column> mask_to_bools(
   size_type end_bit,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
+ * each row.
+ *
+ * This function counts bits instead of bytes to account for the null mask which only has one
+ * bit per row.
+ *
+ * Each row in the returned column is the sum of the per-row size for each column in
+ * the table.
+ *
+ * In some cases, this is an inexact approximation. Specifically, columns of lists and strings
+ * require N+1 offsets to represent N rows. It is up to the caller to calculate the small
+ * additional overhead of the terminating offset for any group of rows being considered.
+ *
+ * This function returns the per-row sizes as the columns are currently formed. This can
+ * end up being larger than the number you would get by gathering the rows. Specifically,
+ * the push-down of struct column validity masks can nullify rows that contain data for
+ * string or list columns. In these cases, the size returned is conservative:
+ *
+ * row_bit_count(column(x)) >= row_bit_count(gather(column(x)))
+ *
+ * @param t The table view to perform the computation on.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return A 32-bit integer column containing the per-row bit counts.
+ */
+std::unique_ptr<column> row_bit_count(
+  table_view const& t,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 727284194d8..1b8d83883b3 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -89,6 +89,7 @@ class mutable_table_view;
 using size_type    = int32_t;
 using bitmask_type = uint32_t;
 using valid_type   = uint8_t;
+using offset_type  = int32_t;
 
 /**
  * @brief Similar to `std::distance` but returns `cudf::size_type` and performs `static_cast`
diff --git a/cpp/src/jit/type.cpp b/cpp/src/jit/type.cpp
index d71e2eb4df8..6b1e8c57c3d 100644
--- a/cpp/src/jit/type.cpp
+++ b/cpp/src/jit/type.cpp
@@ -71,6 +71,7 @@ std::string get_type_name(data_type type)
   // TODO: Remove in JIT type utils PR
   switch (type.id()) {
     case type_id::LIST: return CUDF_STRINGIFY(List);
+    case type_id::STRUCT: return CUDF_STRINGIFY(Struct);
     case type_id::DECIMAL32: return CUDF_STRINGIFY(int32_t);
     case type_id::DECIMAL64: return CUDF_STRINGIFY(int64_t);
 
diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu
index 529b7489c35..584b9791d19 100644
--- a/cpp/src/lists/drop_list_duplicates.cu
+++ b/cpp/src/lists/drop_list_duplicates.cu
@@ -34,7 +34,7 @@ namespace cudf {
 namespace lists {
 namespace detail {
 namespace {
-using offset_type = lists_column_view::offset_type;
+
 /**
  * @brief Copy list entries and entry list offsets ignoring duplicates
  *
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
new file mode 100644
index 00000000000..e36fa36596f
--- /dev/null
+++ b/cpp/src/transform/row_bit_count.cu
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+
+#include <thrust/optional.h>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+namespace cudf {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Struct which contains per-column information necessary to
+ * traverse a column hierarchy on the gpu.
+ *
+ * When `row_bit_count` is called, the input column hierarchy is flattened into a
+ * vector of column_device_views.  For each one of them, we store a column_info
+ * struct.   The `depth` field represents the depth of the column in the original
+ * hierarchy.
+ *
+ * As we traverse the hierarchy for each input row, we maintain a span representing
+ * the start and end rows for the current nesting depth.  At depth 0, this span is
+ * always just 1 row.  As we cross list boundaries in the hierarchy, this span
+ * grows. So for each column we visit we always know how many rows of it are relevant
+ * and can compute its contribution to the overall size.
+ *
+ *  An example using a list<list<int>> column, computing the size of row 1.
+ *
+ *  { {{1, 2}, {3, 4}, {5, 6}}, {{7}, {8, 9, 10}, {11, 12, 13, 14}} }
+ *
+ *  L0 = List<List<int32_t>>:
+ *  Length : 2
+ *  Offsets : 0, 3, 6
+ *     L1 = List<int32_t>:
+ *     Length : 6
+ *     Offsets : 0, 2, 4, 6, 7, 10, 14
+ *        I = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ *
+ *
+ *  span0 = [1, 2]                                        row 1 is represented by the span [1, 2]
+ *  span1 = [L0.offsets[span0[0]], L0.offsets[span0[1]]]  expand by the offsets of L0
+ *  span1 = [3, 6]                                        span applied to children of L0
+ *  span2 = [L1.offsets[span1[0]], L1.offsets[span1[1]]]  expand by the offsets of L1
+ *  span2 = [6, 14]                                       span applied to children of L1
+ *
+ *  The total size of our row is computed as:
+ *  (span0[1] - span0[0]) * sizeof(int)        the cost of the offsets for L0
+ *                 +
+ *  (span1[1] - span1[0]) * sizeof(int)        the cost of the offsets for L1
+ *                 +
+ *  (span2[1] - span2[0]) * sizeof(int)        the cost of the integers in I
+ *
+ * `depth` represents our depth in the source column hierarchy.
+ *
+ * "branches" within the spans can occur when we have lists inside of structs.
+ * consider a case where we are entering a struct<list, float> with a span of [4, 8].
+ * The internal list column will change that span to something else, say [5, 9].
+ * But when we finish processing the list column, the final float column wants to
+ * go back and use the original span [4, 8].
+ *
+ * [4, 8]  [5, 9]   [4, 8]
+ * struct< list<>   float>
+ *
+ * To accomplish this we maintain a stack of spans. Pushing the current span
+ * whenever we enter a branch, and popping a span whenever we leave a branch.
+ *
+ * `branch_depth_start` represents the branch depth as we reach a new column.
+ * if `branch_depth_start` is < the last branch depth we saw, we are returning
+ * from a branch and should pop off the stack.
+ *
+ * `branch_depth_end` represents the new branch depth caused by this column.
+ * if branch_depth_end > branch_depth_start, we are branching and need to
+ * push the current span on the stack.
+ *
+ */
+struct column_info {
+  size_type depth;
+  size_type branch_depth_start;
+  size_type branch_depth_end;
+};
+
+/**
+ * @brief Struct which contains hierarchy information precomputed on the host.
+ *
+ * If the input data contains only fixed-width types, this preprocess step
+ * produces the value `simple_per_row_size` which is a constant for every
+ * row in the output.  We can use this value and skip the more complicated
+ * processing for lists, structs and strings entirely if `complex_type_count`
+ * is 0.
+ *
+ */
+struct hierarchy_info {
+  hierarchy_info() : simple_per_row_size(0), complex_type_count(0), max_branch_depth(0) {}
+
+  // These two fields act as an optimization. If we find that the entire table
+  // is just fixed-width types, we do not need to do the more expensive kernel call that
+  // traverses the individual columns. So if complex_type_count is 0, we can just
+  // return a column where every row contains the value simple_per_row_size
+  size_type simple_per_row_size;  // in bits
+  size_type complex_type_count;
+
+  // max depth of span branches present in the hierarchy.
+  size_type max_branch_depth;
+};
+
+/**
+ * @brief Function which flattens the incoming column hierarchy into a vector
+ * of column_views and produces accompanying column_info and hierarchy_info
+ * metadata.
+ *
+ * @param begin: Beginning of a range of column views
+ * @param end: End of a range of column views
+ * @param out: (output) Flattened vector of output column_views
+ * @param info: (output) Additional per-output column_view metadata needed by the gpu
+ * @param h_info: (output) Information about the hierarchy
+ * @param cur_depth: Current absolute depth in the hierarchy
+ * @param cur_branch_depth: Current branch depth
+ * @param parent_index: Index into `out` representing our owning parent column
+ */
+template <typename ColIter>
+void flatten_hierarchy(ColIter begin,
+                       ColIter end,
+                       std::vector<cudf::column_view>& out,
+                       std::vector<column_info>& info,
+                       hierarchy_info& h_info,
+                       rmm::cuda_stream_view stream,
+                       size_type cur_depth                = 0,
+                       size_type cur_branch_depth         = 0,
+                       thrust::optional<int> parent_index = {});
+
+/**
+ * @brief Type-dispatched functor called by flatten_hierarchy.
+ *
+ */
+struct flatten_functor {
+  rmm::cuda_stream_view stream;
+
+  // fixed width
+  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  void operator()(column_view const& col,
+                  std::vector<cudf::column_view>& out,
+                  std::vector<column_info>& info,
+                  hierarchy_info& h_info,
+                  rmm::cuda_stream_view stream,
+                  size_type cur_depth,
+                  size_type cur_branch_depth,
+                  thrust::optional<int> parent_index)
+  {
+    out.push_back(col);
+    info.push_back({cur_depth, cur_branch_depth, cur_branch_depth});
+    h_info.simple_per_row_size +=
+      (sizeof(device_storage_type_t<T>) * CHAR_BIT) + (col.nullable() ? 1 : 0);
+  }
+
+  // strings
+  template <typename T, std::enable_if_t<std::is_same<T, string_view>::value>* = nullptr>
+  void operator()(column_view const& col,
+                  std::vector<cudf::column_view>& out,
+                  std::vector<column_info>& info,
+                  hierarchy_info& h_info,
+                  rmm::cuda_stream_view stream,
+                  size_type cur_depth,
+                  size_type cur_branch_depth,
+                  thrust::optional<int> parent_index)
+  {
+    out.push_back(col);
+    info.push_back({cur_depth, cur_branch_depth, cur_branch_depth});
+    h_info.complex_type_count++;
+  }
+
+  // lists: records the column, then flattens its sliced child one level deeper
+  template <typename T, std::enable_if_t<std::is_same<T, list_view>::value>* = nullptr>
+  void operator()(column_view const& col,
+                  std::vector<cudf::column_view>& out,
+                  std::vector<column_info>& info,
+                  hierarchy_info& h_info,
+                  rmm::cuda_stream_view stream,
+                  size_type cur_depth,
+                  size_type cur_branch_depth,
+                  thrust::optional<int> parent_index)
+  {
+    // track branch depth as we reach this list and after we pass it
+    size_type const branch_depth_start = cur_branch_depth;
+    auto const is_list_inside_struct =
+      parent_index && out[parent_index.value()].type().id() == type_id::STRUCT;
+    if (is_list_inside_struct) {  // a list whose parent is a struct opens a new branch
+      cur_branch_depth++;
+      h_info.max_branch_depth = max(h_info.max_branch_depth, cur_branch_depth);
+    }
+    size_type const branch_depth_end = cur_branch_depth;
+
+    out.push_back(col);
+    info.push_back({cur_depth, branch_depth_start, branch_depth_end});
+
+    lists_column_view lcv(col);
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [col = lcv.get_sliced_child(stream)](auto i) { return col; });
+    h_info.complex_type_count++;
+    // flatten the single child with this list (now at index out.size() - 1) as its parent
+    flatten_hierarchy(
+      iter, iter + 1, out, info, h_info, stream, cur_depth + 1, cur_branch_depth, out.size() - 1);
+  }
+
+  // structs: records the column, then flattens each sliced child one level deeper
+  template <typename T, std::enable_if_t<std::is_same<T, struct_view>::value>* = nullptr>
+  void operator()(column_view const& col,
+                  std::vector<cudf::column_view>& out,
+                  std::vector<column_info>& info,
+                  hierarchy_info& h_info,
+                  rmm::cuda_stream_view stream,
+                  size_type cur_depth,
+                  size_type cur_branch_depth,
+                  thrust::optional<int> parent_index)
+  {
+    out.push_back(col);
+    info.push_back({cur_depth, cur_branch_depth, cur_branch_depth});
+
+    h_info.simple_per_row_size += col.nullable() ? 1 : 0;  // one validity bit per row if nullable
+
+    structs_column_view scv(col);
+    auto iter = cudf::detail::make_counting_transform_iterator(
+      0, [&scv](auto i) { return scv.get_sliced_child(i); });
+    flatten_hierarchy(iter,
+                      iter + scv.num_children(),
+                      out,
+                      info,
+                      h_info,
+                      stream,
+                      cur_depth + 1,
+                      cur_branch_depth,
+                      out.size() - 1);  // this struct's index becomes the children's parent
+  }
+
+  // everything else: non-fixed-width types other than strings, lists and structs are rejected
+  template <typename T,
+            std::enable_if_t<!cudf::is_fixed_width<T>() && !std::is_same<T, string_view>::value &&
+                             !std::is_same<T, list_view>::value &&
+                             !std::is_same<T, struct_view>::value>* = nullptr>
+  void operator()(column_view const& col,
+                  std::vector<cudf::column_view>& out,
+                  std::vector<column_info>& info,
+                  hierarchy_info& h_info,
+                  rmm::cuda_stream_view stream,
+                  size_type cur_depth,
+                  size_type cur_branch_depth,
+                  thrust::optional<int> parent_index)
+  {
+    CUDF_FAIL("Unsupported column type in row_bit_count");  // always fails; no arguments are used
+  }
+};
+
+template <typename ColIter>
+void flatten_hierarchy(ColIter begin,
+                       ColIter end,
+                       std::vector<cudf::column_view>& out,
+                       std::vector<column_info>& info,
+                       hierarchy_info& h_info,
+                       rmm::cuda_stream_view stream,
+                       size_type cur_depth,
+                       size_type cur_branch_depth,
+                       thrust::optional<int> parent_index)
+{
+  std::for_each(begin, end, [&](column_view const& col) {  // visit [begin, end) in order
+    cudf::type_dispatcher(col.type(),  // selects the flatten_functor overload for col's type
+                          flatten_functor{stream},
+                          col,
+                          out,
+                          info,
+                          h_info,
+                          stream,
+                          cur_depth,
+                          cur_branch_depth,
+                          parent_index);
+  });
+}
+
+/**
+ * @brief Struct representing a half-open span of rows, [row_start, row_end).
+ *
+ */
+struct row_span {
+  size_type row_start, row_end;  // row_end is exclusive: the span holds row_end - row_start rows
+};
+
+/**
+ * @brief Functor for computing the size, in bits, of a `row_span` of rows for a given
+ * `column_device_view`
+ *
+ */
+struct row_size_functor {
+  /**
+   * @brief Computes size in bits of a span of rows in a fixed-width column.
+   *
+   * Computed as :   ((# of rows) * sizeof(data type) * 8)
+   *                 +
+   *                 1 bit per row for validity if applicable.
+   */
+  template <typename T>
+  __device__ size_type operator()(column_device_view const& col, row_span const& span)
+  {
+    auto const num_rows{span.row_end - span.row_start};
+    auto const element_size  = sizeof(device_storage_type_t<T>) * CHAR_BIT;  // bits per element
+    auto const validity_size = col.nullable() ? 1 : 0;  // one bit per row if nullable
+    return (element_size + validity_size) * num_rows;
+  }
+};
+
+/**
+ * @brief Computes size in bits of a span of rows in a strings column.
+ *
+ * Computed as :   ((# of rows) * sizeof(offset) * 8) + (total # of chars bytes * 8)
+ *                 +
+ *                 1 bit per row for validity if applicable.
+ */
+template <>
+__device__ size_type row_size_functor::operator()<string_view>(column_device_view const& col,
+                                                               row_span const& span)
+{
+  column_device_view const& offsets = col.child(strings_column_view::offsets_column_index);
+  auto const num_rows{span.row_end - span.row_start};
+  auto const row_start{span.row_start + col.offset()};  // shift the span by the column's offset
+  auto const row_end{span.row_end + col.offset()};
+
+  auto const offsets_size  = sizeof(offset_type) * CHAR_BIT;
+  auto const validity_size = col.nullable() ? 1 : 0;
+  auto const chars_size =  // difference of the bounding offsets, converted to bits
+    (offsets.data<offset_type>()[row_end] - offsets.data<offset_type>()[row_start]) * CHAR_BIT;
+  return ((offsets_size + validity_size) * num_rows) + chars_size;
+}
+
+/**
+ * @brief Computes size in bits of a span of rows in a list column.
+ *
+ * Computed as :   ((# of rows) * sizeof(offset) * 8)
+ *                 +
+ *                 1 bit per row for validity if applicable.
+ */
+template <>
+__device__ size_type row_size_functor::operator()<list_view>(column_device_view const& col,
+                                                             row_span const& span)
+{
+  // only the offsets and validity are charged here; the offsets child itself is never read.
+  auto const num_rows{span.row_end - span.row_start};
+
+  auto const offsets_size  = sizeof(offset_type) * CHAR_BIT;  // one offset per row, in bits
+  auto const validity_size = col.nullable() ? 1 : 0;          // one bit per row if nullable
+  return (offsets_size + validity_size) * num_rows;
+}
+
+/**
+ * @brief Computes size in bits of a span of rows in a struct column.
+ *
+ * Computed as :   1 bit per row for validity if applicable (child data is not counted here).
+ */
+template <>
+__device__ size_type row_size_functor::operator()<struct_view>(column_device_view const& col,
+                                                               row_span const& span)
+{
+  auto const num_rows{span.row_end - span.row_start};
+  return (col.nullable() ? 1 : 0) * num_rows;  // cost of validity
+}
+
+/**
+ * @brief Kernel for computing per-row sizes in bits.
+ *
+ * @param cols A span of column_device_views representing a column hierarchy
+ * @param info A span of column_info structs corresponding to the elements in `cols`
+ * @param output Output span of size (# rows) where per-row bit sizes are stored
+ * @param max_branch_depth Maximum depth of the span stack needed per-thread
+ */
+__global__ void compute_row_sizes(device_span<column_device_view const> cols,
+                                  device_span<column_info const> info,
+                                  device_span<size_type> output,
+                                  size_type max_branch_depth)
+{
+  extern __shared__ row_span thread_branch_stacks[];
+  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+  auto const num_rows = output.size();
+  if (tid >= num_rows) { return; }
+  // branch stack. points to the last list prior to branching. shared memory is per-block, so the
+  // stack is addressed by the thread's index within its block rather than by the global row id.
+  row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth);
+  size_type branch_depth{0};
+
+  // current row span - always starts at 1 row.
+  row_span cur_span{tid, tid + 1};
+
+  // output size
+  size_type& size = output[tid];
+  size            = 0;
+
+  size_type last_branch_depth{0};
+  for (size_type idx = 0; idx < cols.size(); idx++) {
+    column_device_view const& col = cols[idx];
+
+    // if we've returned from a branch
+    if (info[idx].branch_depth_start < last_branch_depth) {
+      cur_span = my_branch_stack[--branch_depth];
+    }
+    // if we're entering a new branch.
+    // NOTE: this case can happen (a pop and a push by the same column)
+    // when we have a struct<list, list>
+    if (info[idx].branch_depth_end > info[idx].branch_depth_start) {
+      my_branch_stack[branch_depth++] = cur_span;
+    }
+
+    // if we're back at depth 0, this is a new top-level column, so reset
+    // span info
+    if (info[idx].depth == 0) {
+      branch_depth      = 0;
+      last_branch_depth = 0;
+      cur_span          = row_span{tid, tid + 1};
+    }
+
+    // add the contributing size of this row
+    size += cudf::type_dispatcher(col.type(), row_size_functor{}, col, cur_span);
+
+    // if this is a list column, update the working span from our offsets
+    if (col.type().id() == type_id::LIST) {
+      column_device_view const& offsets = col.child(lists_column_view::offsets_column_index);
+      auto const base_offset            = offsets.data<offset_type>()[col.offset()];
+      cur_span.row_start =
+        offsets.data<offset_type>()[cur_span.row_start + col.offset()] - base_offset;
+      cur_span.row_end = offsets.data<offset_type>()[cur_span.row_end + col.offset()] - base_offset;
+    }
+
+    last_branch_depth = info[idx].branch_depth_end;
+  }
+}
+
+}  // anonymous namespace
+
+/**
+ * @copydoc cudf::detail::row_bit_count
+ *
+ */
+std::unique_ptr<column> row_bit_count(table_view const& t,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
+{
+  // no rows: return an empty INT32 column
+  if (t.num_rows() <= 0) { return cudf::make_empty_column(data_type{type_id::INT32}); }
+
+  // flatten the hierarchy and determine some information about it.
+  std::vector<cudf::column_view> cols;
+  std::vector<column_info> info;
+  hierarchy_info h_info;
+  flatten_hierarchy(t.begin(), t.end(), cols, info, h_info, stream);
+  CUDF_EXPECTS(info.size() == cols.size(), "Size/info mismatch");
+
+  // create output buffer and view (INT32, one element per input row, no null mask)
+  auto output = cudf::make_fixed_width_column(
+    data_type{type_id::INT32}, t.num_rows(), mask_state::UNALLOCATED, stream, mr);
+  mutable_column_view mcv = output->mutable_view();
+
+  // simple case.  if we have no complex types (lists, strings, etc), the per-row size is already
+  // trivially computed
+  if (h_info.complex_type_count <= 0) {
+    thrust::fill(rmm::exec_policy(stream),
+                 mcv.begin<size_type>(),
+                 mcv.end<size_type>(),
+                 h_info.simple_per_row_size);
+    return output;
+  }
+
+  // create a contiguous block of column_device_views
+  auto d_cols = contiguous_copy_column_device_views<column_device_view>(cols, stream);
+
+  // move stack info to the gpu
+  rmm::device_uvector<column_info> d_info(info.size(), stream);
+  CUDA_TRY(cudaMemcpyAsync(d_info.data(),
+                           info.data(),
+                           sizeof(column_info) * info.size(),
+                           cudaMemcpyHostToDevice,
+                           stream.value()));
+
+  // each thread needs to maintain a stack of row spans of size max_branch_depth. we will use
+  // shared memory to do this rather than allocating a potentially gigantic temporary buffer
+  // of memory of size (# input rows * sizeof(row_span) * max_branch_depth).
+  auto const shmem_per_thread = sizeof(row_span) * h_info.max_branch_depth;
+  int device_id;
+  CUDA_TRY(cudaGetDevice(&device_id));
+  int shmem_limit_per_block;
+  CUDA_TRY(
+    cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
+  constexpr int max_block_size = 256;  // upper bound; reduced below if the stacks would not fit
+  auto const block_size =
+    shmem_per_thread != 0
+      ? std::min(max_block_size, shmem_limit_per_block / static_cast<int>(shmem_per_thread))
+      : max_block_size;
+  auto const shared_mem_size = shmem_per_thread * block_size;  // dynamic shared memory per block
+  // should we be aborting if we reach some extremely small block size, or just if we hit 0?
+  CUDF_EXPECTS(block_size > 0, "Encountered a column hierarchy too complex for row_bit_count");
+
+  cudf::detail::grid_1d grid{t.num_rows(), block_size, 1};
+  compute_row_sizes<<<grid.num_blocks, block_size, shared_mem_size, stream.value()>>>(
+    {std::get<1>(d_cols), cols.size()},
+    {d_info.data(), info.size()},
+    {mcv.data<size_type>(), static_cast<std::size_t>(t.num_rows())},
+    h_info.max_branch_depth);
+
+  return output;
+}
+
+}  // namespace detail
+
+/**
+ * @copydoc cudf::row_bit_count
+ *
+ */
+std::unique_ptr<column> row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr)
+{
+  return detail::row_bit_count(t, rmm::cuda_stream_default, mr);  // forwards on the default stream
+}
+
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ab14c2577bb..082f039054e 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -162,7 +162,8 @@ ConfigureTest(TRANSFORM_TEST
     transform/integration/unary-transform-test.cpp
     transform/nans_to_null_test.cpp
     transform/mask_to_bools_test.cpp
-    transform/bools_to_mask_test.cpp)
+    transform/bools_to_mask_test.cpp
+    transform/row_bit_count_test.cu)
 
 ###################################################################################################
 # - interop tests -------------------------------------------------------------------------
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
new file mode 100644
index 00000000000..21e5c818197
--- /dev/null
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/types.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+using namespace cudf;
+
+template <typename T>
+struct RowBitCountTyped : public cudf::test::BaseFixture {
+};
+// register the typed fixture against the cudf::test::FixedWidthTypes type list
+TYPED_TEST_CASE(RowBitCountTyped, cudf::test::FixedWidthTypes);
+
+TYPED_TEST(RowBitCountTyped, SimpleTypes)
+{
+  using T = TypeParam;
+
+  auto col = cudf::make_fixed_width_column(data_type{type_to_id<T>()}, 16);
+
+  table_view t({*col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect size of the type per row; no validity bit is expected for this column
+  auto expected = make_fixed_width_column(data_type{type_id::INT32}, 16);
+  cudf::mutable_column_view mcv(*expected);
+  thrust::fill(rmm::exec_policy(0),
+               mcv.begin<size_type>(),
+               mcv.end<size_type>(),
+               sizeof(device_storage_type_t<T>) * CHAR_BIT);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
+}
+
+TYPED_TEST(RowBitCountTyped, SimpleTypesWithNulls)
+{
+  using T = TypeParam;
+
+  auto iter   = thrust::make_counting_iterator(0);
+  auto valids = cudf::detail::make_counting_transform_iterator(
+    0, [](int i) { return i % 2 == 0 ? true : false; });  // even rows valid, odd rows null
+  cudf::test::fixed_width_column_wrapper<T> col(iter, iter + 16, valids);
+
+  table_view t({col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect size of the type + 1 bit per row (the validity bit is counted for every row)
+  auto expected = make_fixed_width_column(data_type{type_id::INT32}, 16);
+  cudf::mutable_column_view mcv(*expected);
+  thrust::fill(rmm::exec_policy(0),
+               mcv.begin<size_type>(),
+               mcv.end<size_type>(),
+               (sizeof(device_storage_type_t<T>) * CHAR_BIT) + 1);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
+}
+
+template <typename T>
+std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_list_column()
+{
+  using LCW                     = cudf::test::lists_column_wrapper<T, int>;
+  constexpr size_type type_size = sizeof(device_storage_type_t<T>) * CHAR_BIT;
+
+  // clang-format off
+  cudf::test::lists_column_wrapper<T, int> col{ {{1, 2}, {3, 4, 5}}, 
+                                                LCW{LCW{}}, 
+                                                {LCW{10}},
+                                                {{6, 7, 8}, {9}},
+                                                {{-1, -2}, {-3, -4}},
+                                                {{-5, -6, -7}, {-8, -9}} };
+  // clang-format on
+
+  // expected size = offset bytes (4 for the outer row + 4 per inner list) * 8 + leaf value bits
+  cudf::test::fixed_width_column_wrapper<size_type> expected{
+    ((4 + 8) * CHAR_BIT) + (type_size * 5),
+    ((4 + 0) * CHAR_BIT) + (type_size * 0),
+    ((4 + 4) * CHAR_BIT) + (type_size * 1),
+    ((4 + 8) * CHAR_BIT) + (type_size * 4),
+    ((4 + 8) * CHAR_BIT) + (type_size * 4),
+    ((4 + 8) * CHAR_BIT) + (type_size * 5)};
+
+  return {col.release(), expected.release()};  // {list column, expected per-row bit counts}
+}
+
+TYPED_TEST(RowBitCountTyped, Lists)
+{
+  using T = TypeParam;
+
+  std::unique_ptr<column> col;
+  std::unique_ptr<column> expected_sizes;
+  std::tie(col, expected_sizes) = build_list_column<T>();  // column plus its expected sizes
+
+  table_view t({*col});
+  auto result = cudf::row_bit_count(t);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_sizes, *result);
+}
+
+TYPED_TEST(RowBitCountTyped, ListsWithNulls)
+{
+  using T                       = TypeParam;
+  using LCW                     = cudf::test::lists_column_wrapper<T, int>;
+  constexpr size_type type_size = sizeof(device_storage_type_t<T>) * CHAR_BIT;
+
+  std::vector<bool> valids{true, false, true};    // validity of the leaf values {3, 4, 5}
+  std::vector<bool> valids2{false, true, false};  // validity of the leaf values {6, 7, 8}
+  std::vector<bool> valids3{true, false};         // validity of the two inner lists in the last row
+
+  // clang-format off
+  cudf::test::lists_column_wrapper<T, int> col{ {{1, 2}, {{3, 4, 5}, valids.begin()}}, 
+                                                LCW{LCW{}}, 
+                                                {LCW{10}}, 
+                                                {{{{6, 7, 8}, valids2.begin()}, {9}}, valids3.begin()} };
+  // clang-format on
+
+  table_view t({col});
+  auto result = cudf::row_bit_count(t);
+
+  // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf + validity
+  // where applicable
+  cudf::test::fixed_width_column_wrapper<size_type> expected{
+    ((4 + 8) * CHAR_BIT) + (type_size * 5) + 7,
+    ((4 + 0) * CHAR_BIT) + (type_size * 0),
+    ((4 + 4) * CHAR_BIT) + (type_size * 1) + 2,
+    ((4 + 8) * CHAR_BIT) + (type_size * 3) + 5};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+struct RowBitCount : public cudf::test::BaseFixture {  // fixture for the non-typed tests below
+};
+
+TEST_F(RowBitCount, Strings)
+{
+  std::vector<std::string> strings{"abc", "ï", "", "z", "bananas", "warp", "", "zing"};
+
+  cudf::test::strings_column_wrapper col(strings.begin(), strings.end());
+
+  table_view t({col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect 1 offset (4 bytes) + length of string in bytes (std::string::size()) per row
+  auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) {
+    return (static_cast<size_type>(strings[i].size()) + sizeof(offset_type)) * CHAR_BIT;
+  });
+  cudf::test::fixed_width_column_wrapper<size_type> expected(size_iter, size_iter + strings.size());
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(RowBitCount, StringsWithNulls)
+{
+  // clang-format off
+  std::vector<std::string> strings { "daïs", "def", "", "z", "bananas", "warp", "", "zing" };
+  std::vector<bool>        valids  {  1,      0,    0,  1,   0,          1,      1,  1 };
+  // clang-format on
+
+  cudf::test::strings_column_wrapper col(strings.begin(), strings.end(), valids.begin());
+
+  table_view t({col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect 1 offset (4 bytes) + (byte length of string, or 0 if null) + 1 validity bit per row
+  auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings, &valids](int i) {
+    return ((static_cast<size_type>(valids[i] ? strings[i].size() : 0) + sizeof(offset_type)) *
+            CHAR_BIT) +
+           1;
+  });
+  cudf::test::fixed_width_column_wrapper<size_type> expected(size_iter, size_iter + strings.size());
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_struct_column()
+{
+  std::vector<bool> struct_validity{0, 1, 1, 1, 1, 0};
+  std::vector<std::string> strings{"abc", "def", "", "z", "bananas", "daïs"};
+
+  cudf::test::fixed_width_column_wrapper<float> col0{0, 1, 2, 3, 4, 5};
+  cudf::test::fixed_width_column_wrapper<int16_t> col1{{8, 9, 10, 11, 12, 13}, {1, 0, 1, 1, 1, 1}};
+  cudf::test::strings_column_wrapper col2(strings.begin(), strings.end());
+
+  // creating a struct column will cause all child columns to be promoted to have validity
+  cudf::test::structs_column_wrapper struct_col({col0, col1, col2}, struct_validity);
+
+  // expect (1 offset (4 bytes) + length of string (even in null struct rows) + 1 validity bit) +
+  //        (1 float + 1 validity bit) +
+  //        (1 int16_t + 1 validity bit) +
+  //        (1 validity bit)
+  auto size_iter =
+    cudf::detail::make_counting_transform_iterator(0, [&strings, &struct_validity](int i) {
+      return (sizeof(float) * CHAR_BIT) + 1 + (sizeof(int16_t) * CHAR_BIT) + 1 +
+             (static_cast<size_type>(strings[i].size()) * CHAR_BIT) +
+             (sizeof(offset_type) * CHAR_BIT) + 1 + 1;  // struct_validity is captured but unused
+    });
+  cudf::test::fixed_width_column_wrapper<size_type> expected_sizes(size_iter,
+                                                                   size_iter + strings.size());
+
+  return {struct_col.release(), expected_sizes.release()};
+}
+
+TEST_F(RowBitCount, StructsNoNulls)
+{
+  std::vector<std::string> strings{"abc", "daïs", "", "z", "bananas", "warp"};
+
+  cudf::test::fixed_width_column_wrapper<float> col0{0, 1, 2, 3, 4, 5};
+  cudf::test::fixed_width_column_wrapper<int16_t> col1{8, 9, 10, 11, 12, 13};
+  cudf::test::strings_column_wrapper col2(strings.begin(), strings.end());
+
+  cudf::test::structs_column_wrapper struct_col({col0, col1, col2});
+
+  table_view t({struct_col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect 1 offset (4 bytes) + (length of string) + 1 float + 1 int16_t; no validity bits
+  auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) {
+    return ((sizeof(float) + sizeof(int16_t)) * CHAR_BIT) +
+           ((static_cast<size_type>(strings[i].size()) + sizeof(offset_type)) * CHAR_BIT);
+  });
+  cudf::test::fixed_width_column_wrapper<size_type> expected(size_iter, size_iter + t.num_rows());
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(RowBitCount, StructsNulls)
+{
+  std::unique_ptr<column> struct_col;
+  std::unique_ptr<column> expected_sizes;
+  std::tie(struct_col, expected_sizes) = build_struct_column();  // column plus expected sizes
+  table_view t({*struct_col});
+  auto result = cudf::row_bit_count(t);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_sizes, *result);
+}
+
+TEST_F(RowBitCount, StructsNested)
+{
+  // struct<struct<int>, int16>
+  cudf::test::fixed_width_column_wrapper<int> col0{0, 1, 2, 3, 4, 5};
+  cudf::test::structs_column_wrapper inner_struct({col0});
+
+  cudf::test::fixed_width_column_wrapper<int16_t> col1{8, 9, 10, 11, 12, 13};
+  cudf::test::structs_column_wrapper struct_col({inner_struct, col1});
+
+  table_view t({struct_col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect (4 + 2) bytes per row, expressed in bits
+  auto size_iter =
+    cudf::detail::make_counting_transform_iterator(0, [&](int i) { return (4 + 2) * CHAR_BIT; });
+  cudf::test::fixed_width_column_wrapper<size_type> expected(size_iter, size_iter + t.num_rows());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_nested_and_expected_column(
+  std::vector<bool> const& struct_validity)
+{
+  // tests the "branching" case ->  list<struct<list, ...>>
+
+  // List<Struct<List<int>, float, int16>>
+
+  // Inner list column
+  // clang-format off
+  cudf::test::lists_column_wrapper<int> list{
+    {1, 2, 3, 4, 5},     
+    {6, 7, 8},
+    {33, 34, 35, 36, 37, 38, 39},
+    {-1, -2},
+    {-10, -11, -1, -20},
+    {40, 41, 42},
+    {100, 200, 300},
+    {-100, -200, -300}};
+  // clang-format on
+
+  // floats
+  std::vector<float> ages{5, 10, 15, 20, 4, 75, 16, -16};
+  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 1};
+  auto ages_column =
+    cudf::test::fixed_width_column_wrapper<float>(ages.begin(), ages.end(), ages_validity.begin());
+
+  // int16 values
+  std::vector<int16_t> vals{-1, -2, -3, 1, 2, 3, 8, 9};
+  auto i16_column = cudf::test::fixed_width_column_wrapper<int16_t>(vals.begin(), vals.end());
+
+  // Assemble struct column
+  auto struct_column =
+    cudf::test::structs_column_wrapper({list, ages_column, i16_column}, struct_validity);
+
+  // wrap in a list (6 outer rows over the 8 struct rows)
+  std::vector<int> outer_offsets{0, 1, 1, 3, 6, 7, 8};
+  cudf::test::fixed_width_column_wrapper<int> outer_offsets_col(outer_offsets.begin(),
+                                                                outer_offsets.end());
+  auto const size = static_cast<column_view>(outer_offsets_col).size() - 1;
+  // hard-coded expected bit counts, one per outer list row
+  cudf::test::fixed_width_column_wrapper<size_type> expected_sizes{276, 32, 520, 572, 212, 212};
+
+  return {cudf::make_lists_column(static_cast<cudf::size_type>(size),
+                                  outer_offsets_col.release(),
+                                  struct_column.release(),
+                                  cudf::UNKNOWN_NULL_COUNT,
+                                  rmm::device_buffer{}),
+          expected_sizes.release()};
+}
+
+std::unique_ptr<column> build_nested_column(std::vector<bool> const& struct_validity)
+{
+  // List<Struct<List<List<int>>, Struct<int16>>>
+
+  // Inner list column
+  // clang-format off
+  cudf::test::lists_column_wrapper<int> list{    
+     {{1, 2, 3, 4, 5}, {2, 3}},
+     {{6, 7, 8}, {8, 9}},
+     {{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}};
+  // clang-format on
+
+  // Inner struct
+  std::vector<int16_t> vals{-1, -2, -3};
+  auto i16_column   = cudf::test::fixed_width_column_wrapper<int16_t>(vals.begin(), vals.end());
+  auto inner_struct = cudf::test::structs_column_wrapper({i16_column});
+
+  // outer struct (struct_validity supplies one flag per struct row)
+  auto outer_struct = cudf::test::structs_column_wrapper({list, inner_struct}, struct_validity);
+
+  // wrap in a list (3 outer rows over the 3 struct rows; the middle row is empty)
+  std::vector<int> outer_offsets{0, 1, 1, 3};
+  cudf::test::fixed_width_column_wrapper<int> outer_offsets_col(outer_offsets.begin(),
+                                                                outer_offsets.end());
+  auto const size = static_cast<column_view>(outer_offsets_col).size() - 1;
+  return make_lists_column(static_cast<cudf::size_type>(size),
+                           outer_offsets_col.release(),
+                           outer_struct.release(),
+                           cudf::UNKNOWN_NULL_COUNT,
+                           rmm::device_buffer{});
+}
+
+TEST_F(RowBitCount, NestedTypes)
+{
+  // List<Struct<List<int>, float, int16>>
+  {
+    std::unique_ptr<column> col_no_nulls;
+    std::unique_ptr<column> expected_sizes;
+    std::tie(col_no_nulls, expected_sizes) =
+      build_nested_and_expected_column({1, 1, 1, 1, 1, 1, 1, 1});
+    table_view no_nulls_t({*col_no_nulls});
+    auto no_nulls_result = cudf::row_bit_count(no_nulls_t);
+
+    auto col_nulls = build_nested_and_expected_column({0, 0, 1, 1, 1, 1, 1, 1}).first;
+    table_view nulls_t({*col_nulls});
+    auto nulls_result = cudf::row_bit_count(nulls_t);
+
+    // List<Struct<List<int>, float, int16>>
+    //
+    // this illustrates the difference between a row_bit_count
+    // returning a pre-gather result, or a post-gather result.
+    //
+    // in a post-gather situation, the nulls in the struct would result in the values
+    // nested in the list below to be dropped, resulting in smaller row sizes.
+    //
+    // however, for performance reasons, row_bit_count simply walks the data that is
+    // currently there. so list rows that are null, but have a real span of
+    // offsets (X, Y) instead of (X, X)  will end up getting the child data for those
+    // rows included.
+    //
+    // if row_bit_count() is changed to return a post-gather result (which may be desirable),
+    // the nulls_result case below will start failing and will need to be changed.
+    //
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_sizes, *no_nulls_result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_sizes, *nulls_result);
+  }
+
+  // List<Struct<List<List<int>>, Struct<int16>>>
+  {
+    auto col_no_nulls = build_nested_column({1, 1, 1});
+    table_view no_nulls_t({*col_no_nulls});
+    auto no_nulls_result = cudf::row_bit_count(no_nulls_t);
+
+    auto col_nulls = build_nested_column({1, 0, 1});
+    table_view nulls_t({*col_nulls});
+    auto nulls_result = cudf::row_bit_count(nulls_t);
+
+    cudf::test::fixed_width_column_wrapper<size_type> expected_sizes{372, 32, 840};
+
+    // same explanation as above
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *no_nulls_result);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *nulls_result);
+  }
+
+  // test pushing/popping multiple times within one struct, and branch depth > 1
+  //
+  // Struct<int, List<int>, float, List<List<int16>>, Struct<List<int>, List<Struct<List<int>,
+  // float>>>, int8_t>
+  {
+    cudf::test::lists_column_wrapper<int> l0{{1, 2, 3}, {4, 5}, {6, 7, 8, 9}, {5}};
+    cudf::test::lists_column_wrapper<int16_t> l1{
+      {{-1, -2}, {3, 4}}, {{4, 5}, {6, 7, 8}}, {{-6, -7}, {2}}, {{-11, -11}, {-12, -12}, {3}}};
+    cudf::test::lists_column_wrapper<int> l2{{-1, -2}, {4, 5}, {-6, -7}, {1}};
+    cudf::test::lists_column_wrapper<int> l3{{-1, -2, 0}, {5}, {-1, -6, -7}, {1, 2}};
+
+    cudf::test::fixed_width_column_wrapper<int> c0{1, 2, 3, 4};
+    cudf::test::fixed_width_column_wrapper<float> c1{1, 2, 3, 4};
+    cudf::test::fixed_width_column_wrapper<int8_t> c2{1, 2, 3, 4};
+    cudf::test::fixed_width_column_wrapper<float> c3{11, 12, 13, 14};
+
+    // innermost List<Struct<List<int>, float>>
+    auto innermost_struct = cudf::test::structs_column_wrapper({l3, c3});
+    std::vector<int> l4_offsets{0, 1, 2, 3, 4};
+    cudf::test::fixed_width_column_wrapper<int> l4_offsets_col(l4_offsets.begin(),
+                                                               l4_offsets.end());
+    auto const l4_size = l4_offsets.size() - 1;
+    auto l4            = cudf::make_lists_column(static_cast<cudf::size_type>(l4_size),
+                                      l4_offsets_col.release(),
+                                      innermost_struct.release(),
+                                      cudf::UNKNOWN_NULL_COUNT,
+                                      rmm::device_buffer{});
+
+    // inner struct
+    std::vector<std::unique_ptr<column>> inner_struct_children;
+    inner_struct_children.push_back(l2.release());
+    inner_struct_children.push_back(std::move(l4));
+    auto inner_struct = cudf::test::structs_column_wrapper(std::move(inner_struct_children));
+
+    // outer struct
+    auto struct_col = cudf::test::structs_column_wrapper({c0, l0, c1, l1, inner_struct, c2});
+
+    table_view t({struct_col});
+    auto result = cudf::row_bit_count(t);
+
+    cudf::test::fixed_width_column_wrapper<size_type> expected_sizes{648, 568, 664, 568};
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_sizes, *result);
+  }
+}
+
+struct sum_functor {  // device functor returning the element-wise sum of three size arrays
+  size_type const* s0;
+  size_type const* s1;
+  size_type const* s2;
+
+  size_type operator() __device__(int i) { return s0[i] + s1[i] + s2[i]; }
+};
+
+TEST_F(RowBitCount, Table)
+{
+  // complex nested column
+  std::unique_ptr<column> col0;
+  std::unique_ptr<column> col0_sizes;
+  std::tie(col0, col0_sizes) = build_nested_and_expected_column({1, 1, 1, 1, 1, 1, 1, 1});
+
+  // struct column
+  std::unique_ptr<column> col1;
+  std::unique_ptr<column> col1_sizes;
+  std::tie(col1, col1_sizes) = build_struct_column();
+
+  // list column
+  std::unique_ptr<column> col2;
+  std::unique_ptr<column> col2_sizes;
+  std::tie(col2, col2_sizes) = build_list_column<int16_t>();
+
+  table_view t({*col0, *col1, *col2});
+  auto result = cudf::row_bit_count(t);
+
+  // sum all column sizes: the expected table row size is the sum of the per-column sizes
+  column_view cv0 = static_cast<column_view>(*col0_sizes);
+  column_view cv1 = static_cast<column_view>(*col1_sizes);
+  column_view cv2 = static_cast<column_view>(*col2_sizes);
+  auto expected   = cudf::make_fixed_width_column(data_type{type_id::INT32}, t.num_rows());
+  cudf::mutable_column_view mcv(*expected);
+  thrust::transform(
+    rmm::exec_policy(0),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(0) + t.num_rows(),
+    mcv.begin<size_type>(),
+    sum_functor{cv0.data<size_type>(), cv1.data<size_type>(), cv2.data<size_type>()});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
+}
+
+TEST_F(RowBitCount, SlicedColumnsFixedWidth)
+{
+  auto const slice_size = 7;
+  cudf::test::fixed_width_column_wrapper<int16_t> c0_unsliced{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  auto c0 = cudf::slice(c0_unsliced, {2, 2 + slice_size});
+
+  table_view t({c0});
+  auto result = cudf::row_bit_count(t);
+
+  cudf::test::fixed_width_column_wrapper<size_type> expected{16, 16, 16, 16, 16, 16, 16};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(RowBitCount, SlicedColumnsStrings)
+{
+  auto const slice_size = 7;
+  std::vector<std::string> strings{
+    "banana", "metric", "imperial", "abc", "daïs", "", "fire", "def", "cudf", "xyzw"};
+  cudf::test::strings_column_wrapper c0_unsliced(strings.begin(), strings.end());
+  auto c0 = cudf::slice(c0_unsliced, {3, 3 + slice_size});
+
+  table_view t({c0});
+  auto result = cudf::row_bit_count(t);
+
+  // expect 1 offset (4 bytes) + length of string per row
+  auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) {
+    return (static_cast<size_type>(strings[i].size()) + sizeof(offset_type)) * CHAR_BIT;
+  });
+  cudf::test::fixed_width_column_wrapper<size_type> expected(size_iter + 3,
+                                                             size_iter + 3 + slice_size);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(RowBitCount, SlicedColumnsLists)
+{
+  auto const slice_size = 2;
+  cudf::test::lists_column_wrapper<cudf::string_view> c0_unsliced{
+    {{"banana", "v"}, {"cats"}},
+    {{"dogs", "yay"}, {"xyz", ""}, {"daïs"}},
+    {{"fast", "parrot"}, {"orange"}},
+    {{"blue"}, {"red", "yellow"}, {"ultraviolet", "", "green"}}};
+  auto c0 = cudf::slice(c0_unsliced, {1, 1 + slice_size});
+
+  table_view t({c0});
+  auto result = cudf::row_bit_count(t);
+
+  cudf::test::fixed_width_column_wrapper<size_type> expected{408, 320};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(RowBitCount, SlicedColumnsStructs)
+{
+  auto const slice_size = 7;
+
+  cudf::test::fixed_width_column_wrapper<int16_t> c0{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  std::vector<std::string> strings{
+    "banana", "metric", "imperial", "abc", "daïs", "", "fire", "def", "cudf", "xyzw"};
+  cudf::test::strings_column_wrapper c1(strings.begin(), strings.end());
+
+  auto struct_col_unsliced = cudf::test::structs_column_wrapper({c0, c1});
+  auto struct_col          = cudf::slice(struct_col_unsliced, {3, 3 + slice_size});
+
+  table_view t({struct_col});
+  auto result = cudf::row_bit_count(t);
+
+  // expect 1 offset (4 bytes) + length of string per row + 1 int16_t per row
+  auto size_iter = cudf::detail::make_counting_transform_iterator(0, [&strings](int i) {
+    return (static_cast<size_type>(strings[i].size()) + sizeof(offset_type) + sizeof(int16_t)) *
+           CHAR_BIT;
+  });
+  cudf::test::fixed_width_column_wrapper<size_type> expected(size_iter + 3,
+                                                             size_iter + 3 + slice_size);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(RowBitCount, EmptyTable)
+{
+  {
+    cudf::table_view empty;
+    auto result = cudf::row_bit_count(empty);
+    CUDF_EXPECTS(result != nullptr && result->size() == 0, "Expected an empty column");
+  }
+
+  {
+    auto strings = cudf::strings::detail::make_empty_strings_column(0);
+    auto ints    = cudf::make_empty_column(data_type{type_id::INT32});
+    cudf::table_view empty({*strings, *ints});
+
+    auto result = cudf::row_bit_count(empty);
+    CUDF_EXPECTS(result != nullptr && result->size() == 0, "Expected an empty column");
+  }
+}
\ No newline at end of file
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index cea66eced11..78a67464654 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -694,12 +694,13 @@ struct column_view_printer {
       get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent +
       "Length : " + std::to_string(lcv.size()) + "\n" + indent +
       "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" +
-      (lcv.has_nulls() ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" +
-                           detail::to_string(bitmask_to_host(col), col.size(), indent) + "\n"
-                       : "") +
-      indent + "Children :\n" +
-      (child.type().id() != type_id::LIST && child.has_nulls()
-         ? indent + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
+      (lcv.parent().nullable()
+         ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" +
+             detail::to_string(bitmask_to_host(col), col.size(), indent) + "\n"
+         : "") +
+      // non-nested types don't typically display their null masks, so do it here for convenience.
+      (!is_nested(child.type()) && child.nullable()
+         ? "   " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
          : "") +
       (detail::to_string(child, ", ", indent + "   ")) + "\n";
 
@@ -718,18 +719,25 @@ struct column_view_printer {
 
     out_stream << get_nested_type_str(col) << ":\n"
                << indent << "Length : " << view.size() << ":\n";
-    if (view.has_nulls()) {
+    if (view.nullable()) {
       out_stream << indent << "Null count: " << view.null_count() << "\n"
                  << detail::to_string(bitmask_to_host(col), col.size(), indent) << "\n";
     }
 
     auto iter = thrust::make_counting_iterator(0);
-    std::transform(iter,
-                   iter + view.num_children(),
-                   std::ostream_iterator<std::string>(out_stream, "\n"),
-                   [&](size_type index) {
-                     return detail::to_string(view.get_sliced_child(index), ", ", indent + "    ");
-                   });
+    std::transform(
+      iter,
+      iter + view.num_children(),
+      std::ostream_iterator<std::string>(out_stream, "\n"),
+      [&](size_type index) {
+        auto child = view.get_sliced_child(index);
+
+        // non-nested types don't typically display their null masks, so do it here for convenience.
+        return (!is_nested(child.type()) && child.nullable()
+                  ? "   " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
+                  : "") +
+               detail::to_string(child, ", ", indent + "   ");
+      });
 
     out.push_back(out_stream.str());
   }

From 563edfa566e56073f57acc737f910da9c1de0246 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Mon, 29 Mar 2021 20:11:20 -0400
Subject: [PATCH 15/20] Join APIs that return gathermaps (#7454)

Closes #6480

# C++ changes

## TL;DR

* Adds join APIs that accept join keys and return gathermaps
* Return type is a `unique_ptr<rmm::device_uvector<size_type>>` (rather than a `unique_ptr<column>`), to accommodate join results that can be larger than `INT32_MAX` rows
* Simplifies previous join APIs to not accept arguments relating to "common columns" -- instead, those APIs always return all the columns from the LHS/RHS. Users wanting finer control can use the gathermap-based APIs

## The problem

The work in this PR was motivated by the need for simpler join APIs that give the user more flexibility in how they want to construct the result of a join. To explain the current problem, consider the `inner_join` API:

```c++
std::unique_ptr<cudf::table> inner_join(
  cudf::table_view const& left,
  cudf::table_view const& right,
  std::vector<cudf::size_type> const& left_on,
  std::vector<cudf::size_type> const& right_on,
  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
  null_equality compare_nulls         = null_equality::EQUAL,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
```

In addition to the left and right tables (and corresponding keys), the API also accepts a `columns_in_common` argument. This argument specifies pairs of columns from the LHS and RHS respectively, for which only a single column should appear in the result. That single column appears on the "left" side of the result. This makes the API somewhat complicated as well as inflexible.

There is a "lower-level" join API that gives more control on which side the "common" columns should go, by providing an additional `common_columns_output_side` argument:

```c++
  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> inner_join(
    cudf::table_view const& probe,
    std::vector<size_type> const& probe_on,
    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
    common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE,
    null_equality compare_nulls                           = null_equality::EQUAL,
    rmm::cuda_stream_view stream                          = rmm::cuda_stream_default,
    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
```

But even that offers only limited flexibility: for example, it doesn't allow the user to specify an arbitrary ordering of result columns, or omit columns altogether from the result.

## Proposed API

The proposed API in this PR is:

```c++
std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
          std::unique_ptr<rmm::device_uvector<size_type>>>
inner_join(cudf::table_view const& left_keys,
           cudf::table_view const& right_keys,
           null_equality compare_nulls         = null_equality::EQUAL,
           rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
```

Note:

* Rather than requiring the full left and right tables of the join, this API only needs the key columns from the left and right tables.
* Rather than constructing the result of the join, this API returns the gathermaps which can be used to construct it.
* For outer join, non-matches are represented by out-of-bound values in the gathermap. In conjunction with the `out_of_bounds_policy::NULLIFY` argument to `gather`, this will produce nulls in the appropriate locations of the result table.
* The API returns a `std::unique_ptr<rmm::device_uvector<size_type>>` rather than just `rmm::device_uvector<size_type>` because of a Cython limitation that prevents wrapping functions whose return types do not provide a nullary (default) constructor.
* The use of `rmm::device_uvector` allows the API to return results of size > `INT32_MAX`, which can occur easily in outer joins.


# Python changes

## TL;DR

* Add Cython bindings for the new C++ APIs
* Rework join internals to interface with the new Cython APIs

## Changes/Improvements

### _Indexer

One major change introduced in the join internals is the use of a new type `_Indexer` to represent a key column.

Previously, join keys were represented by a numeric offset. This was for two reasons:

* A join key could be either an index column or a data column, and the only way to refer to it unambiguously was by its offset -- a DataFrame can have an index column and a data column with the same name.
* The C++ API required numeric offsets for the `left_on` and `right_on` arguments

`_Indexer` provides a more convenient way to construct and represent join keys by allowing one to refer unambiguously to an index or data column of a `Frame`:

```
    # >>> df
    #    a
    # b
    # 4  1
    # 5  2
    # 6  3
    # >>> _Indexer("a", column=True).get(df)  # returns column "a" of df
    # >>> _Indexer("b", index=True).get(df)  # returns index level "b" of df
```

### Casting logic

Some of the casting logic has been simplified since we no longer need to post-process (cast) the result returned by libcudf. Previously, we were accounting for `"right"` joins in our casting functions. But, since a right join is implemented in terms of a left join with the operands reversed, it turns out we never really needed to handle right joins separately. I have removed that and it simplifies casting logic further.

### Others

* Renamed `casting_logic.py` to `_join_helpers.py` and included other join utilities there.
* Added a subclass of `Merge` for handling semi/anti joins
* Added an `assert_join_results_equal` helper to compare join results between Pandas and cuDF. libcudf can return join results with arbitrary row ordering, and we weren't accounting for that in some of our tests previously. I'm a bit surprised we never ran into any test failures :)

Authors:
  - Ashwin Srinath (@shwina)
  - Vyas Ramasubramani (@vyasr)

Approvers:
  - Jake Hemstad (@jrhemstad)
  - Keith Kraus (@kkraus14)
  - Mike Wilson (@hyperbolic2346)
  - @brandon-b-miller
  - Mark Harris (@harrism)

URL: https://github.com/rapidsai/cudf/pull/7454
---
 cpp/benchmarks/join/join_benchmark.cu         |   8 +-
 cpp/include/cudf/join.hpp                     | 440 ++++++----
 cpp/include/cudf/table/table_view.hpp         |   5 +
 cpp/src/copying/gather.cu                     |   4 +-
 cpp/src/join/hash_join.cu                     | 499 +++--------
 cpp/src/join/hash_join.cuh                    | 143 ++--
 cpp/src/join/join.cu                          | 339 +++++---
 cpp/src/join/join_common_utils.hpp            |  15 +-
 cpp/src/join/semi_join.cu                     | 194 +++--
 cpp/tests/join/join_tests.cpp                 | 726 +++++++---------
 cpp/tests/join/semi_join_tests.cpp            | 807 +-----------------
 python/cudf/cudf/_lib/copying.pyx             |  12 +-
 python/cudf/cudf/_lib/cpp/join.pxd            |  56 +-
 .../cudf/cudf/_lib/cpp/table/table_view.pxd   |   1 +
 python/cudf/cudf/_lib/join.pyx                | 272 ++----
 python/cudf/cudf/core/column/categorical.py   |   3 +
 python/cudf/cudf/core/column/column.py        |  16 +-
 python/cudf/cudf/core/column/numerical.py     |   4 +-
 python/cudf/cudf/core/dataframe.py            |   9 +-
 python/cudf/cudf/core/frame.py                | 187 +---
 python/cudf/cudf/core/index.py                |   8 +
 python/cudf/cudf/core/join/__init__.py        |   2 +-
 python/cudf/cudf/core/join/_join_helpers.py   | 203 +++++
 python/cudf/cudf/core/join/casting_logic.py   | 207 -----
 python/cudf/cudf/core/join/join.py            | 638 +++++++-------
 python/cudf/cudf/core/multiindex.py           |  22 +-
 python/cudf/cudf/core/series.py               |  19 +-
 python/cudf/cudf/tests/test_joining.py        | 189 ++--
 python/cudf/cudf/tests/test_string.py         |  40 +-
 29 files changed, 1998 insertions(+), 3070 deletions(-)
 create mode 100644 python/cudf/cudf/core/join/_join_helpers.py
 delete mode 100644 python/cudf/cudf/core/join/casting_logic.py

diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu
index bd013afc451..fa6afdd908c 100644
--- a/cpp/benchmarks/join/join_benchmark.cu
+++ b/cpp/benchmarks/join/join_benchmark.cu
@@ -105,12 +105,8 @@ static void BM_join(benchmark::State &state)
   for (auto _ : state) {
     cuda_event_timer raii(state, true, 0);
 
-    auto result = cudf::inner_join(probe_table,
-                                   build_table,
-                                   columns_to_join,
-                                   columns_to_join,
-                                   {{0, 0}},
-                                   cudf::null_equality::UNEQUAL);
+    auto result = cudf::inner_join(
+      probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL);
   }
 }
 
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index b2c1296ccef..fcc0bcd444e 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -20,6 +20,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <vector>
 
@@ -30,6 +31,44 @@ namespace cudf {
  * @file
  */
 
+/**
+ * @brief Returns a pair of row index vectors corresponding to an
+ * inner join between the specified tables.
+ *
+ * The first returned vector contains the row indices from the left
+ * table that have a match in the right table (in unspecified order).
+ * The corresponding values in the second returned vector are
+ * the matched row indices from the right table.
+ *
+ * @code{.pseudo}
+ *     Left: {{0, 1, 2}}
+ *     Right: {{1, 2, 3}}
+ *     Result: {{1, 2}, {0, 1}}
+ *
+ *     Left: {{0, 1, 2}, {3, 4, 5}}
+ *     Right: {{1, 2, 3}, {4, 6, 7}}
+ *     Result: {{1}, {0}}
+ * @endcode
+ * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys`
+ * mismatch.
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing an inner join between two tables with `left_keys` and `right_keys`
+ * as the join keys.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+inner_join(cudf::table_view const& left_keys,
+           cudf::table_view const& right_keys,
+           null_equality compare_nulls         = null_equality::EQUAL,
+           rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs an inner join on the specified columns of two
  * tables (`left`, `right`)
@@ -38,26 +77,13 @@ namespace cudf {
  * in the columns being joined on match.
  *
  * @code{.pseudo}
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, a: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{4, 9, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          columns_in_common: { {0, 1} }
- * Result: { a: {1, 2}, b: {1, 2} }
- *
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
- *          left_on: {0}
- *          right_on: {0}
- *          columns_in_common: { }
- * Result: { a: {1, 2}, b: {1, 2}, c: {1, 2} }
+ * Result: {{1, 2}, {4, 9}, {1, 2}}
  * @endcode
  *
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`.
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) such that the location of `L` within `left_on` is not equal to
- * location of R within `right_on`
  * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
  * mismatch.
  * @throw cudf::logic_error if number of columns in either `left` or `right`
@@ -73,59 +99,83 @@ namespace cudf {
  * @param[in] right_on The column indices from `right` to join on.
  * The column from `right` indicated by `right_on[i]` will be compared against the column
  * from `left` indicated by `left_on[i]`.
- * @param[in] columns_in_common is a vector of pairs of column indices into
- * `left` and `right`, respectively, that are "in common". For "common"
- * columns, only a single output column will be produced, which is gathered
- * from `left_on` columns. Else, for every column in `left_on` and `right_on`,
- * an output column will be produced.  For each of these pairs (L, R), L
- * should exist in `left_on` and R should exist in `right_on`.
  * @param[in] compare_nulls controls whether null join-key values
  * should match or not.
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return Result of joining `left` and `right` tables on the columns
- * specified by `left_on` and `right_on`. The resulting table will be joined columns of
- * `left(including common columns)+right(excluding common columns)`.
+ * specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> inner_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a pair of row index vectors corresponding to a
+ * left join between the specified tables.
+ *
+ * The first returned vector contains all the row indices from the left
+ * table (in unspecified order). The corresponding value in the
+ * second returned vector is either (1) the row index of the matched row
+ * from the right table, if there is a match  or  (2) an unspecified
+ * out-of-bounds value.
+ *
+ * @code{.pseudo}
+ *     Left: {{0, 1, 2}}
+ *     Right: {{1, 2, 3}}
+ *     Result: {{0, 1, 2}, {None, 0, 1}}
+ *
+ *     Left: {{0, 1, 2}, {3, 4, 5}}
+ *     Right: {{1, 2, 3}, {4, 6, 7}}
+ *     Result: {{0, 1, 2}, {None, 0, None}}
+ * @endcode
+ * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys`
+ * mismatch.
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a left join between two tables with `left_keys` and `right_keys`
+ * as the join keys.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+left_join(cudf::table_view const& left_keys,
+          cudf::table_view const& right_keys,
+          null_equality compare_nulls         = null_equality::EQUAL,
+          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a left join (also known as left outer join) on the
  * specified columns of two tables (`left`, `right`)
  *
- * Left Join returns all the rows from the left table and those rows from the
+ * Left join returns all the rows from the left table and those rows from the
  * right table that match on the joined columns.
  * For rows from the right table that do not have a match, the corresponding
  * values in the left columns will be null.
  *
  * @code{.pseudo}
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, a: {1 ,2 ,5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          columns_in_common: { {0, 1} }
- * Result: { a: {0, 1, 2}, b: {NULL, 1, 2} }
+ * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} }
  *
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {0}
- *          columns_in_common: { }
- * Result: { a: {0, 1, 2}, b: {NULL, 1, 2}, c: {NULL, 1, 2} }
+ * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} }
  * @endcode
  *
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`.
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) such that the location of `L` within `left_on` is not equal to
- * location of R within `right_on`
  * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
  * mismatch.
  * @throw cudf::logic_error if number of columns in either `left` or `right`
@@ -141,29 +191,59 @@ std::unique_ptr<cudf::table> inner_join(
  * @param[in] right_on The column indices from `right` to join on.
  * The column from `right` indicated by `right_on[i]` will be compared against the column
  * from `left` indicated by `left_on[i]`.
- * @param[in] columns_in_common is a vector of pairs of column indices into
- * `left` and `right`, respectively, that are "in common". For "common"
- * columns, only a single output column will be produced, which is gathered
- * from `left_on` columns. Else, for every column in `left_on` and `right_on`,
- * an output column will be produced.  For each of these pairs (L, R), L
- * should exist in `left_on` and R should exist in `right_on`.
  * @param[in] compare_nulls controls whether null join-key values
  * should match or not.
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return Result of joining `left` and `right` tables on the columns
- * specified by `left_on` and `right_on`. The resulting table will be joined columns of
- * `left(including common columns)+right(excluding common columns)`.
+ * specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> left_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a pair of row index vectors corresponding to a
+ * full join between the specified tables.
+ *
+ * Taken pairwise, the values from the returned vectors are one of:
+ * (1) row indices corresponding to matching rows from the left and
+ * right tables, (2) a row index and an unspecified out-of-bounds value,
+ * representing a row from one table without a match in the other.
+ *
+ * @code{.pseudo}
+ *     Left: {{0, 1, 2}}
+ *     Right: {{1, 2, 3}}
+ *     Result: {{0, 1, 2, None}, {None, 0, 1, 2}}
+ *
+ *     Left: {{0, 1, 2}, {3, 4, 5}}
+ *     Right: {{1, 2, 3}, {4, 6, 7}}
+ *     Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
+ * @endcode
+ * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys`
+ * mismatch.
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a full join between two tables with `left_keys` and `right_keys`
+ * as the join keys.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+full_join(cudf::table_view const& left_keys,
+          cudf::table_view const& right_keys,
+          null_equality compare_nulls         = null_equality::EQUAL,
+          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a full join (also known as full outer join) on the
  * specified columns of two tables (`left`, `right`)
@@ -174,26 +254,19 @@ std::unique_ptr<cudf::table> left_join(
  * values in the left columns will be null.
  *
  * @code{.pseudo}
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          columns_in_common: { {0, 1} }
- * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} }
+ * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} }
  *
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {0}
- *          columns_in_common: { }
- * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} }
+ * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} }
  * @endcode
  *
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`.
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) such that the location of `L` within `left_on` is not equal to
- * location of R within `right_on`
  * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
  * mismatch.
  * @throw cudf::logic_error if number of columns in either `left` or `right`
@@ -209,28 +282,54 @@ std::unique_ptr<cudf::table> left_join(
  * @param[in] right_on The column indices from `right` to join on.
  * The column from `right` indicated by `right_on[i]` will be compared against the column
  * from `left` indicated by `left_on[i]`.
- * @param[in] columns_in_common is a vector of pairs of column indices into
- * `left` and `right`, respectively, that are "in common". For "common"
- * columns, only a single output column will be produced, which is gathered
- * from `left_on` columns. Else, for every column in `left_on` and `right_on`,
- * an output column will be produced.  For each of these pairs (L, R), L
- * should exist in `left_on` and R should exist in `right_on`.
  * @param[in] compare_nulls controls whether null join-key values
  * should match or not.
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return Result of joining `left` and `right` tables on the columns
- * specified by `left_on` and `right_on`. The resulting table will be joined columns of
- * `left(including common columns)+right(excluding common columns)`.
+ * specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> full_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a vector of row indices corresponding to a left semi join
+ * between the specified tables.
+ *
+ * The returned vector contains the row indices from the left table
+ * for which there is a matching row in the right table.
+ *
+ * @code{.pseudo}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3}}
+ *          right_on: {1}
+ * Result: {1, 2}
+ * @endcode
+ *
+ * @throw cudf::logic_error if number of columns in either
+ * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A vector `left_indices` that can be used to construct
+ * the result of performing a left semi join between two tables with
+ * `left_keys` and `right_keys` as the join keys.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
+  cudf::table_view const& left_keys,
+  cudf::table_view const& right_keys,
+  null_equality compare_nulls         = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a left semi join on the specified columns of two
  * tables (`left`, `right`)
@@ -239,24 +338,20 @@ std::unique_ptr<cudf::table> full_join(
  * returns rows that exist in the right table.
  *
  * @code{.pseudo}
- *          TableA a: {0, 1, 2}
- *          TableB b: {1, 2, 3}, a: {1, 2, 5}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          return_columns: { 0 }
- * Result: { a: {1, 2} }
+ * Result: { {1, 2} }
  *
- *          TableA a: {0, 1, 2}, c: {1, 2, 5}
- *          TableB b: {1, 2, 3}
+ *          TableA {{0, 1, 2}, {1, 2, 5}}
+ *          TableB {{1, 2, 3}}
  *          left_on: {0}
  *          right_on: {0}
- *          return_columns: { 1 }
- * Result: { c: {1, 2} }
+ * Result: { {1, 2}, {2, 5} }
  * @endcode
  *
- * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0
- * @throw cudf::logic_error if the number of returned columns is 0
- * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal
+ * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0
  *
  * @param[in] left             The left table
  * @param[in] right            The right table
@@ -268,22 +363,49 @@ std::unique_ptr<cudf::table> full_join(
  *                             The column from `right` indicated by `right_on[i]`
  *                             will be compared against the column from `left`
  *                             indicated by `left_on[i]`.
- * @param[in] return_columns   A vector of column indices from `left` to
- *                             include in the returned table.
  * @param[in] compare_nulls    Controls whether null join-key values should match or not.
  * @param[in] mr               Device memory resource used to allocate the returned table's
  *                             device memory
  *
  * @return                     Result of joining `left` and `right` tables on the columns
- *                             specified by `left_on` and `right_on`. The resulting table
- *                             will contain `return_columns` from `left` that match in right.
+ *                             specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> left_semi_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<cudf::size_type> const& return_columns,
+  null_equality compare_nulls         = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a vector of row indices corresponding to a left anti join
+ * between the specified tables.
+ *
+ * The returned vector contains the row indices from the left table
+ * for which there is no matching row in the right table.
+ *
+ * @code{.pseudo}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3}}
+ * Result: {0}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A vector `left_indices` that can be used to construct
+ * the result of performing a left anti join between two tables with
+ * `left_keys` and `right_keys` as the join keys.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
+  cudf::table_view const& left_keys,
+  cudf::table_view const& right_keys,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -295,24 +417,23 @@ std::unique_ptr<cudf::table> left_semi_join(
  * returns rows that do not exist in the right table.
  *
  * @code{.pseudo}
- *          TableA a: {0, 1, 2}
- *          TableB b: {1, 2, 3}, a: {1, 2, 5}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3},  {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          return_columns: { 0 }
- * Result: { a: {0} }
+ * Result: {{0}}
  *
- *          TableA a: {0, 1, 2}, c: {1, 2, 5}
- *          TableB b: {1, 2, 3}
+ *          TableA: {{0, 1, 2}, {1, 2, 5}}
+ *          TableB: {{1, 2, 3}}
  *          left_on: {0}
  *          right_on: {0}
- *          return_columns: { 1 }
- * Result: { c: {1} }
+ * Result: {{0}, {1}}
  * @endcode
  *
- * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0
- * @throw cudf::logic_error if the number of returned columns is 0
- * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal
+ * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
+ * mismatch.
+ * @throw cudf::logic_error if number of columns in either `left` or `right`
+ * table is 0 or exceeds MAX_JOIN_SIZE
  *
  * @param[in] left             The left table
  * @param[in] right            The right table
@@ -324,22 +445,18 @@ std::unique_ptr<cudf::table> left_semi_join(
  *                             The column from `right` indicated by `right_on[i]`
  *                             will be compared against the column from `left`
  *                             indicated by `left_on[i]`.
- * @param[in] return_columns   A vector of column indices from `left` to
- *                             include in the returned table.
  * @param[in] compare_nulls    Controls whether null join-key values should match or not.
  * @param[in] mr               Device memory resource used to allocate the returned table's
  *                             device memory
  *
  * @return                     Result of joining `left` and `right` tables on the columns
- *                             specified by `left_on` and `right_on`. The resulting table
- *                             will contain `return_columns` from `left` that match in right.
+ *                             specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> left_anti_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<cudf::size_type> const& return_columns,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -393,128 +510,75 @@ class hash_join {
    * undefined.
    *
    * @param build The build table, from which the hash table is built.
-   * @param build_on The column indices from `build` to join on.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param stream CUDA stream used for device memory operations and kernel launches
    */
   hash_join(cudf::table_view const& build,
-            std::vector<size_type> const& build_on,
             null_equality compare_nulls,
             rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
   /**
-   * @brief Controls where common columns will be output for a inner join.
-   */
-  enum class common_columns_output_side {
-    PROBE,  ///< Common columns is output in the probe portion of the table pair returned by
-            ///< `inner_join`.
-    BUILD   ///< Common columns is output in the build portion of the table pair returned by
-            ///< `inner_join`.
-  };
-
-  /**
-   * @brief Performs an inner join by probing in the internal hash table.
-   *
-   * Given that it is sometimes desired to choose the small table to be the `build` side for an
-   * inner join，a (`probe`, `build`) table pair, which contains the probe and build portions of the
-   * logical joined table respectively, is returned so that caller can freely rearrange them to
-   * restore the logical `left` `right` order. This introduces some extra logic about where "common"
-   * columns should go, i.e. the legacy `cudf::inner_join()` API always outputs "common" columns in
-   * the `left` portion and the corresponding columns in the `right` portion are omitted. To better
-   * align with the legacy `cudf::inner_join()` API, a `common_columns_output_side` parameter is
-   * introduced to specify whether "common" columns should go in `probe` or `build` portion.
-   *
-   * More details please @see cudf::inner_join().
+   * Returns the row indices that can be used to construct the result of performing
+   * an inner join between two tables. @see cudf::inner_join().
    *
    * @param probe The probe table, from which the tuples are probed.
-   * @param probe_on The column indices from `probe` to join on.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns or `build_on` columns if `probe_output_side` is LEFT or RIGHT.
-   * Else, for every column in `probe_on` and `build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `build_on`.
-   * @param common_columns_output_side @see `common_columns_output_side`.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Table pair of (`probe`, `build`) of joining both tables on the columns
-   * specified by `probe_on` and `build_on`. The resulting table pair will be joined columns of
-   * (`probe(including common columns)`, `build(excluding common columns)`) if
-   * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`,
-   * `build(including common columns)`) if `common_columns_output_side` is `BUILD`.
+   * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+   * the result of performing an inner join between two tables with `build` and `probe`
+   * as the join keys.
    */
-  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> inner_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE,
-    null_equality compare_nulls                           = null_equality::EQUAL,
-    rmm::cuda_stream_view stream                          = rmm::cuda_stream_default,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  inner_join(cudf::table_view const& probe,
+             null_equality compare_nulls         = null_equality::EQUAL,
+             rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+             rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
   /**
-   * @brief Performs a left join by probing in the internal hash table.
-   *
-   * More details please @see cudf::left_join().
+   * Returns the row indices that can be used to construct the result of performing
+   * a left join between two tables. @see cudf::left_join().
    *
    * @param probe The probe table, from which the tuples are probed.
-   * @param probe_on The column indices from `probe` to join on.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `build_on`.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Result of joining `build` and `probe` tables on the columns
-   * specified by `build_on` and `probe_on`. The resulting table will be joined columns of
-   * `probe(including common columns)+build(excluding common columns)`.
+   * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+   * the result of performing a left join between two tables with `build` and `probe`
+   * as the join keys.
    */
-  std::unique_ptr<cudf::table> left_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls         = null_equality::EQUAL,
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  left_join(cudf::table_view const& probe,
+            null_equality compare_nulls         = null_equality::EQUAL,
+            rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+            rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
   /**
-   * @brief Performs a full join by probing in the internal hash table.
-   *
-   * More details please @see cudf::full_join().
+   * Returns the row indices that can be used to construct the result of performing
+   * a full join between two tables. @see cudf::full_join().
    *
    * @param probe The probe table, from which the tuples are probed.
-   * @param probe_on The column indices from `probe` to join on.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `build_on`.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Result of joining `build` and `probe` tables on the columns
-   * specified by `build_on` and `probe_on`. The resulting table will be joined columns of
-   * `probe(including common columns)+build(excluding common columns)`.
+   * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+   * the result of performing a full join between two tables with `build` and `probe`
+   * as the join keys.
    */
-  std::unique_ptr<cudf::table> full_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls         = null_equality::EQUAL,
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  full_join(cudf::table_view const& probe,
+            null_equality compare_nulls         = null_equality::EQUAL,
+            rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+            rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
  private:
   struct hash_join_impl;
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index 5cdecab9115..a225e590f9a 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -126,6 +126,11 @@ class table_view_base {
    */
   size_type num_rows() const noexcept { return _num_rows; }
 
+  /**
+   * @brief Returns true if `num_columns()` returns zero, or false otherwise
+   */
+  size_type is_empty() const noexcept { return num_columns() == 0; }
+
   table_view_base() = default;
 
   ~table_view_base() = default;
diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
index dc153e9395d..181752d18e8 100644
--- a/cpp/src/copying/gather.cu
+++ b/cpp/src/copying/gather.cu
@@ -43,9 +43,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
 
   if (neg_indices == negative_index_policy::ALLOWED) {
     cudf::size_type n_rows = source_table.num_rows();
-    auto idx_converter     = [n_rows] __device__(size_type in) {
-      return ((in % n_rows) + n_rows) % n_rows;
-    };
+    auto idx_converter = [n_rows] __device__(size_type in) { return in < 0 ? in + n_rows : in; };
     return gather(source_table,
                   thrust::make_transform_iterator(map_begin, idx_converter),
                   thrust::make_transform_iterator(map_end, idx_converter),
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index d827d03a6c0..5a6ad8892de 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <thrust/uninitialized_fill.h>
 #include <join/hash_join.cuh>
 
 #include <cudf/detail/concatenate.cuh>
@@ -20,93 +21,44 @@
 #include <cudf/detail/gather.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <iostream>
 #include <numeric>
 
 namespace cudf {
 namespace detail {
 
-/**
- * @brief Returns a vector with non-common indices which is set difference
- * between `[0, num_columns)` and index values in common_column_indices
- *
- * @param num_columns The number of columns, which represents column indices
- * from `[0, num_columns)` in a table
- * @param common_column_indices A vector of common indices which needs to be
- * excluded from `[0, num_columns)`
- *
- * @return vector A vector containing only the indices which are not present in
- * `common_column_indices`
- */
-auto non_common_column_indices(size_type num_columns,
-                               std::vector<size_type> const &common_column_indices)
-{
-  CUDF_EXPECTS(common_column_indices.size() <= static_cast<uint64_t>(num_columns),
-               "Too many columns in common");
-  std::vector<size_type> all_column_indices(num_columns);
-  std::iota(std::begin(all_column_indices), std::end(all_column_indices), 0);
-  std::vector<size_type> sorted_common_column_indices{common_column_indices};
-  std::sort(std::begin(sorted_common_column_indices), std::end(sorted_common_column_indices));
-  std::vector<size_type> non_common_column_indices(num_columns - common_column_indices.size());
-  std::set_difference(std::cbegin(all_column_indices),
-                      std::cend(all_column_indices),
-                      std::cbegin(sorted_common_column_indices),
-                      std::cend(sorted_common_column_indices),
-                      std::begin(non_common_column_indices));
-  return non_common_column_indices;
-}
-
 std::pair<std::unique_ptr<table>, std::unique_ptr<table>> get_empty_joined_table(
-  table_view const &probe,
-  table_view const &build,
-  std::vector<std::pair<size_type, size_type>> const &columns_in_common,
-  cudf::hash_join::common_columns_output_side common_columns_output_side)
+  table_view const &probe, table_view const &build)
 {
-  std::vector<size_type> columns_to_exclude(columns_in_common.size());
-  std::transform(columns_in_common.begin(),
-                 columns_in_common.end(),
-                 columns_to_exclude.begin(),
-                 [common_columns_output_side](auto &col) {
-                   return common_columns_output_side == hash_join::common_columns_output_side::PROBE
-                            ? col.second
-                            : col.first;
-                 });
-  std::vector<size_type> non_common_indices = non_common_column_indices(
-    common_columns_output_side == hash_join::common_columns_output_side::PROBE
-      ? build.num_columns()
-      : probe.num_columns(),
-    columns_to_exclude);
   std::unique_ptr<table> empty_probe = empty_like(probe);
   std::unique_ptr<table> empty_build = empty_like(build);
-  if (common_columns_output_side == hash_join::common_columns_output_side::PROBE) {
-    table_view empty_build_view = empty_build->select(non_common_indices);
-    empty_build                 = std::make_unique<table>(empty_build_view);
-  } else {
-    table_view empty_probe_view = empty_probe->select(non_common_indices);
-    empty_probe                 = std::make_unique<table>(empty_probe_view);
-  }
   return std::make_pair(std::move(empty_probe), std::move(empty_build));
 }
 
-VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b)
+VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS((a.first.size() == a.second.size()),
+  CUDF_EXPECTS((a.first->size() == a.second->size()),
                "Mismatch between sizes of vectors in vector pair");
-  CUDF_EXPECTS((b.first.size() == b.second.size()),
+  CUDF_EXPECTS((b.first->size() == b.second->size()),
                "Mismatch between sizes of vectors in vector pair");
-  if (a.first.empty()) {
-    return b;
-  } else if (b.first.empty()) {
-    return a;
+  if (a.first->is_empty()) {
+    return std::move(b);
+  } else if (b.first->is_empty()) {
+    return std::move(a);
   }
-  auto original_size = a.first.size();
-  a.first.resize(a.first.size() + b.first.size());
-  a.second.resize(a.second.size() + b.second.size());
-  thrust::copy(b.first.begin(), b.first.end(), a.first.begin() + original_size);
-  thrust::copy(b.second.begin(), b.second.end(), a.second.begin() + original_size);
-  return a;
+  auto original_size = a.first->size();
+  a.first->resize(a.first->size() + b.first->size(), stream);
+  a.second->resize(a.second->size() + b.second->size(), stream);
+  thrust::copy(
+    rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size);
+  thrust::copy(rmm::exec_policy(stream),
+               b.second->begin(),
+               b.second->end(),
+               a.second->begin() + original_size);
+  return std::move(a);
 }
 
 template <typename T>
@@ -133,16 +85,20 @@ struct valid_range {
  *
  * @return Pair of vectors containing the left join indices complement
  */
-std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>
-get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
-                                 size_type left_table_row_count,
-                                 size_type right_table_row_count,
-                                 rmm::cuda_stream_view stream)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+get_left_join_indices_complement(
+  std::unique_ptr<rmm::device_uvector<size_type>> &right_indices,
+  size_type left_table_row_count,
+  size_type right_table_row_count,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
 {
   // Get array of indices that do not appear in right_indices
 
   // Vector allocated for unmatched result
-  rmm::device_vector<size_type> right_indices_complement(right_table_row_count);
+  auto right_indices_complement =
+    std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream);
 
   // If left table is empty in a full join call then all rows of the right table
   // should be represented in the joined indices. This is an optimization since
@@ -151,12 +107,16 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
   // produce exactly the same result as the else path but will be faster.
   if (left_table_row_count == 0) {
     thrust::sequence(rmm::exec_policy(stream),
-                     right_indices_complement.begin(),
-                     right_indices_complement.end(),
+                     right_indices_complement->begin(),
+                     right_indices_complement->end(),
                      0);
   } else {
     // Assume all the indices in invalid_index_map are invalid
-    rmm::device_vector<size_type> invalid_index_map(right_table_row_count, 1);
+    auto invalid_index_map =
+      std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream);
+    thrust::uninitialized_fill(
+      rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1});
+
     // Functor to check for index validity since left joins can create invalid indices
     valid_range<size_type> valid(0, right_table_row_count);
 
@@ -164,11 +124,11 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
     // Thus specifying that those locations are valid
     thrust::scatter_if(rmm::exec_policy(stream),
                        thrust::make_constant_iterator(0),
-                       thrust::make_constant_iterator(0) + right_indices.size(),
-                       right_indices.begin(),      // Index locations
-                       right_indices.begin(),      // Stencil - Check if index location is valid
-                       invalid_index_map.begin(),  // Output indices
-                       valid);                     // Stencil Predicate
+                       thrust::make_constant_iterator(0) + right_indices->size(),
+                       right_indices->begin(),      // Index locations
+                       right_indices->begin(),      // Stencil - Check if index location is valid
+                       invalid_index_map->begin(),  // Output indices
+                       valid);                      // Stencil Predicate
     size_type begin_counter = static_cast<size_type>(0);
     size_type end_counter   = static_cast<size_type>(right_table_row_count);
 
@@ -176,15 +136,19 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
     size_type indices_count = thrust::copy_if(rmm::exec_policy(stream),
                                               thrust::make_counting_iterator(begin_counter),
                                               thrust::make_counting_iterator(end_counter),
-                                              invalid_index_map.begin(),
-                                              right_indices_complement.begin(),
+                                              invalid_index_map->begin(),
+                                              right_indices_complement->begin(),
                                               thrust::identity<size_type>()) -
-                              right_indices_complement.begin();
-    right_indices_complement.resize(indices_count);
+                              right_indices_complement->begin();
+    right_indices_complement->resize(indices_count, stream);
   }
 
-  rmm::device_vector<size_type> left_invalid_indices(right_indices_complement.size(),
-                                                     JoinNoneValue);
+  auto left_invalid_indices =
+    std::make_unique<rmm::device_uvector<size_type>>(right_indices_complement->size(), stream);
+  thrust::fill(rmm::exec_policy(stream),
+               left_invalid_indices->begin(),
+               left_invalid_indices->end(),
+               JoinNoneValue);
 
   return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement));
 }
@@ -195,8 +159,6 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
  * @throw cudf::logic_error if the number of columns in `build` table is 0.
  * @throw cudf::logic_error if the number of rows in `build` table is 0.
  * @throw cudf::logic_error if insertion to the hash table fails.
- * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build`
- * table.
  *
  * @param build Table of columns used to build join hash.
  * @param compare_nulls Controls whether null join-key values should match or not.
@@ -256,19 +218,22 @@ std::unique_ptr<multimap_type, std::function<void(multimap_type *)>> build_join_
  * @return Join output indices vector pair.
  */
 template <join_kind JoinKind>
-std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_join_hash_table(
-  cudf::table_device_view build_table,
-  cudf::table_device_view probe_table,
-  multimap_type const &hash_table,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+probe_join_hash_table(cudf::table_device_view build_table,
+                      cudf::table_device_view probe_table,
+                      multimap_type const &hash_table,
+                      null_equality compare_nulls,
+                      rmm::cuda_stream_view stream,
+                      rmm::mr::device_memory_resource *mr)
 {
   size_type estimated_size = estimate_join_output_size<JoinKind, multimap_type>(
     build_table, probe_table, hash_table, compare_nulls, stream);
 
   // If the estimated output size is zero, return immediately
   if (estimated_size == 0) {
-    return std::make_pair(rmm::device_vector<size_type>{}, rmm::device_vector<size_type>{});
+    return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
+                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
   // Because we are approximating the number of joined elements, our approximation
@@ -278,12 +243,13 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
   rmm::device_scalar<size_type> write_index(0, stream);
   size_type join_size{0};
 
-  rmm::device_vector<size_type> left_indices;
-  rmm::device_vector<size_type> right_indices;
+  auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+  auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+
   auto current_estimated_size = estimated_size;
   do {
-    left_indices.resize(estimated_size);
-    right_indices.resize(estimated_size);
+    left_indices->resize(estimated_size, stream);
+    right_indices->resize(estimated_size, stream);
 
     constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE};
     detail::grid_1d config(probe_table.num_rows(), block_size);
@@ -298,8 +264,8 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
         probe_table,
         hash_probe,
         equality,
-        left_indices.data().get(),
-        right_indices.data().get(),
+        left_indices->data(),
+        right_indices->data(),
         write_index.data(),
         estimated_size);
 
@@ -310,179 +276,11 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
     estimated_size *= 2;
   } while ((current_estimated_size < join_size));
 
-  left_indices.resize(join_size);
-  right_indices.resize(join_size);
+  left_indices->resize(join_size, stream);
+  right_indices->resize(join_size, stream);
   return std::make_pair(std::move(left_indices), std::move(right_indices));
 }
 
-/**
- * @brief  Combines the non common probe, common probe, non common build and common build
- * columns in the correct order according to `common_columns_output_side` to form the joined
- * (`probe`, `build`) table pair.
- *
- * @param probe_noncommon_cols Columns obtained by gathering non common probe columns.
- * @param probe_noncommon_col_indices Output locations of non common probe columns in the probe
- * portion.
- * @param probe_common_col_indices Output locations of common probe columns in the probe portion.
- * @param build_noncommon_cols Columns obtained by gathering non common build columns.
- * @param build_noncommon_col_indices Output locations of non common build columns in the build
- * portion.
- * @param build_common_col_indices Output locations of common build columns in the build portion.
- * @param common_cols Columns obtained by gathering common columns from `probe` and `build` tables
- * in the build portion.
- * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side.
- *
- * @return Table pair of (`probe`, `build`).
- */
-std::pair<std::unique_ptr<table>, std::unique_ptr<table>> combine_join_columns(
-  std::vector<std::unique_ptr<column>> &&probe_noncommon_cols,
-  std::vector<size_type> const &probe_noncommon_col_indices,
-  std::vector<size_type> const &probe_common_col_indices,
-  std::vector<std::unique_ptr<column>> &&build_noncommon_cols,
-  std::vector<size_type> const &build_noncommon_col_indices,
-  std::vector<size_type> const &build_common_col_indices,
-  std::vector<std::unique_ptr<column>> &&common_cols,
-  cudf::hash_join::common_columns_output_side common_columns_output_side)
-{
-  if (common_columns_output_side == cudf::hash_join::common_columns_output_side::PROBE) {
-    std::vector<std::unique_ptr<column>> probe_cols(probe_noncommon_cols.size() +
-                                                    common_cols.size());
-    for (size_t i = 0; i < probe_noncommon_cols.size(); ++i) {
-      probe_cols.at(probe_noncommon_col_indices.at(i)) = std::move(probe_noncommon_cols.at(i));
-    }
-    for (size_t i = 0; i < common_cols.size(); ++i) {
-      probe_cols.at(probe_common_col_indices.at(i)) = std::move(common_cols.at(i));
-    }
-    return std::make_pair(std::make_unique<cudf::table>(std::move(probe_cols)),
-                          std::make_unique<cudf::table>(std::move(build_noncommon_cols)));
-  } else {
-    std::vector<std::unique_ptr<column>> build_cols(build_noncommon_cols.size() +
-                                                    common_cols.size());
-    for (size_t i = 0; i < build_noncommon_cols.size(); ++i) {
-      build_cols.at(build_noncommon_col_indices.at(i)) = std::move(build_noncommon_cols.at(i));
-    }
-    for (size_t i = 0; i < common_cols.size(); ++i) {
-      build_cols.at(build_common_col_indices.at(i)) = std::move(common_cols.at(i));
-    }
-    return std::make_pair(std::make_unique<cudf::table>(std::move(probe_noncommon_cols)),
-                          std::make_unique<cudf::table>(std::move(build_cols)));
-  }
-}
-
-/**
- * @brief  Gathers rows from `probe` and `build` table and returns a (`probe`, `build`) table pair,
- * which contains the probe and build portions of the logical joined table respectively.
- *
- * @tparam JoinKind The type of join to be performed
- *
- * @param probe Probe side table
- * @param build build side table
- * @param joined_indices Pair of vectors containing row indices from which
- * `probe` and `build` tables are gathered. If any row index is out of bounds,
- * the contribution in the output `table` will be NULL.
- * @param columns_in_common is a vector of pairs of column indices
- * from tables `probe` and `build` respectively, that are "in common".
- * For "common" columns, only a single output column will be produced.
- * For an inner or left join, the result will be gathered from the column in
- * `probe`. For a full join, the result will be gathered from both common
- * columns in `probe` and `build` and concatenated to form a single column.
- * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side.
- *
- * @return Table pair of (`probe`, `build`) containing the rows from `probe` and
- * `build` specified by `joined_indices`.
- * Columns in `columns_in_common` will be included in either `probe` or `build` portion as
- * `common_columns_output_side` indicates. Final form would look like
- * (`probe(including common columns)`, `build(excluding common columns)`) if
- * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`,
- * `build(including common columns)`) if `common_columns_output_side` is `BUILD`.
- */
-template <join_kind JoinKind>
-std::pair<std::unique_ptr<table>, std::unique_ptr<table>> construct_join_output_df(
-  table_view const &probe,
-  table_view const &build,
-  VectorPair &joined_indices,
-  std::vector<std::pair<size_type, size_type>> const &columns_in_common,
-  cudf::hash_join::common_columns_output_side common_columns_output_side,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr)
-{
-  std::vector<size_type> probe_common_col;
-  probe_common_col.reserve(columns_in_common.size());
-  std::vector<size_type> build_common_col;
-  build_common_col.reserve(columns_in_common.size());
-  for (const auto &c : columns_in_common) {
-    probe_common_col.push_back(c.first);
-    build_common_col.push_back(c.second);
-  }
-  std::vector<size_type> probe_noncommon_col =
-    non_common_column_indices(probe.num_columns(), probe_common_col);
-  std::vector<size_type> build_noncommon_col =
-    non_common_column_indices(build.num_columns(), build_common_col);
-
-  out_of_bounds_policy const bounds_policy = JoinKind != join_kind::INNER_JOIN
-                                               ? out_of_bounds_policy::NULLIFY
-                                               : out_of_bounds_policy::DONT_CHECK;
-
-  std::unique_ptr<table> common_table = std::make_unique<table>();
-  // Construct the joined columns
-  if (join_kind::FULL_JOIN == JoinKind) {
-    auto complement_indices = get_left_join_indices_complement(
-      joined_indices.second, probe.num_rows(), build.num_rows(), stream);
-    if (not columns_in_common.empty()) {
-      auto common_from_build = detail::gather(build.select(build_common_col),
-                                              complement_indices.second.begin(),
-                                              complement_indices.second.end(),
-                                              bounds_policy,
-                                              stream,
-                                              rmm::mr::get_current_device_resource());
-      auto common_from_probe = detail::gather(probe.select(probe_common_col),
-                                              joined_indices.first.begin(),
-                                              joined_indices.first.end(),
-                                              bounds_policy,
-                                              stream,
-                                              rmm::mr::get_current_device_resource());
-      common_table           = cudf::detail::concatenate(
-        std::vector<table_view>({common_from_build->view(), common_from_probe->view()}),
-        stream,
-        mr);
-    }
-    joined_indices = concatenate_vector_pairs(complement_indices, joined_indices);
-  } else {
-    if (not columns_in_common.empty()) {
-      common_table = detail::gather(probe.select(probe_common_col),
-                                    joined_indices.first.begin(),
-                                    joined_indices.first.end(),
-                                    bounds_policy,
-                                    stream,
-                                    mr);
-    }
-  }
-
-  // Construct the probe non common columns
-  std::unique_ptr<table> probe_table = detail::gather(probe.select(probe_noncommon_col),
-                                                      joined_indices.first.begin(),
-                                                      joined_indices.first.end(),
-                                                      bounds_policy,
-                                                      stream,
-                                                      mr);
-
-  std::unique_ptr<table> build_table = detail::gather(build.select(build_noncommon_col),
-                                                      joined_indices.second.begin(),
-                                                      joined_indices.second.end(),
-                                                      bounds_policy,
-                                                      stream,
-                                                      mr);
-
-  return combine_join_columns(probe_table->release(),
-                              probe_noncommon_col,
-                              probe_common_col,
-                              build_table->release(),
-                              build_noncommon_col,
-                              build_common_col,
-                              common_table->release(),
-                              common_columns_output_side);
-}
-
 std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table> &&left,
                                                 std::unique_ptr<cudf::table> &&right)
 {
@@ -499,147 +297,112 @@ std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table> &&l
 hash_join::hash_join_impl::~hash_join_impl() = default;
 
 hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build,
-                                          std::vector<size_type> const &build_on,
                                           null_equality compare_nulls,
                                           rmm::cuda_stream_view stream)
-  : _build(build),
-    _build_selected(build.select(build_on)),
-    _build_on(build_on),
-    _hash_table(nullptr)
+  : _build(build), _hash_table(nullptr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(0 != _build.num_columns(), "Hash join build table is empty");
   CUDF_EXPECTS(_build.num_rows() < cudf::detail::MAX_JOIN_SIZE,
                "Build column size is too big for hash join");
 
-  if (_build_on.empty() || 0 == build.num_rows()) { return; }
+  if (0 == build.num_rows()) { return; }
 
-  _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream);
+  _hash_table = build_join_hash_table(_build, compare_nulls, stream);
 }
 
-std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
-hash_join::hash_join_impl::inner_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  common_columns_output_side common_columns_output_side,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::inner_join(cudf::table_view const &probe,
+                                      null_equality compare_nulls,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource *mr) const
 {
   CUDF_FUNC_RANGE();
-  return compute_hash_join<cudf::detail::join_kind::INNER_JOIN>(
-    probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr);
+  return compute_hash_join<cudf::detail::join_kind::INNER_JOIN>(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::hash_join_impl::left_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::left_join(cudf::table_view const &probe,
+                                     null_equality compare_nulls,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource *mr) const
 {
   CUDF_FUNC_RANGE();
-  auto probe_build_pair =
-    compute_hash_join<cudf::detail::join_kind::LEFT_JOIN>(probe,
-                                                          probe_on,
-                                                          columns_in_common,
-                                                          common_columns_output_side::PROBE,
-                                                          compare_nulls,
-                                                          stream,
-                                                          mr);
-  return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                          std::move(probe_build_pair.second));
+  return compute_hash_join<cudf::detail::join_kind::LEFT_JOIN>(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::hash_join_impl::full_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::full_join(cudf::table_view const &probe,
+                                     null_equality compare_nulls,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource *mr) const
 {
   CUDF_FUNC_RANGE();
-  auto probe_build_pair =
-    compute_hash_join<cudf::detail::join_kind::FULL_JOIN>(probe,
-                                                          probe_on,
-                                                          columns_in_common,
-                                                          common_columns_output_side::PROBE,
-                                                          compare_nulls,
-                                                          stream,
-                                                          mr);
-  return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                          std::move(probe_build_pair.second));
+  return compute_hash_join<cudf::detail::join_kind::FULL_JOIN>(probe, compare_nulls, stream, mr);
 }
 
 template <cudf::detail::join_kind JoinKind>
-std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
-hash_join::hash_join_impl::compute_hash_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  common_columns_output_side common_columns_output_side,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe,
+                                             null_equality compare_nulls,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource *mr) const
 {
   CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty");
   CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE,
                "Probe column size is too big for hash join");
-  CUDF_EXPECTS(_build_on.size() == probe_on.size(),
+  CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),
                "Mismatch in number of columns to be joined on");
 
-  CUDF_EXPECTS(std::all_of(columns_in_common.begin(),
-                           columns_in_common.end(),
-                           [this, &probe_on](auto pair) {
-                             size_t p = std::find(probe_on.begin(), probe_on.end(), pair.first) -
-                                        probe_on.begin();
-                             size_t b = std::find(_build_on.begin(), _build_on.end(), pair.second) -
-                                        _build_on.begin();
-                             return (p != probe_on.size()) && (b != _build_on.size()) && (p == b);
-                           }),
-               "Invalid values passed to columns_in_common");
-
-  if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) {
-    return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side);
+  if (is_trivial_join(probe, _build, JoinKind)) {
+    return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
+                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  auto probe_selected = probe.select(probe_on);
-  CUDF_EXPECTS(std::equal(std::cbegin(_build_selected),
-                          std::cend(_build_selected),
-                          std::cbegin(probe_selected),
-                          std::cend(probe_selected),
+  CUDF_EXPECTS(std::equal(std::cbegin(_build),
+                          std::cend(_build),
+                          std::cbegin(probe),
+                          std::cend(probe),
                           [](const auto &b, const auto &p) { return b.type() == p.type(); }),
                "Mismatch in joining column data types");
 
-  constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN)
-                                                      ? cudf::detail::join_kind::LEFT_JOIN
-                                                      : JoinKind;
-  auto joined_indices = probe_join_indices<ProbeJoinKind>(probe_selected, compare_nulls, stream);
-  return cudf::detail::construct_join_output_df<JoinKind>(
-    probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr);
+  return probe_join_indices<JoinKind>(probe, compare_nulls, stream, mr);
 }
 
 template <cudf::detail::join_kind JoinKind>
-std::enable_if_t<JoinKind != cudf::detail::join_kind::FULL_JOIN,
-                 std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>>
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
 hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe,
                                               null_equality compare_nulls,
-                                              rmm::cuda_stream_view stream) const
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource *mr) const
 {
   // Trivial left join case - exit early
-  if (!_hash_table && JoinKind == cudf::detail::join_kind::LEFT_JOIN) {
-    return get_trivial_left_join_indices(probe, stream);
+  if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) {
+    return get_trivial_left_join_indices(probe, stream, mr);
   }
 
   CUDF_EXPECTS(_hash_table, "Hash table of hash join is null.");
 
-  auto build_table = cudf::table_device_view::create(_build_selected, stream);
+  auto build_table = cudf::table_device_view::create(_build, stream);
   auto probe_table = cudf::table_device_view::create(probe, stream);
-  return cudf::detail::probe_join_hash_table<JoinKind>(
-    *build_table, *probe_table, *_hash_table, compare_nulls, stream);
+
+  constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN)
+                                                      ? cudf::detail::join_kind::LEFT_JOIN
+                                                      : JoinKind;
+  auto join_indices = cudf::detail::probe_join_hash_table<ProbeJoinKind>(
+    *build_table, *probe_table, *_hash_table, compare_nulls, stream, mr);
+
+  if (JoinKind == cudf::detail::join_kind::FULL_JOIN) {
+    auto complement_indices = detail::get_left_join_indices_complement(
+      join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr);
+    join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream);
+  }
+  return join_indices;
 }
 
 }  // namespace cudf
diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh
index b37f228f6d3..aaa25e8f941 100644
--- a/cpp/src/join/hash_join.cuh
+++ b/cpp/src/join/hash_join.cuh
@@ -15,6 +15,9 @@
  */
 #pragma once
 
+#include <cudf/detail/concatenate.cuh>
+#include <cudf/detail/gather.cuh>
+#include <cudf/detail/gather.hpp>
 #include <join/join_common_utils.hpp>
 #include <join/join_kernels.cuh>
 
@@ -25,7 +28,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/sequence.h>
@@ -178,19 +181,29 @@ size_type estimate_join_output_size(table_device_view build_table,
  *
  * @param left Table of left columns to join
  * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the result
  *
  * @return Join output indices vector pair
  */
-inline std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>
-get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream)
+inline std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                 std::unique_ptr<rmm::device_uvector<size_type>>>
+get_trivial_left_join_indices(
+  table_view const& left,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  rmm::device_vector<size_type> left_indices(left.num_rows());
-  thrust::sequence(rmm::exec_policy(stream), left_indices.begin(), left_indices.end(), 0);
-  rmm::device_vector<size_type> right_indices(left.num_rows());
-  thrust::fill(rmm::exec_policy(stream), right_indices.begin(), right_indices.end(), JoinNoneValue);
+  auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+  thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0);
+  auto right_indices =
+    std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+  thrust::fill(
+    rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue);
   return std::make_pair(std::move(left_indices), std::move(right_indices));
 }
 
+std::pair<std::unique_ptr<table>, std::unique_ptr<table>> get_empty_joined_table(
+  table_view const& probe, table_view const& build);
+
 std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table>&& left,
                                                 std::unique_ptr<cudf::table>&& right);
 
@@ -207,106 +220,52 @@ struct hash_join::hash_join_impl {
 
  private:
   cudf::table_view _build;
-  cudf::table_view _build_selected;
-  std::vector<size_type> _build_on;
   std::unique_ptr<cudf::detail::multimap_type, std::function<void(cudf::detail::multimap_type*)>>
     _hash_table;
 
  public:
   /**
-   * @brief Constructor that internally builds the hash table based on the given `build` table and
-   * column indices specified by `build_on` for subsequent probe calls.
+   * @brief Constructor that internally builds the hash table based on the given `build` table
    *
    * @throw cudf::logic_error if the number of columns in `build` table is 0.
    * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE.
-   * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build`
-   * table.
    *
    * @param build The build table, from which the hash table is built.
-   * @param build_on The column indices from `build` to join on.
    * @param compare_nulls Controls whether null join-key values should match or not.
    */
   hash_join_impl(cudf::table_view const& build,
-                 std::vector<size_type> const& build_on,
                  null_equality compare_nulls,
                  rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
-  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> inner_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    common_columns_output_side common_columns_output_side,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
-
-  std::unique_ptr<cudf::table> left_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
-
-  std::unique_ptr<cudf::table> full_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  inner_join(cudf::table_view const& probe,
+             null_equality compare_nulls,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr) const;
+
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  left_join(cudf::table_view const& probe,
+            null_equality compare_nulls,
+            rmm::cuda_stream_view stream,
+            rmm::mr::device_memory_resource* mr) const;
+
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  full_join(cudf::table_view const& probe,
+            null_equality compare_nulls,
+            rmm::cuda_stream_view stream,
+            rmm::mr::device_memory_resource* mr) const;
 
  private:
-  /**
-   * @brief Performs hash join by probing the columns provided in `probe` as per
-   * the joining indices given in `probe_on` and returns a (`probe`, `_build`) table pair, which
-   * contains the probe and build portions of the logical joined table respectively.
-   *
-   * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
-   * (`P`, `B`) where `P` does not exist in `probe_on` or `B` does not exist in
-   * `_build_on`.
-   * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
-   * (`P`, `B`) such that the location of `P` within `probe_on` is not equal to
-   * the location of `B` within `_build_on`.
-   * @throw cudf::logic_error if the number of elements in `probe_on` and
-   * `_build_on` are not equal.
-   * @throw cudf::logic_error if the number of columns in `probe` is 0.
-   * @throw cudf::logic_error if the number of rows in `probe` table exceeds MAX_JOIN_SIZE.
-   * @throw std::out_of_range if elements of `probe_on` exceed the number of columns in the `probe`
-   * table.
-   * @throw cudf::logic_error if types do not match between joining columns.
-   *
-   * @tparam JoinKind The type of join to be performed.
-   *
-   * @param probe The probe table.
-   * @param probe_on The column's indices from `probe` to join on.
-   * Column `i` from `probe_on` will be compared against column `i` of `_build_on`.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `_build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns. Else, for every column in `probe_on` and `_build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `_build_on`.
-   * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side.
-   * @param compare_nulls Controls whether null join-key values should match or not.
-   * @param mr Device memory resource used to allocate the returned table's device memory.
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return Table pair of (`probe`, `_build`) of joining both tables on the columns
-   * specified by `probe_on` and `_build_on`. The resulting table pair will be joined columns of
-   * (`probe(including common columns)`, `_build(excluding common columns)`) if
-   * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`,
-   * `_build(including common columns)`) if `common_columns_output_side` is `BUILD`.
-   */
   template <cudf::detail::join_kind JoinKind>
-  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> compute_hash_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    common_columns_output_side common_columns_output_side,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  compute_hash_join(cudf::table_view const& probe,
+                    null_equality compare_nulls,
+                    rmm::cuda_stream_view stream,
+                    rmm::mr::device_memory_resource* mr) const;
 
   /**
    * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`,
@@ -320,15 +279,17 @@ struct hash_join::hash_join_impl {
    * @param probe_table Table of probe side columns to join.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource used to allocate the returned vectors.
    *
    * @return Join output indices vector pair.
    */
   template <cudf::detail::join_kind JoinKind>
-  std::enable_if_t<JoinKind != cudf::detail::join_kind::FULL_JOIN,
-                   std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>>
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
   probe_join_indices(cudf::table_view const& probe,
                      null_equality compare_nulls,
-                     rmm::cuda_stream_view stream) const;
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr) const;
 };
 
 }  // namespace cudf
diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu
index ce27cfcd616..f2e4bab02c6 100644
--- a/cpp/src/join/join.cu
+++ b/cpp/src/join/join.cu
@@ -26,68 +26,102 @@
 namespace cudf {
 namespace detail {
 
-std::unique_ptr<table> inner_join(
-  table_view const& left_input,
-  table_view const& right_input,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+inner_join(table_view const& left_input,
+           table_view const& right_input,
+           null_equality compare_nulls,
+           rmm::cuda_stream_view stream,
+           rmm::mr::device_memory_resource* mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
   auto matched = cudf::dictionary::detail::match_dictionaries(
-    {left_input.select(left_on), right_input.select(right_on)},
+    {left_input, right_input},
     stream,
     rmm::mr::get_current_device_resource());  // temporary objects returned
 
   // now rebuild the table views with the updated ones
-  auto const left  = scatter_columns(matched.second.front(), left_on, left_input);
-  auto const right = scatter_columns(matched.second.back(), right_on, right_input);
+  auto const left  = matched.second.front();
+  auto const right = matched.second.back();
 
   // For `inner_join`, we can freely choose either the `left` or `right` table to use for
   // building/probing the hash map. Because building is typically more expensive than probing, we
   // build the hash map from the smaller table.
   if (right.num_rows() > left.num_rows()) {
-    cudf::hash_join hj_obj(left, left_on, compare_nulls, stream);
-    auto actual_columns_in_common = columns_in_common;
-    std::for_each(actual_columns_in_common.begin(), actual_columns_in_common.end(), [](auto& pair) {
-      std::swap(pair.first, pair.second);
-    });
-    auto probe_build_pair = hj_obj.inner_join(right,
-                                              right_on,
-                                              actual_columns_in_common,
-                                              cudf::hash_join::common_columns_output_side::BUILD,
-                                              compare_nulls,
-                                              stream,
-                                              mr);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.second),
-                                            std::move(probe_build_pair.first));
+    cudf::hash_join hj_obj(left, compare_nulls, stream);
+    auto result = hj_obj.inner_join(right, compare_nulls, stream, mr);
+    return std::make_pair(std::move(result.second), std::move(result.first));
   } else {
-    cudf::hash_join hj_obj(right, right_on, compare_nulls, stream);
-    auto probe_build_pair = hj_obj.inner_join(left,
-                                              left_on,
-                                              columns_in_common,
-                                              cudf::hash_join::common_columns_output_side::PROBE,
-                                              compare_nulls,
-                                              stream,
-                                              mr);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                            std::move(probe_build_pair.second));
+    cudf::hash_join hj_obj(right, compare_nulls, stream);
+    return hj_obj.inner_join(left, compare_nulls, stream, mr);
   }
 }
 
-std::unique_ptr<table> left_join(
-  table_view const& left_input,
-  table_view const& right_input,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<table> inner_join(table_view const& left_input,
+                                  table_view const& right_input,
+                                  std::vector<size_type> const& left_on,
+                                  std::vector<size_type> const& right_on,
+                                  null_equality compare_nulls,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr)
+{
+  // Make sure any dictionary columns have matched key sets.
+  // This will return any new dictionary columns created as well as updated table_views.
+  auto matched = cudf::dictionary::detail::match_dictionaries(
+    {left_input.select(left_on), right_input.select(right_on)},
+    stream,
+    rmm::mr::get_current_device_resource());  // temporary objects returned
+
+  // now rebuild the table views with the updated ones
+  auto const left  = scatter_columns(matched.second.front(), left_on, left_input);
+  auto const right = scatter_columns(matched.second.back(), right_on, right_input);
+
+  auto join_indices = inner_join(left.select(left_on), right.select(right_on), compare_nulls, mr);
+  std::unique_ptr<table> left_result  = detail::gather(left,
+                                                      join_indices.first->begin(),
+                                                      join_indices.first->end(),
+                                                      out_of_bounds_policy::DONT_CHECK,
+                                                      stream,
+                                                      mr);
+  std::unique_ptr<table> right_result = detail::gather(right,
+                                                       join_indices.second->begin(),
+                                                       join_indices.second->end(),
+                                                       out_of_bounds_policy::DONT_CHECK,
+                                                       stream,
+                                                       mr);
+  return combine_table_pair(std::move(left_result), std::move(right_result));
+}
+
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+left_join(table_view const& left_input,
+          table_view const& right_input,
+          null_equality compare_nulls,
+          rmm::cuda_stream_view stream,
+          rmm::mr::device_memory_resource* mr)
+{
+  // Make sure any dictionary columns have matched key sets.
+  // This will return any new dictionary columns created as well as updated table_views.
+  auto matched = cudf::dictionary::detail::match_dictionaries(
+    {left_input, right_input},  // these should match
+    stream,
+    rmm::mr::get_current_device_resource());  // temporary objects returned
+  // now rebuild the table views with the updated ones
+  table_view const left  = matched.second.front();
+  table_view const right = matched.second.back();
+
+  cudf::hash_join hj_obj(right, compare_nulls, stream);
+  return hj_obj.left_join(left, compare_nulls, stream, mr);
+}
+
+std::unique_ptr<table> left_join(table_view const& left_input,
+                                 table_view const& right_input,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -99,19 +133,58 @@ std::unique_ptr<table> left_join(
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  cudf::hash_join hj_obj(right, right_on, compare_nulls, stream);
-  return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr);
+  auto join_indices = left_join(left.select(left_on), right.select(right_on), compare_nulls);
+
+  if ((left_on.empty() || right_on.empty()) ||
+      is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) {
+    auto probe_build_pair = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
+                                            std::move(probe_build_pair.second));
+  }
+  std::unique_ptr<table> left_result  = detail::gather(left,
+                                                      join_indices.first->begin(),
+                                                      join_indices.first->end(),
+                                                      out_of_bounds_policy::NULLIFY,
+                                                      stream,
+                                                      mr);
+  std::unique_ptr<table> right_result = detail::gather(right,
+                                                       join_indices.second->begin(),
+                                                       join_indices.second->end(),
+                                                       out_of_bounds_policy::NULLIFY,
+                                                       stream,
+                                                       mr);
+  return combine_table_pair(std::move(left_result), std::move(right_result));
 }
 
-std::unique_ptr<table> full_join(
-  table_view const& left_input,
-  table_view const& right_input,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+full_join(table_view const& left_input,
+          table_view const& right_input,
+          null_equality compare_nulls,
+          rmm::cuda_stream_view stream,
+          rmm::mr::device_memory_resource* mr)
+{
+  // Make sure any dictionary columns have matched key sets.
+  // This will return any new dictionary columns created as well as updated table_views.
+  auto matched = cudf::dictionary::detail::match_dictionaries(
+    {left_input, right_input},  // these should match
+    stream,
+    rmm::mr::get_current_device_resource());  // temporary objects returned
+  // now rebuild the table views with the updated ones
+  table_view const left  = matched.second.front();
+  table_view const right = matched.second.back();
+
+  cudf::hash_join hj_obj(right, compare_nulls, stream);
+  return hj_obj.full_join(left, compare_nulls, stream, mr);
+}
+
+std::unique_ptr<table> full_join(table_view const& left_input,
+                                 table_view const& right_input,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -123,8 +196,27 @@ std::unique_ptr<table> full_join(
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  cudf::hash_join hj_obj(right, right_on, compare_nulls, stream);
-  return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, stream, mr);
+  auto join_indices = full_join(left.select(left_on), right.select(right_on), compare_nulls);
+
+  if ((left_on.empty() || right_on.empty()) ||
+      is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) {
+    auto probe_build_pair = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
+                                            std::move(probe_build_pair.second));
+  }
+  std::unique_ptr<table> left_result  = detail::gather(left,
+                                                      join_indices.first->begin(),
+                                                      join_indices.first->end(),
+                                                      out_of_bounds_policy::NULLIFY,
+                                                      stream,
+                                                      mr);
+  std::unique_ptr<table> right_result = detail::gather(right,
+                                                       join_indices.second->begin(),
+                                                       join_indices.second->end(),
+                                                       out_of_bounds_policy::NULLIFY,
+                                                       stream,
+                                                       mr);
+  return combine_table_pair(std::move(left_result), std::move(right_result));
 }
 
 }  // namespace detail
@@ -132,90 +224,111 @@ std::unique_ptr<table> full_join(
 hash_join::~hash_join() = default;
 
 hash_join::hash_join(cudf::table_view const& build,
-                     std::vector<size_type> const& build_on,
                      null_equality compare_nulls,
                      rmm::cuda_stream_view stream)
-  : impl{std::make_unique<const hash_join::hash_join_impl>(build, build_on, compare_nulls, stream)}
+  : impl{std::make_unique<const hash_join::hash_join_impl>(build, compare_nulls, stream)}
 {
 }
 
-std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> hash_join::inner_join(
-  cudf::table_view const& probe,
-  std::vector<size_type> const& probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-  common_columns_output_side common_columns_output_side,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::inner_join(cudf::table_view const& probe,
+                      null_equality compare_nulls,
+                      rmm::cuda_stream_view stream,
+                      rmm::mr::device_memory_resource* mr) const
 {
-  return impl->inner_join(
-    probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr);
+  return impl->inner_join(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::left_join(
-  cudf::table_view const& probe,
-  std::vector<size_type> const& probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::left_join(cudf::table_view const& probe,
+                     null_equality compare_nulls,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr) const
 {
-  return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr);
+  return impl->left_join(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::full_join(
-  cudf::table_view const& probe,
-  std::vector<size_type> const& probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::full_join(cudf::table_view const& probe,
+                     null_equality compare_nulls,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr) const
 {
-  return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr);
+  return impl->full_join(probe, compare_nulls, stream, mr);
 }
 
 // external APIs
 
-std::unique_ptr<table> inner_join(
-  table_view const& left,
-  table_view const& right,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+inner_join(table_view const& left,
+           table_view const& right,
+           null_equality compare_nulls,
+           rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::inner_join(left, right, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<table> inner_join(table_view const& left,
+                                  table_view const& right,
+                                  std::vector<size_type> const& left_on,
+                                  std::vector<size_type> const& right_on,
+                                  null_equality compare_nulls,
+                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::inner_join(
-    left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<table> left_join(
-  table_view const& left,
-  table_view const& right,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+left_join(table_view const& left,
+          table_view const& right,
+          null_equality compare_nulls,
+          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::left_join(left, right, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<table> left_join(table_view const& left,
+                                 table_view const& right,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_join(
-    left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+full_join(table_view const& left,
+          table_view const& right,
+          null_equality compare_nulls,
+          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::full_join(left, right, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<table> full_join(
-  table_view const& left,
-  table_view const& right,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<table> full_join(table_view const& left,
+                                 table_view const& right,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::full_join(
-    left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index f0c158c1ef6..9312704f065 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -19,6 +19,8 @@
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
+#include <rmm/device_uvector.hpp>
+
 #include <hash/concurrent_unordered_multimap.cuh>
 
 #include <limits>
@@ -29,9 +31,10 @@ constexpr size_type MAX_JOIN_SIZE{std::numeric_limits<size_type>::max()};
 
 constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128;
 constexpr int DEFAULT_JOIN_CACHE_SIZE = 128;
-constexpr size_type JoinNoneValue     = -1;
+constexpr size_type JoinNoneValue     = std::numeric_limits<size_type>::min();
 
-using VectorPair = std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>;
+using VectorPair = std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                             std::unique_ptr<rmm::device_uvector<size_type>>>;
 
 using multimap_type =
   concurrent_unordered_multimap<hash_value_type,
@@ -49,14 +52,10 @@ using row_equality = cudf::row_equality_comparator<true>;
 
 enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };
 
-inline bool is_trivial_join(table_view const& left,
-                            table_view const& right,
-                            std::vector<size_type> const& left_on,
-                            std::vector<size_type> const& right_on,
-                            join_kind join_type)
+inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type)
 {
   // If there is nothing to join, then send empty table with all columns
-  if (left_on.empty() || right_on.empty()) { return true; }
+  if (left.is_empty() || right.is_empty()) { return true; }
 
   // If left join and the left table is empty, return immediately
   if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; }
diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu
index 9d046f9983c..80a1ef9e204 100644
--- a/cpp/src/join/semi_join.cu
+++ b/cpp/src/join/semi_join.cu
@@ -17,20 +17,106 @@
 #include <hash/concurrent_unordered_map.cuh>
 #include <join/join_common_utils.hpp>
 
+#include <thrust/distance.h>
+
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sequence.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/join.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/device_vector.hpp>
 #include <rmm/exec_policy.hpp>
 
 namespace cudf {
 namespace detail {
+
+template <join_kind JoinKind>
+std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_anti_join(
+  cudf::table_view const& left_keys,
+  cudf::table_view const& right_keys,
+  null_equality compare_nulls,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty");
+  CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty");
+
+  if (is_trivial_join(left_keys, right_keys, JoinKind)) {
+    return std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream, mr);
+  }
+  if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) {
+    auto result =
+      std::make_unique<rmm::device_uvector<cudf::size_type>>(left_keys.num_rows(), stream, mr);
+    thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end());
+    return result;
+  }
+
+  auto const left_num_rows  = left_keys.num_rows();
+  auto const right_num_rows = right_keys.num_rows();
+
+  // Only care about existence, so we'll use an unordered map (other joins need a multimap)
+  using hash_table_type = concurrent_unordered_map<cudf::size_type, bool, row_hash, row_equality>;
+
+  // Create hash table containing all keys found in right table
+  auto right_rows_d            = table_device_view::create(right_keys, stream);
+  size_t const hash_table_size = compute_hash_table_size(right_num_rows);
+  row_hash hash_build{*right_rows_d};
+  row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
+
+  // Going to join it with left table
+  auto left_rows_d = table_device_view::create(left_keys, stream);
+  row_hash hash_probe{*left_rows_d};
+  row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
+
+  auto hash_table_ptr = hash_table_type::create(hash_table_size,
+                                                stream,
+                                                std::numeric_limits<bool>::max(),
+                                                std::numeric_limits<cudf::size_type>::max(),
+                                                hash_build,
+                                                equality_build);
+  auto hash_table     = *hash_table_ptr;
+
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     right_num_rows,
+                     [hash_table] __device__(size_type idx) mutable {
+                       hash_table.insert(thrust::make_pair(idx, true));
+                     });
+
+  //
+  // Now we have a hash table, we need to iterate over the rows of the left table
+  // and check to see if they are contained in the hash table
+  //
+
+  // For semi join we want contains to be true, for anti join we want contains to be false
+  bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN);
+
+  auto gather_map =
+    std::make_unique<rmm::device_uvector<cudf::size_type>>(left_num_rows, stream, mr);
+
+  // gather_map_end will be the end of valid data in gather_map
+  auto gather_map_end = thrust::copy_if(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(left_num_rows),
+    gather_map->begin(),
+    [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) {
+      auto pos = hash_table.find(idx, hash_probe, equality_probe);
+      return (pos != hash_table.end()) == join_type_boolean;
+    });
+
+  auto join_size = thrust::distance(gather_map->begin(), gather_map_end);
+  gather_map->resize(join_size, stream);
+  return gather_map;
+}
+
 /**
  * @brief  Performs a left semi or anti join on the specified columns of two
  * tables (left, right)
@@ -57,8 +143,6 @@ namespace detail {
  *                             The column from `right` indicated by `right_on[i]`
  *                             will be compared against the column from `left`
  *                             indicated by `left_on[i]`.
- * @param[in] return_columns   A vector of column indices from `left` to
- *                             include in the returned table.
  * @param[in] compare_nulls    Controls whether null join-key values should match or not.
  * @param[in] mr               Device memory resource to used to allocate the returned table's
  *                             device memory
@@ -66,8 +150,7 @@ namespace detail {
  * @tparam    join_kind        Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN
  *
  * @returns                    Result of joining `left` and `right` tables on the columns
- *                             specified by `left_on` and `right_on`. The resulting table
- *                             will contain `return_columns` from `left` that match in right.
+ *                             specified by `left_on` and `right_on`.
  */
 template <join_kind JoinKind>
 std::unique_ptr<cudf::table> left_semi_anti_join(
@@ -75,27 +158,19 @@ std::unique_ptr<cudf::table> left_semi_anti_join(
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<cudf::size_type> const& return_columns,
   null_equality compare_nulls,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty");
-  CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty");
   CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on");
 
-  if (0 == return_columns.size()) { return empty_like(left.select(return_columns)); }
-
-  if (is_trivial_join(left, right, left_on, right_on, JoinKind)) {
-    return empty_like(left.select(return_columns));
+  if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) {
+    return empty_like(left);
   }
 
-  auto const left_num_rows  = left.num_rows();
-  auto const right_num_rows = right.num_rows();
-
-  if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_num_rows)) {
+  if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) {
     // Everything matches, just copy the proper columns from the left table
-    return std::make_unique<table>(left.select(return_columns), stream, mr);
+    return std::make_unique<table>(left, stream, mr);
   }
 
   // Make sure any dictionary columns have matched key sets.
@@ -108,91 +183,64 @@ std::unique_ptr<cudf::table> left_semi_anti_join(
   auto const left_selected  = matched.second.front();
   auto const right_selected = matched.second.back();
 
-  // Only care about existence, so we'll use an unordered map (other joins need a multimap)
-  using hash_table_type = concurrent_unordered_map<cudf::size_type, bool, row_hash, row_equality>;
-
-  // Create hash table containing all keys found in right table
-  auto right_rows_d            = table_device_view::create(right_selected, stream);
-  size_t const hash_table_size = compute_hash_table_size(right_num_rows);
-  row_hash hash_build{*right_rows_d};
-  row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
-
-  // Going to join it with left table
-  auto left_rows_d = table_device_view::create(left_selected, stream);
-  row_hash hash_probe{*left_rows_d};
-  row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
-
-  auto hash_table_ptr = hash_table_type::create(hash_table_size,
-                                                stream,
-                                                std::numeric_limits<bool>::max(),
-                                                std::numeric_limits<cudf::size_type>::max(),
-                                                hash_build,
-                                                equality_build);
-  auto hash_table     = *hash_table_ptr;
-
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     right_num_rows,
-                     [hash_table] __device__(size_type idx) mutable {
-                       hash_table.insert(thrust::make_pair(idx, true));
-                     });
-
-  //
-  // Now we have a hash table, we need to iterate over the rows of the left table
-  // and check to see if they are contained in the hash table
-  //
+  auto gather_map =
+    left_semi_anti_join<JoinKind>(left_selected, right_selected, compare_nulls, stream);
 
-  // For semi join we want contains to be true, for anti join we want contains to be false
-  bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN);
-
-  rmm::device_vector<size_type> gather_map(left_num_rows);
-
-  // gather_map_end will be the end of valid data in gather_map
-  auto gather_map_end = thrust::copy_if(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(left_num_rows),
-    gather_map.begin(),
-    [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) {
-      auto pos = hash_table.find(idx, hash_probe, equality_probe);
-      return (pos != hash_table.end()) == join_type_boolean;
-    });
-
-  // rebuild left table for call to gather
   auto const left_updated = scatter_columns(left_selected, left_on, left);
-  return cudf::detail::gather(left_updated.select(return_columns),
-                              gather_map.begin(),
-                              gather_map_end,
+  return cudf::detail::gather(left_updated,
+                              gather_map->begin(),
+                              gather_map->end(),
                               out_of_bounds_policy::DONT_CHECK,
                               stream,
                               mr);
 }
+
 }  // namespace detail
 
 std::unique_ptr<cudf::table> left_semi_join(cudf::table_view const& left,
                                             cudf::table_view const& right,
                                             std::vector<cudf::size_type> const& left_on,
                                             std::vector<cudf::size_type> const& right_on,
-                                            std::vector<cudf::size_type> const& return_columns,
                                             null_equality compare_nulls,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_semi_anti_join<detail::join_kind::LEFT_SEMI_JOIN>(
-    left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_join(
+  cudf::table_view const& left,
+  cudf::table_view const& right,
+  null_equality compare_nulls,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::left_semi_anti_join<detail::join_kind::LEFT_SEMI_JOIN>(
+    left, right, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<cudf::table> left_anti_join(cudf::table_view const& left,
                                             cudf::table_view const& right,
                                             std::vector<cudf::size_type> const& left_on,
                                             std::vector<cudf::size_type> const& right_on,
-                                            std::vector<cudf::size_type> const& return_columns,
                                             null_equality compare_nulls,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_semi_anti_join<detail::join_kind::LEFT_ANTI_JOIN>(
-    left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_anti_join(
+  cudf::table_view const& left,
+  cudf::table_view const& right,
+  null_equality compare_nulls,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::left_semi_anti_join<detail::join_kind::LEFT_ANTI_JOIN>(
+    left, right, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index efc5330ea7d..32192234c56 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -33,11 +33,15 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <limits>
+
 template <typename T>
 using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;
 using strcol_wrapper = cudf::test::strings_column_wrapper;
 using CVector        = std::vector<std::unique_ptr<cudf::column>>;
 using Table          = cudf::table;
+constexpr cudf::size_type NoneValue =
+  std::numeric_limits<cudf::size_type>::min();  // TODO: how can tests get this if it isn't public?
 
 struct JoinTest : public cudf::test::BaseFixture {
 };
@@ -58,58 +62,11 @@ TEST_F(JoinTest, EmptySentinelRepro)
   cudf::table_view left({left_first_col, left_second_col, left_third_col});
   cudf::table_view right({right_first_col, right_second_col, right_third_col});
 
-  auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}, {{0, 0}, {1, 1}, {2, 2}});
+  auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2});
 
   EXPECT_EQ(result->num_rows(), 1);
 }
 
-TEST_F(JoinTest, InvalidCommonColumnIndices)
-{
-  column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 3}};
-  column_wrapper<int32_t> col0_1{{0, 1, 2, 4, 1}};
-
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}};
-
-  CVector cols0, cols1;
-  cols0.push_back(col0_0.release());
-  cols0.push_back(col0_1.release());
-  cols1.push_back(col1_0.release());
-  cols1.push_back(col1_1.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  EXPECT_THROW(cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 1}, {1, 0}}), cudf::logic_error);
-}
-
-TEST_F(JoinTest, FullJoinNoCommon)
-{
-  column_wrapper<int32_t> col0_0{{0, 1}};
-  column_wrapper<int32_t> col1_0{{0, 2}};
-  CVector cols0, cols1;
-  cols0.push_back(col0_0.release());
-  cols1.push_back(col1_0.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  column_wrapper<int32_t> exp_col0_0{{0, 1, -1}, {1, 1, 0}};
-  column_wrapper<int32_t> exp_col0_1{{0, -1, 2}, {1, 0, 1}};
-  CVector exp_cols;
-  exp_cols.push_back(exp_col0_0.release());
-  exp_cols.push_back(exp_col0_1.release());
-  Table gold(std::move(exp_cols));
-
-  auto result            = cudf::full_join(t0, t1, {0}, {0}, {});
-  auto result_sort_order = cudf::sorted_order(result->view());
-  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-  auto gold_sort_order = cudf::sorted_order(gold.view());
-  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-}
-
 TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 3}};
@@ -131,7 +88,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0}, {0}, {});
+  auto result            = cudf::left_join(t0, t1, {0}, {0});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
@@ -153,7 +110,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, FullJoinNoNulls)
@@ -177,24 +134,32 @@ TEST_F(JoinTest, FullJoinNoNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{2, 2, 0, 4, 3, 3, 1, 2, 0}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"});
-  column_wrapper<int32_t> col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""},
+                            {1, 1, 1, 1, 1, 0, 0, 0, 0});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+  strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"},
+                            {0, 0, 0, 0, 1, 1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, FullJoinWithNulls)
@@ -218,24 +183,32 @@ TEST_F(JoinTest, FullJoinWithNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{2, 2, 0, -1, 3, 3, 1, 2, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"});
-  column_wrapper<int32_t> col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""},
+                            {1, 1, 1, 1, 1, 0, 0, 0, 0});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 0}};
+  strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"},
+                            {0, 0, 0, 0, 1, 1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, FullJoinOnNulls)
@@ -262,7 +235,7 @@ TEST_F(JoinTest, FullJoinOnNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
@@ -273,20 +246,26 @@ TEST_F(JoinTest, FullJoinOnNulls)
   cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t");
   cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t");
 #endif
- 
-  column_wrapper<int32_t> col_gold_0{{   2,    5,    3,    -1},
-                                     {   1,    1,    1,     0}};
-  strcol_wrapper          col_gold_1({ "s1", "s0", "s0",  "s1"});
-  column_wrapper<int32_t> col_gold_2{{  -1,   -1,    0,     1}, 
-                                     {   0,    0,    1,     1}};
-  column_wrapper<int32_t> col_gold_3{{   1,    4,    2,     8}, 
-                                     {   1,    1,    1,     1}};
+
+  column_wrapper<int32_t> col_gold_0{{   3,   -1,   -1,    -1},
+                                     {   1,    0,    0,     0}};
+  strcol_wrapper          col_gold_1{{ "s0", "s1",  "",    ""},
+                                     {   1,    1,    0,     0}};
+  column_wrapper<int32_t> col_gold_2{{   0,    1,   -1,    -1},
+                                     {   1,    1,    0,     0}};
+  column_wrapper<int32_t> col_gold_3{{   3,   -1,    2,     5},
+                                     {   1,    0,    1,     1}};
+  strcol_wrapper          col_gold_4{{ "s0", "s1", "s1",  "s0"}};
+  column_wrapper<int32_t> col_gold_5{{   2,    8,    1,     4}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+  
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
@@ -300,22 +279,27 @@ TEST_F(JoinTest, FullJoinOnNulls)
   cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t");
 #endif
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 
   // Repeat test with compare_nulls_equal=false,
   // as per SQL standard.
 
-  result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL);
+  result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL);
   result_sort_order = cudf::sorted_order(result->view());
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  col_gold_0 =               {{   2,    5,    3,    -1,   -1},
-                              {   1,    1,    1,     0,    0}};
-  col_gold_1 = strcol_wrapper({ "s1", "s0", "s0",  "s1", "s1"});
-  col_gold_2 =               {{  -1,   -1,    0,    -1,    1}, 
-                              {   0,    0,    1,     0,    1}};
-  col_gold_3 =               {{   1,    4,    2,     8,   -1}, 
-                              {   1,    1,    1,     1,    0}};
+  col_gold_0 =               {{   3,   -1,   -1,    -1,   -1},
+                              {   1,    0,    0,     0,    0}};
+  col_gold_1 = strcol_wrapper{{ "s0", "s1",   "",    "",   ""},
+                              {   1,    1,    0,     0,    0}};
+  col_gold_2 =               {{   0,    1,   -1,    -1,   -1},
+                              {   1,    1,    0,     0,    0}};
+  col_gold_3 =               {{   3,   -1,    2,     5,   -1},
+                              {   1,    0,    1,     1,    0}};
+  col_gold_4 = strcol_wrapper{{ "s0",  "",  "s1",  "s0",  "s1"},
+                              {   1,    0,    1,     1,    1}};
+  col_gold_5 =               {{   2,   -1,    1,     4,    8},
+                              {   1,    0,    1,     1,    1}};
 
   // clang-format on
 
@@ -324,23 +308,26 @@ TEST_F(JoinTest, FullJoinOnNulls)
   cols_gold_nulls_unequal.push_back(col_gold_1.release());
   cols_gold_nulls_unequal.push_back(col_gold_2.release());
   cols_gold_nulls_unequal.push_back(col_gold_3.release());
+  cols_gold_nulls_unequal.push_back(col_gold_4.release());
+  cols_gold_nulls_unequal.push_back(col_gold_5.release());
+
   Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)};
 
   gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view());
   sorted_gold     = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, LeftJoinNoNulls)
 {
-  column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 3}};
+  column_wrapper<int32_t> col0_0({3, 1, 2, 0, 3});
   strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"});
-  column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
+  column_wrapper<int32_t> col0_2({0, 1, 2, 4, 1});
 
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
+  column_wrapper<int32_t> col1_0({2, 2, 0, 4, 3});
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}};
+  column_wrapper<int32_t> col1_2({1, 0, 1, 2, 1});
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -353,30 +340,34 @@ TEST_F(JoinTest, LeftJoinNoNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0({3, 1, 2, 0, 3});
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"});
+  column_wrapper<int32_t> col_gold_2({0, 1, 2, 4, 1});
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3}, {0, 0, 0, 0, 1}};
+  strcol_wrapper col_gold_4{{"", "", "", "", "s1"}, {0, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1}, {0, 0, 0, 0, 1}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, LeftJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
@@ -394,24 +385,29 @@ TEST_F(JoinTest, LeftJoinWithNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 2, 1, 2, 0}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "", "s4"}, {1, 1, 1, 0, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 1, 2, 4}, {1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}};
+  strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1}, {1, 1, 1, 1, 1}};
+  column_wrapper<int32_t> col_gold_3{{3, -1, -1, -1, 2}, {1, 0, 0, 0, 1}};
+  strcol_wrapper col_gold_4{{"s1", "", "", "", "s0"}, {1, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_5{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, LeftJoinOnNulls)
@@ -438,7 +434,7 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
@@ -449,21 +445,27 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t");
   cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t");
 #endif
- 
+
   column_wrapper<int32_t> col_gold_0{{   3,    -1,    2},
                                      {   1,     0,    1}};
   strcol_wrapper          col_gold_1({ "s0",  "s1", "s2"},
                                      {   1,     1,    1});
-  column_wrapper<int32_t> col_gold_2{{   0,     1,    2}, 
+  column_wrapper<int32_t> col_gold_2{{   0,     1,    2},
                                      {   1,     1,    1}};
-  column_wrapper<int32_t> col_gold_3{{   2,     8,   -1}, 
+  column_wrapper<int32_t> col_gold_3{{   3,    -1,   -1},
+                                     {   1,     0,    0}};
+  strcol_wrapper          col_gold_4({ "s0",  "s1",  ""},
+                                     {   1,     1,    0});
+  column_wrapper<int32_t> col_gold_5{{   2,     8,   -1},
                                      {   1,     1,    0}};
-
+  
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
@@ -477,23 +479,28 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t");
 #endif
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 
   // Repeat test with compare_nulls_equal=false,
   // as per SQL standard.
 
-  result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL);
+  result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL);
   result_sort_order = cudf::sorted_order(result->view());
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  col_gold_0 =               {{   3,    -1,    2},
-                              {   1,     0,    1}};
-  col_gold_1 = strcol_wrapper({ "s0",  "s1", "s2"},
-                              {   1,     1,    1});
-  col_gold_2 =               {{   0,     1,    2}, 
-                              {   1,     1,    1}};
-  col_gold_3 =               {{   2,    -1,   -1}, 
-                              {   1,     0,    0}};
+  
+  col_gold_0 = {{   3,    -1,    2},
+                {   1,     0,    1}};
+  col_gold_1 = {{ "s0",  "s1", "s2"},
+                {   1,     1,    1}};
+  col_gold_2 = {{   0,     1,    2},
+                {   1,     1,    1}};
+  col_gold_3 = {{   3,    -1,   -1},
+                {   1,     0,    0}};
+  col_gold_4 = {{ "s0",   "",   ""},
+                {   1,     0,    0}};
+  col_gold_5 = {{   2,    -1,   -1},
+                {   1,     0,    0}};
 
   // clang-format on
   CVector cols_gold_nulls_unequal;
@@ -506,7 +513,7 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view());
   sorted_gold     = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, InnerJoinSizeOverflow)
@@ -529,7 +536,7 @@ TEST_F(JoinTest, InnerJoinSizeOverflow)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}), cudf::logic_error);
+  EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}), cudf::logic_error);
 }
 
 TEST_F(JoinTest, InnerJoinNoNulls)
@@ -553,86 +560,28 @@ TEST_F(JoinTest, InnerJoinNoNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{3, 2, 2}};
   strcol_wrapper col_gold_1({"s1", "s0", "s0"});
   column_wrapper<int32_t> col_gold_2{{0, 2, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 0}};
+  column_wrapper<int32_t> col_gold_3{{3, 2, 2}};
+  strcol_wrapper col_gold_4({"s1", "s0", "s0"});
+  column_wrapper<int32_t> col_gold_5{{1, 0, 0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-}
-
-TEST_F(JoinTest, InnerJoinNonAlignedCommon)
-{
-  CVector cols0, cols1;
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-  cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
-  cols1.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3}}.release());
-  cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1"}).release());
-  cols1.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 1}}.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  auto result            = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}});
-  auto result_sort_order = cudf::sorted_order(result->view());
-  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-  CVector cols_gold;
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-  Table gold(std::move(cols_gold));
-
-  auto gold_sort_order = cudf::sorted_order(gold.view());
-  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-}
-
-TEST_F(JoinTest, InnerJoinNonAlignedCommonSwap)
-{
-  CVector cols0, cols1;
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-  cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
-  cols1.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3, 5}}.release());
-  cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0"}).release());
-  cols1.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 1, 0}}.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  auto result            = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}});
-  auto result_sort_order = cudf::sorted_order(result->view());
-  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-  CVector cols_gold;
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-  Table gold(std::move(cols_gold));
-
-  auto gold_sort_order = cudf::sorted_order(gold.view());
-  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, InnerJoinWithNulls)
@@ -656,37 +605,41 @@ TEST_F(JoinTest, InnerJoinWithNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
   strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
   column_wrapper<int32_t> col_gold_2{{0, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, -1}, {1, 0}};
+  column_wrapper<int32_t> col_gold_3{{3, 2}};
+  strcol_wrapper col_gold_4({"s1", "s0"}, {1, 1});
+  column_wrapper<int32_t> col_gold_5{{1, -1}, {1, 0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
-// Test to check join behaviour when join keys are null.
+// // Test to check join behaviour when join keys are null.
 TEST_F(JoinTest, InnerJoinOnNulls)
 {
   // clang-format off
   column_wrapper<int32_t> col0_0{{  3,    1,    2,    0,    2}};
-  strcol_wrapper          col0_1({"s1", "s1", "s8", "s4", "s0"}, 
+  strcol_wrapper          col0_1({"s1", "s1", "s8", "s4", "s0"},
                                  {  1,    1,    0,    1,    1});
   column_wrapper<int32_t> col0_2{{  0,    1,    2,    4,    1}};
 
   column_wrapper<int32_t> col1_0{{  2,    2,    0,    4,    3}};
-  strcol_wrapper          col1_1({"s1", "s0", "s1", "s2", "s1"}, 
+  strcol_wrapper          col1_1({"s1", "s0", "s1", "s2", "s1"},
                                  {  1,    0,    1,    1,    1});
   column_wrapper<int32_t> col1_2{{  1,    0,    1,    2,    1}};
 
@@ -701,38 +654,47 @@ TEST_F(JoinTest, InnerJoinOnNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0 {{  3,    2}};
-  strcol_wrapper          col_gold_1 ({"s1", "s0"}, 
+  strcol_wrapper          col_gold_1 ({"s1", "s0"},
                                       {  1,    0});
   column_wrapper<int32_t> col_gold_2{{   0,    2}};
-  column_wrapper<int32_t> col_gold_3{{   1,    0}};
+  column_wrapper<int32_t> col_gold_3 {{  3,    2}};
+  strcol_wrapper          col_gold_4 ({"s1", "s0"},
+                                      {  1,    0});
+  column_wrapper<int32_t> col_gold_5{{   1,    0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+  
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-  
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+
   // Repeat test with compare_nulls_equal=false,
   // as per SQL standard.
 
-  result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL);
+  result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1},  cudf::null_equality::UNEQUAL);
   result_sort_order = cudf::sorted_order(result->view());
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   col_gold_0 =               {{  3}};
-  col_gold_1 = strcol_wrapper({"s1"}, 
+  col_gold_1 = strcol_wrapper({"s1"},
                               {  1});
   col_gold_2 =               {{  0}};
-  col_gold_3 =               {{  1}};
+  col_gold_3 =               {{  3}};
+  col_gold_4 = strcol_wrapper({"s1"},
+                              {  1});
+  col_gold_5 =               {{  1}};
 
   // clang-format on
 
@@ -741,11 +703,13 @@ TEST_F(JoinTest, InnerJoinOnNulls)
   cols_gold_sql.push_back(col_gold_1.release());
   cols_gold_sql.push_back(col_gold_2.release());
   cols_gold_sql.push_back(col_gold_3.release());
+  cols_gold_sql.push_back(col_gold_4.release());
+  cols_gold_sql.push_back(col_gold_5.release());
   Table gold_sql(std::move(cols_gold_sql));
 
   gold_sort_order = cudf::sorted_order(gold_sql.view());
   sorted_gold     = cudf::gather(gold_sql.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 // Empty Left Table
@@ -766,8 +730,8 @@ TEST_F(JoinTest, EmptyLeftTableInnerJoin)
   Table empty0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result);
+  auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result);
 }
 
 TEST_F(JoinTest, EmptyLeftTableLeftJoin)
@@ -787,36 +751,8 @@ TEST_F(JoinTest, EmptyLeftTableLeftJoin)
   Table empty0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result);
-}
-
-TEST_F(JoinTest, EmptyLeftTableLeftJoinNonAlignedCommon)
-{
-  column_wrapper<int32_t> col0_0;
-
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
-
-  CVector cols0, cols1;
-  cols0.emplace_back(col0_0.release());
-  cols1.emplace_back(col1_0.release());
-  cols1.emplace_back(col1_1.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  column_wrapper<int32_t> col_gold_0;
-  column_wrapper<int32_t> col_gold_1;
-
-  CVector cols_gold;
-  cols_gold.emplace_back(col_gold_0.release());
-  cols_gold.emplace_back(col_gold_1.release());
-
-  Table gold(std::move(cols_gold));
-
-  auto result = cudf::left_join(t0, t1, {0}, {1}, {{0, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result);
 }
 
 TEST_F(JoinTest, EmptyLeftTableFullJoin)
@@ -833,11 +769,29 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin)
   cols1.push_back(col1_0.release());
   cols1.push_back(col1_1.release());
 
-  Table empty0(std::move(cols0));
-  Table t1(std::move(cols1));
+  Table lhs(std::move(cols0));
+  Table rhs(std::move(cols1));
+
+  auto result            = cudf::full_join(lhs, rhs, {0, 1}, {0, 1});
+  auto result_sort_order = cudf::sorted_order(result->view());
+  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+
+  column_wrapper<int32_t> col_gold_0{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_1{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_2{{2, 2, 0, 4, 3}};
+  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
 
-  auto result = cudf::full_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(t1, *result);
+  CVector cols_gold;
+  cols_gold.push_back(col_gold_0.release());
+  cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
+  Table gold(std::move(cols_gold));
+
+  auto gold_sort_order = cudf::sorted_order(gold.view());
+  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 // Empty Right Table
@@ -858,36 +812,8 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
-}
-
-TEST_F(JoinTest, EmptyRightTableInnerJoinNonAlignedCommon)
-{
-  column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
-
-  column_wrapper<int32_t> col1_0;
-
-  CVector cols0, cols1;
-  cols0.emplace_back(col0_0.release());
-  cols0.emplace_back(col0_1.release());
-  cols1.emplace_back(col1_0.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  column_wrapper<int32_t> col_gold_0;
-  column_wrapper<int32_t> col_gold_1;
-
-  CVector cols_gold;
-  cols_gold.emplace_back(col_gold_0.release());
-  cols_gold.emplace_back(col_gold_1.release());
-
-  Table gold(std::move(cols_gold));
-
-  auto result = cudf::inner_join(t0, t1, {1}, {0}, {{1, 0}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
 TEST_F(JoinTest, EmptyRightTableLeftJoin)
@@ -907,8 +833,8 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result);
+  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
 }
 
 TEST_F(JoinTest, EmptyRightTableFullJoin)
@@ -928,8 +854,8 @@ TEST_F(JoinTest, EmptyRightTableFullJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result);
+  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
 }
 
 // Both tables empty
@@ -950,8 +876,8 @@ TEST_F(JoinTest, BothEmptyInnerJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
+  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
 TEST_F(JoinTest, BothEmptyLeftJoin)
@@ -971,8 +897,8 @@ TEST_F(JoinTest, BothEmptyLeftJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
+  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
 TEST_F(JoinTest, BothEmptyFullJoin)
@@ -992,11 +918,11 @@ TEST_F(JoinTest, BothEmptyFullJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
+  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
-// EqualValues X Inner,Left,Full
+// // EqualValues X Inner,Left,Full
 
 TEST_F(JoinTest, EqualValuesInnerJoin)
 {
@@ -1015,16 +941,22 @@ TEST_F(JoinTest, EqualValuesInnerJoin)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
 
   column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}};
   strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"});
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
+
   Table gold(std::move(cols_gold));
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result);
 }
 
 TEST_F(JoinTest, EqualValuesLeftJoin)
@@ -1044,16 +976,21 @@ TEST_F(JoinTest, EqualValuesLeftJoin)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1});
 
   column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}};
   strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}, {1, 1, 1, 1}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1});
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
   Table gold(std::move(cols_gold));
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result);
 }
 
 TEST_F(JoinTest, EqualValuesFullJoin)
@@ -1073,16 +1010,21 @@ TEST_F(JoinTest, EqualValuesFullJoin)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1});
 
   column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}};
   strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"});
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
   Table gold(std::move(cols_gold));
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result);
 }
 
 TEST_F(JoinTest, InnerJoinCornerCase)
@@ -1097,18 +1039,20 @@ TEST_F(JoinTest, InnerJoinCornerCase)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}});
+  auto result            = cudf::inner_join(t0, t1, {0}, {0});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int64_t> col_gold_0{{2, 2, 2, 2}};
+  column_wrapper<int64_t> col_gold_1{{2, 2, 2, 2}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
+  cols_gold.push_back(col_gold_1.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, HashJoinSequentialProbes)
@@ -1116,129 +1060,106 @@ TEST_F(JoinTest, HashJoinSequentialProbes)
   CVector cols1;
   cols1.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3}}.release());
   cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release());
-  cols1.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 1}}.release());
 
   Table t1(std::move(cols1));
 
-  cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL);
+  cudf::hash_join hash_join(t1, cudf::null_equality::EQUAL);
 
   {
     CVector cols0;
     cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 3}}.release());
     cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
 
     Table t0(std::move(cols0));
 
-    auto result            = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}});
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+    auto result = hash_join.full_join(t0);
+
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
+
+    column_wrapper<int32_t> col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}};
+    column_wrapper<int32_t> col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}};
 
     CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release());
-    cols_gold.emplace_back(
-      strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release());
-    cols_gold.emplace_back(
-      column_wrapper<int32_t>{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}
-        .release());
-    cols_gold.emplace_back(
-      column_wrapper<int32_t>{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}
-        .release());
-    Table gold(std::move(cols_gold));
+    cols_gold.push_back(col_gold_0.release());
+    cols_gold.push_back(col_gold_1.release());
 
+    Table gold(std::move(cols_gold));
     auto gold_sort_order = cudf::sorted_order(gold.view());
     auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 
   {
     CVector cols0;
     cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 3}}.release());
     cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
 
     Table t0(std::move(cols0));
 
-    auto result            = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}});
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-    CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}.release());
-    cols_gold.emplace_back(
-      strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}.release());
-    Table gold(std::move(cols_gold));
-
-    auto gold_sort_order = cudf::sorted_order(gold.view());
-    auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-  }
-
-  {
-    CVector cols0;
-    cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-    cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-    cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
-
-    Table t0(std::move(cols0));
+    auto result = hash_join.left_join(t0);
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
 
-    auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}});
-    auto joined_cols      = probe_build_pair.first->release();
-    auto build_cols       = probe_build_pair.second->release();
-    joined_cols.insert(joined_cols.end(),
-                       std::make_move_iterator(build_cols.begin()),
-                       std::make_move_iterator(build_cols.end()));
-    auto result            = std::make_unique<cudf::table>(std::move(joined_cols));
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+    column_wrapper<int32_t> col_gold_0{{0, 1, 2, 3, 4}};
+    column_wrapper<int32_t> col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}};
 
     CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-    Table gold(std::move(cols_gold));
+    cols_gold.push_back(col_gold_0.release());
+    cols_gold.push_back(col_gold_1.release());
 
+    Table gold(std::move(cols_gold));
     auto gold_sort_order = cudf::sorted_order(gold.view());
     auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 
   {
     CVector cols0;
     cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-    cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
     cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
 
     Table t0(std::move(cols0));
 
-    auto probe_build_pair = hash_join.inner_join(
-      t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD);
-    auto joined_cols = probe_build_pair.second->release();
-    auto probe_cols  = probe_build_pair.first->release();
-    joined_cols.insert(joined_cols.end(),
-                       std::make_move_iterator(probe_cols.begin()),
-                       std::make_move_iterator(probe_cols.end()));
-    auto result            = std::make_unique<cudf::table>(std::move(joined_cols));
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+    auto result = hash_join.inner_join(t0);
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
+
+    column_wrapper<int32_t> col_gold_0{{2, 4, 0}};
+    column_wrapper<int32_t> col_gold_1{{1, 1, 4}};
 
     CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-    Table gold(std::move(cols_gold));
+    cols_gold.push_back(col_gold_0.release());
+    cols_gold.push_back(col_gold_1.release());
 
+    Table gold(std::move(cols_gold));
     auto gold_sort_order = cudf::sorted_order(gold.view());
     auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 }
 
@@ -1262,7 +1183,7 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls)
   auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2});
   auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2});
   {
-    auto result      = cudf::left_join(t0, t1, {0}, {0}, {});
+    auto result      = cudf::left_join(t0, t1, {0}, {0});
     auto result_view = result->view();
     auto decoded1    = cudf::dictionary::decode(result_view.column(1));
     auto decoded4    = cudf::dictionary::decode(result_view.column(4));
@@ -1273,18 +1194,8 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls)
                                                    decoded4->view(),
                                                    result_view.column(5)});
 
-    auto gold = cudf::left_join(g0, g1, {0}, {0}, {});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
-  }
-  {
-    auto result      = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-    auto result_view = result->view();
-    auto decoded1    = cudf::dictionary::decode(result_view.column(1));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)});
-
-    auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+    auto gold = cudf::left_join(g0, g1, {0}, {0});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
   }
 }
 
@@ -1303,17 +1214,21 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls)
   auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()});
   auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()});
 
-  auto result      = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded2    = cudf::dictionary::decode(result_view.column(2));
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()});
+  auto decoded5    = cudf::dictionary::decode(result_view.column(5));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 result_view.column(1),
+                                                 decoded2->view(),
+                                                 result_view.column(3),
+                                                 result_view.column(4),
+                                                 decoded5->view()});
 
   auto g0   = cudf::table_view({col0_0, col0_1, col0_2_w});
   auto g1   = cudf::table_view({col1_0, col1_1, col1_2_w});
-  auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+  auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
 TEST_F(JoinDictionaryTest, InnerJoinNoNulls)
@@ -1331,15 +1246,20 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls)
   auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2});
   auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2});
 
-  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded1    = cudf::dictionary::decode(result_view.column(1));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)});
+  auto decoded4    = cudf::dictionary::decode(result_view.column(4));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 decoded1->view(),
+                                                 result_view.column(2),
+                                                 result_view.column(3),
+                                                 decoded4->view(),
+                                                 result_view.column(5)});
 
   auto g0   = cudf::table_view({col0_0, col0_1_w, col0_2});
   auto g1   = cudf::table_view({col1_0, col1_1_w, col1_2});
-  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1});
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
@@ -1358,16 +1278,20 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls)
   auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()});
   auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()});
 
-  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded2    = cudf::dictionary::decode(result_view.column(2));
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()});
+  auto decoded5    = cudf::dictionary::decode(result_view.column(5));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 result_view.column(1),
+                                                 decoded2->view(),
+                                                 result_view.column(3),
+                                                 result_view.column(4),
+                                                 decoded5->view()});
 
   auto g0   = cudf::table_view({col0_0, col0_1, col0_2_w});
   auto g1   = cudf::table_view({col1_0, col1_1, col1_2_w});
-  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1});
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
@@ -1386,16 +1310,21 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls)
   auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2});
   auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2});
 
-  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded1    = cudf::dictionary::decode(result_view.column(1));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)});
+  auto decoded4    = cudf::dictionary::decode(result_view.column(4));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 decoded1->view(),
+                                                 result_view.column(2),
+                                                 result_view.column(3),
+                                                 decoded4->view(),
+                                                 result_view.column(5)});
 
   auto g0   = cudf::table_view({col0_0, col0_1_w, col0_2});
   auto g1   = cudf::table_view({col1_0, col1_1_w, col1_2});
-  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
 TEST_F(JoinDictionaryTest, FullJoinWithNulls)
@@ -1413,16 +1342,21 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls)
   auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2});
   auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2});
 
-  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded0    = cudf::dictionary::decode(result_view.column(0));
-  std::vector<cudf::column_view> result_decoded(
-    {decoded0->view(), result_view.column(1), result_view.column(2), result_view.column(3)});
+  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
+  std::vector<cudf::column_view> result_decoded({decoded0->view(),
+                                                 result_view.column(1),
+                                                 result_view.column(2),
+                                                 decoded3->view(),
+                                                 result_view.column(4),
+                                                 result_view.column(5)});
 
   auto g0   = cudf::table_view({col0_0_w, col0_1, col0_2});
   auto g1   = cudf::table_view({col1_0_w, col1_1, col1_2});
-  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp
index 13c74616484..8de9610b07d 100644
--- a/cpp/tests/join/semi_join_tests.cpp
+++ b/cpp/tests/join/semi_join_tests.cpp
@@ -20,6 +20,7 @@
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
@@ -33,809 +34,3 @@ using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;
 
 struct JoinTest : public cudf::test::BaseFixture {
 };
-
-TEST_F(JoinTest, LeftSemiJoin)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"quick", "composéd", "result", ""};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20, 20, 20};
-  column_wrapper<float> expect_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{90, 61, 62, 63};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiJoin_with_a_string_key)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"quick", "result"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20};
-  column_wrapper<float> expect_1{5.0, .7};
-  column_wrapper<int8_t> expect_2{90, 62};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiJoin_with_null)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{"quick", "result"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20};
-  column_wrapper<float> expect_1{5.0, .7};
-  column_wrapper<int8_t> expect_2{90, 62};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"accénted", "turtlé", "words"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{20, 20, 50};
-  column_wrapper<float> expect_1{.5, .5, .7};
-  column_wrapper<int8_t> expect_2{77, 78, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_with_a_string_key)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"accénted", "turtlé", "composéd", "", "words"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{20, 20, 20, 20, 50};
-  column_wrapper<float> expect_1{.5, .5, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{77, 78, 61, 63, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_with_null)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{"accénted", "turtlé", "composéd", "", "words"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{20, 20, 20, 20, 50};
-  column_wrapper<float> expect_1{.5, .5, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{77, 78, 61, 63, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiAntiJoin_exceptions)
-{
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  //
-  //  table_a has no columns, table_b has columns
-  //  Let's check different permutations of passing table
-  //  with no columns to verify that exceptions are thrown
-  //
-  EXPECT_THROW(cudf::left_semi_join(table_a, table_b, {}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_a, table_b, {}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_semi_join(table_b, table_a, {}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_b, table_a, {}, {}, {}), cudf::logic_error);
-
-  //
-  //  table_b has columns, so we'll pass the column checks, but
-  //  these should fail the exception check that the number of
-  //  join columns must be the same for each table
-  //
-  EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {0}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {0}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {}, {0}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {}, {0}, {}), cudf::logic_error);
-}
-
-TEST_F(JoinTest, LeftSemiJoin_empty_result)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{};
-  column_wrapper<float> expect_1{};
-  column_wrapper<int8_t> expect_2{};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {});
-
-  EXPECT_EQ(join_table->num_columns(), 0);
-  EXPECT_EQ(join_table->num_rows(), 0);
-
-  auto join_table2 = cudf::left_semi_join(table_a, table_b, {}, {}, {0, 1, 3});
-
-  EXPECT_EQ(join_table2->num_columns(), 3);
-  EXPECT_EQ(join_table2->num_rows(), 0);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_empty_result)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{};
-  column_wrapper<float> expect_1{};
-  column_wrapper<int8_t> expect_2{};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {});
-
-  EXPECT_EQ(join_table->num_columns(), 0);
-  EXPECT_EQ(join_table->num_rows(), 0);
-
-  auto join_table2 = cudf::left_anti_join(table_a, table_b, {}, {}, {0, 1, 3});
-
-  EXPECT_EQ(join_table2->num_columns(), 3);
-  EXPECT_EQ(join_table2->num_rows(), 0);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiAntiJoin_empty_table)
-{
-  std::vector<const char*> a_strings{};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{};
-
-  column_wrapper<int32_t> a_0{};
-  column_wrapper<float> a_1{};
-  column_wrapper<int8_t> a_2{};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{};
-  column_wrapper<float> expect_1{};
-  column_wrapper<int8_t> expect_2{};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table->get_column(3), expect_3);
-
-  auto join_table2 = cudf::left_semi_join(table_b, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(3), expect_3);
-
-  auto join_table3 = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table3->get_column(3), expect_3);
-
-  auto join_table4 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table4->get_column(3), expect_3);
-
-  auto join_table5 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table5->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_empty_right_table)
-{
-  std::vector<const char*> a_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> b_strings{};
-  std::vector<const char*> e_strings{"quick", "words", "result", nullptr};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{};
-  column_wrapper<float> b_1{};
-  column_wrapper<int8_t> b_2{};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20, 20, 50};
-  column_wrapper<float> expect_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-struct JoinDictionaryTest : public cudf::test::BaseFixture {
-};
-
-TEST_F(JoinDictionaryTest, LeftSemiJoin)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a  = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b  = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  {
-    auto result      = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected);
-  }
-  {
-    auto result      = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-  }
-}
-
-TEST_F(JoinDictionaryTest, LeftSemiJoinWithNulls)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-
-  auto result      = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  auto result_view = result->view();
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-}
-
-TEST_F(JoinDictionaryTest, LeftAntiJoin)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a  = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b  = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  {
-    auto result      = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected);
-  }
-  {
-    auto result      = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-  }
-}
-
-TEST_F(JoinDictionaryTest, LeftAntiJoinWithNulls)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-
-  auto result      = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  auto result_view = result->view();
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-}
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index e5501428624..4c72ba2e055 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -134,11 +134,16 @@ def copy_range(Column input_column,
                            input_begin, input_end, target_begin)
 
 
-def gather(Table source_table, Column gather_map, bool keep_index=True):
+def gather(
+    Table source_table,
+    Column gather_map,
+    bool keep_index=True,
+    bool nullify=False
+):
     if not pd.api.types.is_integer_dtype(gather_map.dtype):
         raise ValueError("Gather map is not integer dtype.")
 
-    if len(gather_map) > 0:
+    if len(gather_map) > 0 and not nullify:
         gm_min, gm_max = minmax(gather_map)
         if gm_min < -len(source_table) or gm_max >= len(source_table):
             raise IndexError(f"Gather map index with min {gm_min},"
@@ -154,7 +159,8 @@ def gather(Table source_table, Column gather_map, bool keep_index=True):
         source_table_view = source_table.data_view()
     cdef column_view gather_map_view = gather_map.view()
     cdef cpp_copying.out_of_bounds_policy policy = (
-        cpp_copying.out_of_bounds_policy.DONT_CHECK
+        cpp_copying.out_of_bounds_policy.NULLIFY if nullify
+        else cpp_copying.out_of_bounds_policy.DONT_CHECK
     )
 
     with nogil:
diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd
index 10edf370f5d..c221fea926d 100644
--- a/python/cudf/cudf/_lib/cpp/join.pxd
+++ b/python/cudf/cudf/_lib/cpp/join.pxd
@@ -4,44 +4,40 @@ from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
 from libcpp cimport bool
+from libcpp.pair cimport pair
+from libcpp.memory cimport unique_ptr
 
+from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
+from rmm._lib.device_uvector cimport device_uvector
 
 
+ctypedef unique_ptr[device_uvector[size_type]] gather_map_type
+
 cdef extern from "cudf/join.hpp" namespace "cudf" nogil:
-    cdef unique_ptr[table] inner_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[pair[int, int]] columns_in_common
+    cdef pair[gather_map_type, gather_map_type] inner_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] left_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[pair[int, int]] columns_in_common
+
+    cdef pair[gather_map_type, gather_map_type] left_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] full_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[pair[int, int]] columns_in_common
+
+    cdef pair[gather_map_type, gather_map_type] full_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] left_semi_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[int] return_columns
+
+    cdef gather_map_type left_semi_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] left_anti_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[int] return_columns
+
+    cdef gather_map_type left_anti_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
diff --git a/python/cudf/cudf/_lib/cpp/table/table_view.pxd b/python/cudf/cudf/_lib/cpp/table/table_view.pxd
index 2f386d337cd..7bbfa69836c 100644
--- a/python/cudf/cudf/_lib/cpp/table/table_view.pxd
+++ b/python/cudf/cudf/_lib/cpp/table/table_view.pxd
@@ -15,6 +15,7 @@ cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil:
         column_view column(size_type column_index) except +
         size_type num_columns() except +
         size_type num_rows() except +
+        table_view select(vector[size_type] column_indices) except +
 
     cdef cppclass mutable_table_view:
         mutable_table_view() except +
diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx
index 38f13b9f994..69b8004cede 100644
--- a/python/cudf/cudf/_lib/join.pyx
+++ b/python/cudf/cudf/_lib/join.pyx
@@ -1,222 +1,88 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
+import cudf
+
 from collections import OrderedDict
 from itertools import chain
 
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport unique_ptr, make_unique
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
 from libcpp cimport bool
 
+from cudf._lib.column cimport Column
 from cudf._lib.table cimport Table, columns_from_ptr
 
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.types cimport size_type, data_type, type_id
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 cimport cudf._lib.cpp.join as cpp_join
 
-cpdef join(Table lhs,
-           Table rhs,
-           object how,
-           object method,
-           object left_on=None,
-           object right_on=None,
-           bool left_index=False,
-           bool right_index=False
-           ):
-    """
-    Call libcudf++ join for full outer, inner and left joins.
-    """
-
-    cdef Table c_lhs = lhs
-    cdef Table c_rhs = rhs
-
-    # Views might or might not include index
-    cdef table_view lhs_view
-    cdef table_view rhs_view
-
-    # Will hold the join column indices into L and R tables
-    cdef vector[int] left_on_ind
-    cdef vector[int] right_on_ind
-
-    # If left/right index, will pass a full view
-    # must offset the data column indices by # of index columns
-    num_inds_left = len(left_on) + (lhs._num_indices * left_index)
-    num_inds_right = len(right_on) + (rhs._num_indices * right_index)
-    left_on_ind.reserve(num_inds_left)
-    right_on_ind.reserve(num_inds_right)
-
-    # Only used for semi or anti joins
-    # The result columns are only the left hand columns
-    cdef vector[int] all_left_inds = range(
-        lhs._num_columns + (lhs._num_indices * left_index)
-    )
-    cdef vector[int] all_right_inds = range(
-        rhs._num_columns + (rhs._num_indices * right_index)
-    )
 
-    result_col_names = compute_result_col_names(lhs, rhs, how)
-
-    columns_in_common = OrderedDict()
-    cdef vector[pair[int, int]] c_columns_in_common
-
-    # keep track of where the desired index column will end up
-    result_index_pos = None
-    if left_index or right_index:
-        # If either true, we need to process both indices as columns
-        lhs_view = c_lhs.view()
-        rhs_view = c_rhs.view()
-
-        left_join_cols = list(lhs._index_names) + list(lhs._data.keys())
-        right_join_cols = list(rhs._index_names) + list(rhs._data.keys())
-        if left_index and right_index:
-            # Index columns will be common, on the left, dropped from right
-            # Index name is from the left
-            # Both views, must take index column indices
-            left_on_indices = right_on_indices = range(lhs._num_indices)
-            result_idx_positions = range(lhs._num_indices)
-            result_index_names = lhs._index_names
-
-        elif left_index:
-            # Joins left index columns with right 'on' columns
-            left_on_indices = range(lhs._num_indices)
-            right_on_indices = [
-                right_join_cols.index(on_col) for on_col in right_on
-            ]
-
-            # The left index columns 'become' the new RHS columns
-            # and the right index 'survives'
-            result_idx_positions = range(
-                len(left_join_cols), len(left_join_cols) + lhs._num_indices
-            )
-            result_index_names = rhs._index_names
-
-            # but since the common columns are gathered from the left
-            # the rhs 'on' cols are returned on the left of the result
-            # rearrange the names so account for this
-            common = [None] * rhs._num_indices
-            for i in range(rhs._num_indices):
-                common[i] = result_col_names.pop(
-                    result_col_names.index(right_on[i])
-                )
-            result_col_names = common + result_col_names
-        elif right_index:
-            # Joins right index columns with left 'on' columns
-            right_on_indices = range(rhs._num_indices)
-            left_on_indices = [
-                left_join_cols.index(on_col) for on_col in left_on
-            ]
-
-            # The right index columns 'become' the new LHS columns
-            # and the left index survives
-            # since they are already gathered from the left,
-            # no rearranging has to be done
-            result_idx_positions = range(lhs._num_indices)
-            result_index_names = lhs._index_names
-        for i_l, i_r in zip(left_on_indices, right_on_indices):
-            left_on_ind.push_back(i_l)
-            right_on_ind.push_back(i_r)
-            columns_in_common[(i_l, i_r)] = None
-    else:
-        # cuDF's Python layer will create a new RangeIndex for this case
-        lhs_view = c_lhs.data_view()
-        rhs_view = c_rhs.data_view()
-
-        left_join_cols = list(lhs._data.keys())
-        right_join_cols = list(rhs._data.keys())
-
-    # If both left/right_index, joining on indices plus additional cols
-    # If neither, joining on just cols, not indices
-    # In both cases, must match up additional column indices in lhs/rhs
-    if left_index == right_index:
-        for name in left_on:
-            left_on_ind.push_back(left_join_cols.index(name))
-            if name in right_on:
-                if (left_on.index(name) == right_on.index(name)):
-                    columns_in_common[(
-                        left_join_cols.index(name),
-                        right_join_cols.index(name)
-                    )] = None
-        for name in right_on:
-            right_on_ind.push_back(right_join_cols.index(name))
-    c_columns_in_common = list(columns_in_common.keys())
-    cdef unique_ptr[table] c_result
-    if how == 'inner':
-        with nogil:
-            c_result = move(cpp_join.inner_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                c_columns_in_common
-            ))
-    elif how == 'left':
-        with nogil:
-            c_result = move(cpp_join.left_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                c_columns_in_common
-            ))
-    elif how == 'outer':
-        with nogil:
-            c_result = move(cpp_join.full_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                c_columns_in_common
-            ))
-    elif how == 'leftsemi':
-        with nogil:
-            c_result = move(cpp_join.left_semi_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                all_left_inds
-            ))
-    elif how == 'leftanti':
-        with nogil:
-            c_result = move(cpp_join.left_anti_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                all_left_inds
-            ))
-
-    all_cols_py = columns_from_ptr(move(c_result))
-    if left_index or right_index:
-        ind_cols = OrderedDict()
-        for name, pos in zip(
-            result_index_names[::-1], result_idx_positions[::-1]
-        ):
-            ind_cols[name] = all_cols_py.pop(pos)
-        index = OrderedDict()
-        for k, v in reversed(ind_cols.items()):
-            index[k] = v
-        index = Table(index)
+# The functions below return the *gathermaps* that represent
+# the join result when joining on the keys `lhs` and `rhs`.
+
+cpdef join(Table lhs, Table rhs, how=None):
+    cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
+    cdef table_view c_lhs = lhs.view()
+    cdef table_view c_rhs = rhs.view()
+
+    if how == "inner":
+        c_result = move(cpp_join.inner_join(
+            c_lhs,
+            c_rhs
+        ))
+    elif how == "left":
+        c_result = move(cpp_join.left_join(
+            c_lhs,
+            c_rhs
+        ))
+    elif how == "outer":
+        c_result = move(cpp_join.full_join(
+            c_lhs,
+            c_rhs
+        ))
     else:
-        index = None
-    data_ordered_dict = OrderedDict(zip(result_col_names, all_cols_py))
-    return Table(data=data_ordered_dict, index=index)
-
-
-def compute_result_col_names(lhs, rhs, how):
-    """
-    Determine the names of the data columns in the result of
-    a libcudf join, based on the original left and right frames
-    as well as the type of join that was performed.
-    """
-    if how in {"left", "inner", "outer", "leftsemi", "leftanti"}:
-        a = lhs._data.keys()
-        if how not in {"leftsemi", "leftanti"}:
-            return list(chain(a, (k for k in rhs._data.keys()
-                        if k not in lhs._data.keys())))
-        return list(a)
+        raise ValueError(f"Invalid join type {how}")
+
+    cdef Column left_rows = _gather_map_as_column(move(c_result.first))
+    cdef Column right_rows = _gather_map_as_column(move(c_result.second))
+    return left_rows, right_rows
+
+
+cpdef semi_join(Table lhs, Table rhs, how=None):
+    # left-semi and left-anti joins
+    cdef cpp_join.gather_map_type c_result
+    cdef table_view c_lhs = lhs.view()
+    cdef table_view c_rhs = rhs.view()
+
+    if how == "leftsemi":
+        c_result = move(cpp_join.left_semi_join(
+            c_lhs,
+            c_rhs
+        ))
+    elif how == "leftanti":
+        c_result = move(cpp_join.left_anti_join(
+            c_lhs,
+            c_rhs
+        ))
     else:
-        raise NotImplementedError(
-            f"{how} merge not supported yet"
-        )
+        raise ValueError(f"Invalid join type {how}")
+
+    cdef Column left_rows = _gather_map_as_column(move(c_result))
+    return (
+        left_rows,
+        None
+    )
+
+
+cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):
+    # helper to convert a gather map to a Column
+    cdef size_type size = gather_map.get()[0].size()
+    cdef unique_ptr[column] c_col = make_unique[column](
+        data_type(type_id.INT32),
+        size,
+        gather_map.get()[0].release())
+    return Column.from_unique_ptr(move(c_col))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 39c278d2abf..bb1bf3c5d5c 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -750,6 +750,9 @@ def _set_categories(
             ordered=ordered,
         )
 
+    def _decategorize(self) -> ColumnBase:
+        return self._column._get_decategorized_column()
+
 
 class CategoricalColumn(column.ColumnBase):
     """Implements operations for Columns of Categorical type
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index dd06d97d105..e59b395ec0f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -827,7 +827,12 @@ def quantile(
     def median(self, skipna: bool = None) -> ScalarLike:
         raise TypeError(f"cannot perform median with type {self.dtype}")
 
-    def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T:
+    def take(
+        self: T,
+        indices: ColumnBase,
+        keep_index: bool = True,
+        nullify: bool = False,
+    ) -> T:
         """Return Column by taking values from the corresponding *indices*.
         """
         # Handle zero size
@@ -836,7 +841,7 @@ def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T:
         try:
             return (
                 self.as_frame()
-                ._gather(indices, keep_index=keep_index)
+                ._gather(indices, keep_index=keep_index, nullify=nullify)
                 ._as_column()
             )
         except RuntimeError as e:
@@ -1004,7 +1009,9 @@ def sort_by_values(
         ascending: bool = True,
         na_position: builtins.str = "last",
     ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]:
-        col_inds = self.as_frame()._get_sorted_inds(ascending, na_position)
+        col_inds = self.as_frame()._get_sorted_inds(
+            ascending=ascending, na_position=na_position
+        )
         col_keys = self.take(col_inds)
         return col_keys, col_inds
 
@@ -1016,6 +1023,9 @@ def distinct_count(
             raise NotImplementedError(msg)
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
+    def can_cast_safely(self, to_dtype: Dtype) -> bool:
+        raise NotImplementedError()
+
     def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
         if is_numerical_dtype(dtype):
             return self.as_numerical_column(dtype)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 7ad6eed65a8..da77517c75d 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -362,7 +362,9 @@ def _numeric_quantile(
     ) -> NumericalColumn:
         quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q
         # get sorted indices and exclude nulls
-        sorted_indices = self.as_frame()._get_sorted_inds(True, "first")
+        sorted_indices = self.as_frame()._get_sorted_inds(
+            ascending=True, na_position="first"
+        )
         sorted_indices = sorted_indices[self.null_count :]
 
         return cpp_quantile(self, quant, interpolation, sorted_indices, exact)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b5f57356698..01b96151485 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4497,12 +4497,9 @@ def merge(
         else:
             lsuffix, rsuffix = suffixes
 
-        lhs = self.copy(deep=False)
-        rhs = right.copy(deep=False)
-
         # Compute merge
-        gdf_result = super(DataFrame, lhs)._merge(
-            rhs,
+        gdf_result = super()._merge(
+            right,
             on=on,
             left_on=left_on,
             right_on=right_on,
@@ -4510,8 +4507,6 @@ def merge(
             right_index=right_index,
             how=how,
             sort=sort,
-            lsuffix=lsuffix,
-            rsuffix=rsuffix,
             method=method,
             indicator=indicator,
             suffixes=suffixes,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ecff3dee573..fb746d6c794 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -20,6 +20,7 @@
 from cudf import _lib as libcudf
 from cudf._typing import ColumnLike, DataFrameOrSeries
 from cudf.core.column import as_column, build_categorical_column, column_empty
+from cudf.core.join import merge
 from cudf.utils.dtypes import (
     is_categorical_dtype,
     is_column_like,
@@ -595,7 +596,7 @@ def _explode(self, explode_column: Any, ignore_index: bool):
             res.index.names = self._index.names
         return res
 
-    def _get_columns_by_label(self, labels, downcast):
+    def _get_columns_by_label(self, labels, downcast=False):
         """
         Returns columns of the Frame specified by `labels`
 
@@ -612,15 +613,18 @@ def _get_columns_by_index(self, indices):
             data, columns=data.to_pandas_index(), index=self.index
         )
 
-    def _gather(self, gather_map, keep_index=True):
+    def _gather(self, gather_map, keep_index=True, nullify=False):
         if not pd.api.types.is_integer_dtype(gather_map.dtype):
             gather_map = gather_map.astype("int32")
         result = self.__class__._from_table(
             libcudf.copying.gather(
-                self, as_column(gather_map), keep_index=keep_index
+                self,
+                as_column(gather_map),
+                keep_index=keep_index,
+                nullify=nullify,
             )
         )
-        result._copy_type_metadata(self)
+        result._copy_type_metadata(self, include_index=keep_index)
         if keep_index and self._index is not None:
             result._index.names = self._index.names
         return result
@@ -2754,12 +2758,15 @@ def searchsorted(
         else:
             return result
 
-    def _get_sorted_inds(self, ascending=True, na_position="last"):
+    def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
         """
         Sort by the values.
 
         Parameters
         ----------
+        by : list, optional
+            Labels specifying columns to sort by. By default,
+            sort by all columns of `self`
         ascending : bool or list of bool, default True
             If True, sort values in ascending order, otherwise descending.
         na_position : {‘first’ or ‘last’}, default ‘last’
@@ -2794,11 +2801,17 @@ def _get_sorted_inds(self, ascending=True, na_position="last"):
             )
             na_position = 0
 
+        to_sort = (
+            self
+            if by is None
+            else self._get_columns_by_label(by, downcast=False)
+        )
+
         # If given a scalar need to construct a sequence of length # of columns
         if np.isscalar(ascending):
-            ascending = [ascending] * self._num_columns
+            ascending = [ascending] * to_sort._num_columns
 
-        return libcudf.sort.order_by(self, ascending, na_position)
+        return libcudf.sort.order_by(to_sort, ascending, na_position)
 
     def sin(self):
         """
@@ -3329,77 +3342,6 @@ def sqrt(self):
         """
         return self._unaryop("sqrt")
 
-    @staticmethod
-    def _validate_merge_cfg(
-        lhs,
-        rhs,
-        left_on,
-        right_on,
-        on,
-        how,
-        left_index=False,
-        right_index=False,
-        lsuffix=None,
-        rsuffix=None,
-    ):
-        """
-        Error for various combinations of merge input parameters
-        """
-        len_left_on = len(left_on) if left_on is not None else 0
-        len_right_on = len(right_on) if right_on is not None else 0
-
-        # must actually support the requested merge type
-        if how not in ["left", "inner", "outer", "leftanti", "leftsemi"]:
-            raise NotImplementedError(f"{how} merge not supported yet")
-
-        # Passing 'on' with 'left_on' or 'right_on' is potentially ambiguous
-        if on:
-            if left_on or right_on:
-                raise ValueError(
-                    'Can only pass argument "on" OR "left_on" '
-                    'and "right_on", not a combination of both.'
-                )
-
-        # Require same total number of columns to join on in both operands
-        if not (len_left_on + left_index * len(lhs.index.names)) == (
-            len_right_on + right_index * len(rhs.index.names)
-        ):
-            raise ValueError(
-                "Merge operands must have same number of join key columns"
-            )
-
-        # If nothing specified, must have common cols to use implicitly
-        same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys())
-        if not (left_index or right_index):
-            if not (left_on or right_on):
-                if len(same_named_columns) == 0:
-                    raise ValueError("No common columns to perform merge on")
-
-        for name in same_named_columns:
-            if not (
-                name in left_on
-                and name in right_on
-                and (left_on.index(name) == right_on.index(name))
-            ):
-                if not (lsuffix or rsuffix):
-                    raise ValueError(
-                        "there are overlapping columns but "
-                        "lsuffix and rsuffix are not defined"
-                    )
-
-        if on:
-            on_keys = [on] if not isinstance(on, list) else on
-            for key in on_keys:
-                if not (key in lhs._data.keys() and key in rhs._data.keys()):
-                    raise KeyError(f"Key {on} not in both operands")
-        else:
-            for key in left_on:
-                if key not in lhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in left operand')
-            for key in right_on:
-                if key not in rhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in right operand')
-
     def _merge(
         self,
         right,
@@ -3410,84 +3352,33 @@ def _merge(
         right_index=False,
         how="inner",
         sort=False,
-        lsuffix=None,
-        rsuffix=None,
         method="hash",
         indicator=False,
         suffixes=("_x", "_y"),
     ):
-        # Merge doesn't support right, so just swap
+        lhs, rhs = self, right
         if how == "right":
-            return right._merge(
-                self,
-                on=on,
-                left_on=right_on,
-                right_on=left_on,
-                left_index=right_index,
-                right_index=left_index,
-                how="left",
-                sort=sort,
-                lsuffix=rsuffix,
-                rsuffix=lsuffix,
-                method=method,
-                indicator=indicator,
-                suffixes=suffixes,
-            )
-
-        lhs = self
-        rhs = right
-
-        from cudf.core.join import Merge
-
-        mergeop = Merge(
+            # Merge doesn't support right, so just swap
+            how = "left"
+            lhs, rhs = right, self
+            left_on, right_on = right_on, left_on
+            left_index, right_index = right_index, left_index
+            suffixes = (suffixes[1], suffixes[0])
+
+        return merge(
             lhs,
             rhs,
-            on,
-            left_on,
-            right_on,
-            left_index,
-            right_index,
-            how,
-            sort,
-            lsuffix,
-            rsuffix,
-            method,
-            indicator,
-            suffixes,
+            on=on,
+            left_on=left_on,
+            right_on=right_on,
+            left_index=left_index,
+            right_index=right_index,
+            how=how,
+            sort=sort,
+            method=method,
+            indicator=indicator,
+            suffixes=suffixes,
         )
-        to_return = mergeop.perform_merge()
-
-        # If sort=True, Pandas would sort on the key columns in the
-        # same order as given in 'on'. If the indices are used as
-        # keys, the index will be sorted. If one index is specified,
-        # the key column on the other side will be used to sort.
-        # If no index is specified, return a new RangeIndex
-        if sort:
-            to_sort = cudf.DataFrame()
-            if left_index and right_index:
-                by = list(to_return._index._data.columns)
-                if left_on and right_on:
-                    by.extend(to_return[mergeop.left_on]._data.columns)
-            elif left_index:
-                by = list(to_return[mergeop.right_on]._data.columns)
-            elif right_index:
-                by = list(to_return[mergeop.left_on]._data.columns)
-            else:
-                # left_on == right_on, or different names but same columns
-                # in both cases we can sort by either
-                by = [to_return._data[name] for name in mergeop.left_on]
-            for i, col in enumerate(by):
-                to_sort[i] = col
-            inds = to_sort.argsort()
-            if isinstance(to_return, cudf.Index):
-                to_return = to_return.take(inds)
-            else:
-                to_return = to_return.take(
-                    inds, keep_index=(left_index or right_index)
-                )
-            return to_return
-        else:
-            return to_return
 
     def _is_sorted(self, ascending=None, null_position=None):
         """
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 2a5d2647e95..5104629eee0 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -13,6 +13,7 @@
 from pandas._config import get_option
 
 import cudf
+from cudf._typing import DtypeObj
 from cudf.core.abc import Serializable
 from cudf.core.column import (
     CategoricalColumn,
@@ -66,6 +67,9 @@ def _to_frame(this_index, index=True, name=None):
 
 
 class Index(Frame, Serializable):
+
+    dtype: DtypeObj
+
     def __new__(
         cls,
         data=None,
@@ -1544,6 +1548,10 @@ def _from_table(cls, table):
         else:
             return as_index(table)
 
+    @classmethod
+    def _from_data(cls, data, index=None):
+        return cls._from_table(Frame(data=data))
+
     _accessors = set()  # type: Set[Any]
 
     @property
diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py
index 6d126c8af4d..0463b8f9df1 100644
--- a/python/cudf/cudf/core/join/__init__.py
+++ b/python/cudf/cudf/core/join/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
-from cudf.core.join.join import Merge
+from cudf.core.join.join import merge
diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py
new file mode 100644
index 00000000000..3807f408369
--- /dev/null
+++ b/python/cudf/cudf/core/join/_join_helpers.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+from __future__ import annotations
+
+import collections
+import warnings
+from typing import TYPE_CHECKING, Any, Iterable, Tuple
+
+import numpy as np
+import pandas as pd
+
+import cudf
+from cudf.core.dtypes import CategoricalDtype
+
+if TYPE_CHECKING:
+    from cudf.core.column import CategoricalColumn, ColumnBase
+    from cudf.core.frame import Frame
+
+
+class _Indexer:
+    # Indexer into a column (either a data column or index level).
+    #
+    # >>> df
+    #    a
+    # b
+    # 4  1
+    # 5  2
+    # 6  3
+    # >>> _Indexer("a", column=True).get(df)  # returns column "a" of df
+    # >>> _Indexer("b", index=True).get(df)  # returns index level "b" of df
+
+    def __init__(self, name: Any, column=False, index=False):
+        if column and index:
+            raise ValueError("Cannot specify both column and index")
+        self.name = name
+        self.column, self.index = column, index
+
+    def get(self, obj: Frame) -> ColumnBase:
+        # get the column from `obj`
+        if self.column:
+            return obj._data[self.name]
+        else:
+            if obj._index is not None:
+                return obj._index._data[self.name]
+        raise KeyError()
+
+    def set(self, obj: Frame, value: ColumnBase, validate=False):
+        # set the column in `obj`
+        if self.column:
+            obj._data.set_by_label(self.name, value, validate=validate)
+        else:
+            if obj._index is not None:
+                obj._index._data.set_by_label(
+                    self.name, value, validate=validate
+                )
+            else:
+                raise KeyError()
+
+
+def _frame_select_by_indexers(
+    frame: Frame, indexers: Iterable[_Indexer]
+) -> Frame:
+    # Select columns from the given `Frame` using `indexers`,
+    # and return a new `Frame`.
+    index_data = frame._data.__class__()
+    data = frame._data.__class__()
+
+    for idx in indexers:
+        if idx.index:
+            index_data.set_by_label(idx.name, idx.get(frame), validate=False)
+        else:
+            data.set_by_label(idx.name, idx.get(frame), validate=False)
+
+    result_index = cudf.Index._from_data(index_data) if index_data else None
+    result = cudf.core.frame.Frame(data=data, index=result_index)
+    return result
+
+
+def _match_join_keys(
+    lcol: ColumnBase, rcol: ColumnBase, how: str
+) -> Tuple[ColumnBase, ColumnBase]:
+    # Casts lcol and rcol to a common dtype so that they can be used as
+    # left and right join keys, and returns the (possibly casted) columns.
+    # If no casting is necessary, lcol and rcol are returned unchanged.
+
+    common_type = None
+
+    # cast the keys lcol and rcol to a common dtype
+    ltype = lcol.dtype
+    rtype = rcol.dtype
+
+    # if either side is categorical, different logic
+    if isinstance(ltype, CategoricalDtype) or isinstance(
+        rtype, CategoricalDtype
+    ):
+        return _match_categorical_dtypes(lcol, rcol, how)
+
+    if pd.api.types.is_dtype_equal(ltype, rtype):
+        return lcol, rcol
+
+    if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
+        common_type = (
+            max(ltype, rtype)
+            if ltype.kind == rtype.kind
+            else np.find_common_type([], (ltype, rtype))
+        )
+
+    elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
+        rtype, np.datetime64
+    ):
+        common_type = max(ltype, rtype)
+
+    if how == "left":
+        if rcol.fillna(0).can_cast_safely(ltype):
+            return lcol, rcol.astype(ltype)
+        else:
+            warnings.warn(
+                f"Can't safely cast column from {rtype} to {ltype}, "
+                f"upcasting to {common_type}."
+            )
+
+    return lcol.astype(common_type), rcol.astype(common_type)
+
+
+def _match_categorical_dtypes(
+    lcol: ColumnBase, rcol: ColumnBase, how: str
+) -> Tuple[ColumnBase, ColumnBase]:
+    # cast the keys lcol and rcol to a common dtype
+    # when at least one of them is a categorical type
+    ltype, rtype = lcol.dtype, rcol.dtype
+
+    if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance(
+        rcol, cudf.core.column.CategoricalColumn
+    ):
+        # if both are categoricals, logic is complicated:
+        return _match_categorical_dtypes_both(lcol, rcol, how)
+
+    if isinstance(ltype, CategoricalDtype):
+        if how in {"left", "leftsemi", "leftanti"}:
+            return lcol, rcol.astype(ltype)
+        common_type = ltype.categories.dtype
+    elif isinstance(rtype, CategoricalDtype):
+        common_type = rtype.categories.dtype
+    return lcol.astype(common_type), rcol.astype(common_type)
+
+
+def _match_categorical_dtypes_both(
+    lcol: CategoricalColumn, rcol: CategoricalColumn, how: str
+) -> Tuple[ColumnBase, ColumnBase]:
+    # The commontype depends on both `how` and the specifics of the
+    # categorical variables to be merged.
+
+    ltype, rtype = lcol.dtype, rcol.dtype
+
+    # when both are ordered and both have the same categories,
+    # no casting required:
+    if ltype == rtype:
+        return lcol, rcol
+
+    # Merging categorical variables when only one side is ordered is
+    # ambiguous and not allowed.
+    if ltype.ordered != rtype.ordered:
+        raise TypeError(
+            "Merging on categorical variables with mismatched"
+            " ordering is ambiguous"
+        )
+
+    if ltype.ordered and rtype.ordered:
+        # if we get to here, categories must be what causes the
+        # dtype equality check to fail. And we can never merge
+        # two ordered categoricals with different categories
+        raise TypeError(
+            f"{how} merge between categoricals with "
+            "different categories is only valid when "
+            "neither side is ordered"
+        )
+
+    # the following should now always hold
+    assert not ltype.ordered and not rtype.ordered
+
+    if how == "inner":
+        # cast to category types -- we must cast them back later
+        return _match_join_keys(
+            lcol.cat()._decategorize(), rcol.cat()._decategorize(), how,
+        )
+    elif how in {"left", "leftanti", "leftsemi"}:
+        # always cast to left type
+        return lcol, rcol.astype(ltype)
+    else:
+        # merge categories
+        merged_categories = cudf.concat(
+            [ltype.categories, rtype.categories]
+        ).unique()
+        common_type = cudf.CategoricalDtype(
+            categories=merged_categories, ordered=False
+        )
+        return lcol.astype(common_type), rcol.astype(common_type)
+
+
+def _coerce_to_tuple(obj):
+    if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str):
+        return tuple(obj)
+    else:
+        return (obj,)
diff --git a/python/cudf/cudf/core/join/casting_logic.py b/python/cudf/cudf/core/join/casting_logic.py
deleted file mode 100644
index eb85cecd14d..00000000000
--- a/python/cudf/cudf/core/join/casting_logic.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
-
-import warnings
-
-import numpy as np
-import pandas as pd
-
-import cudf
-from cudf.core.dtypes import CategoricalDtype
-
-
-def _input_to_libcudf_castrules_both_cat(lcol, rcol, how):
-    """
-    Based off the left and right operands, determine the libcudf
-    merge dtype or error for corner cases where the merge cannot
-    proceed. This function handles categorical variables.
-    Categorical variable typecasting logic depends on both `how`
-    and the specifics of the categorical variables to be merged.
-    Merging categorical variables when only one side is ordered
-    is ambiguous and not allowed. Merging when both categoricals
-    are ordered is allowed, but only when the categories are
-    exactly equal and have equal ordering, and will result in the
-    common dtype.
-    When both sides are unordered, the result categorical depends
-    on the kind of join:
-    - For inner joins, the result will be the intersection of the
-    categories
-    - For left or right joins, the result will be the the left or
-    right dtype respectively. This extends to semi and anti joins.
-    - For outer joins, the result will be the union of categories
-    from both sides.
-
-    """
-    ltype = lcol.dtype
-    rtype = rcol.dtype
-
-    # this function is only to be used to resolve the result when both
-    # sides are categorical
-    if not isinstance(ltype, CategoricalDtype) and isinstance(
-        rtype, CategoricalDtype
-    ):
-        raise TypeError("Both operands must be CategoricalDtype")
-
-    # true for every configuration
-    if ltype == rtype:
-        return ltype
-
-    # raise for any join where ordering doesn't match
-    if ltype.ordered != rtype.ordered:
-        raise TypeError(
-            "Merging on categorical variables with mismatched"
-            " ordering is ambiguous"
-        )
-    elif ltype.ordered and rtype.ordered:
-        # if we get to here, categories must be what causes the
-        # dtype equality check to fail. And we can never merge
-        # two ordered categoricals with different categories
-        raise TypeError(
-            f"{how} merge between categoricals with "
-            "different categories is only valid when "
-            "neither side is ordered"
-        )
-
-    elif how == "inner":
-        # neither ordered, so categories must be different
-        # demote to underlying types
-        return _input_to_libcudf_castrules_any(
-            ltype.categories, rtype.categories, how
-        )
-
-    elif how == "left":
-        return ltype
-    elif how == "right":
-        return rtype
-
-    elif how == "outer":
-        new_cats = cudf.concat([ltype.categories, rtype.categories]).unique()
-        return cudf.CategoricalDtype(categories=new_cats, ordered=False)
-
-
-def _input_to_libcudf_castrules_any_cat(lcol, rcol, how):
-
-    l_is_cat = isinstance(lcol.dtype, CategoricalDtype)
-    r_is_cat = isinstance(rcol.dtype, CategoricalDtype)
-
-    if l_is_cat and r_is_cat:
-        return _input_to_libcudf_castrules_both_cat(lcol, rcol, how)
-    elif l_is_cat or r_is_cat:
-        if l_is_cat and how == "left":
-            return lcol.dtype
-        if r_is_cat and how == "right":
-            return rcol.dtype
-        return (
-            lcol.dtype.categories.dtype
-            if l_is_cat
-            else rcol.dtype.categories.dtype
-        )
-    else:
-        raise ValueError("Neither operand is categorical")
-
-
-def _input_to_libcudf_castrules_any(lcol, rcol, how):
-    """
-    Determine what dtype the left and right hand
-    input columns must be cast to for a libcudf
-    join to proceed.
-    """
-
-    cast_warn = (
-        "can't safely cast column from {} with type"
-        " {} to {}, upcasting to {}"
-    )
-
-    ltype = lcol.dtype
-    rtype = rcol.dtype
-
-    # if either side is categorical, different logic
-    if isinstance(ltype, CategoricalDtype) or isinstance(
-        rtype, CategoricalDtype
-    ):
-        return _input_to_libcudf_castrules_any_cat(lcol, rcol, how)
-
-    libcudf_join_type = None
-    if pd.api.types.is_dtype_equal(ltype, rtype):
-        libcudf_join_type = ltype
-    elif how == "left":
-        check_col = rcol.fillna(0)
-        if not check_col.can_cast_safely(ltype):
-            libcudf_join_type = _input_to_libcudf_castrules_any(
-                lcol, rcol, "inner"
-            )
-            warnings.warn(
-                cast_warn.format("right", rtype, ltype, libcudf_join_type)
-            )
-        else:
-            libcudf_join_type = ltype
-    elif how == "right":
-        check_col = lcol.fillna(0)
-        if not check_col.can_cast_safely(rtype):
-            libcudf_join_type = _input_to_libcudf_castrules_any(
-                lcol, rcol, "inner"
-            )
-            warnings.warn(
-                cast_warn.format("left", ltype, rtype, libcudf_join_type)
-            )
-        else:
-            libcudf_join_type = rtype
-    elif how in {"inner", "outer"}:
-        if (np.issubdtype(ltype, np.number)) and (
-            np.issubdtype(rtype, np.number)
-        ):
-            if ltype.kind == rtype.kind:
-                # both ints or both floats
-                libcudf_join_type = max(ltype, rtype)
-            else:
-                libcudf_join_type = np.find_common_type([], [ltype, rtype])
-        elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
-            rtype, np.datetime64
-        ):
-            libcudf_join_type = max(ltype, rtype)
-    return libcudf_join_type
-
-
-def _libcudf_to_output_castrules(lcol, rcol, how):
-    """
-    Determine what dtype an output merge key column should be
-    cast to after it has been processed by libcudf. Determine
-    if a column should be promoted to a categorical datatype.
-    For inner merges between unordered categoricals, we get a
-    new categorical variable containing the intersection of
-    the two source variables. For left or right joins, we get
-    the original categorical variable from whichever was the
-    major operand of the join, e.g. left for a left join or
-    right for a right join. In the case of an outer join, the
-    result will be a new categorical variable with both sets
-    of categories.
-    """
-    merge_return_type = None
-
-    ltype = lcol.dtype
-    rtype = rcol.dtype
-
-    if pd.api.types.is_dtype_equal(ltype, rtype):
-        return ltype
-
-    l_is_cat = isinstance(ltype, CategoricalDtype)
-    r_is_cat = isinstance(rtype, CategoricalDtype)
-
-    # we  currently only need to do this for categorical variables
-    if how == "inner":
-        if l_is_cat and r_is_cat:
-            merge_return_type = "category"
-    elif how == "left":
-        if l_is_cat:
-            merge_return_type = ltype
-    elif how == "right":
-        if r_is_cat:
-            merge_return_type = rtype
-    elif how == "outer":
-        if l_is_cat and r_is_cat:
-            new_cats = cudf.concat(
-                [ltype.categories, rtype.categories]
-            ).unique()
-            merge_return_type = cudf.CategoricalDtype(
-                categories=new_cats, ordered=ltype.ordered
-            )
-    return merge_return_type
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index c6da3ee8dc4..1a4826d0570 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -1,22 +1,85 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
-import itertools
+from __future__ import annotations
 
-import pandas as pd
+import functools
+from collections import namedtuple
+from typing import TYPE_CHECKING, Callable, Tuple
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.join import compute_result_col_names
-from cudf.core.join.casting_logic import (
-    _input_to_libcudf_castrules_any,
-    _libcudf_to_output_castrules,
+from cudf.core.join._join_helpers import (
+    _coerce_to_tuple,
+    _frame_select_by_indexers,
+    _Indexer,
+    _match_join_keys,
 )
 
+if TYPE_CHECKING:
+    from cudf.core.frame import Frame
+
+
+def merge(
+    lhs,
+    rhs,
+    *,
+    on,
+    left_on,
+    right_on,
+    left_index,
+    right_index,
+    how,
+    sort,
+    method,
+    indicator,
+    suffixes,
+):
+    if how in {"leftsemi", "leftanti"}:
+        merge_cls = MergeSemi
+    else:
+        merge_cls = Merge
+    mergeobj = merge_cls(
+        lhs,
+        rhs,
+        on=on,
+        left_on=left_on,
+        right_on=right_on,
+        left_index=left_index,
+        right_index=right_index,
+        how=how,
+        sort=sort,
+        method=method,
+        indicator=indicator,
+        suffixes=suffixes,
+    )
+    return mergeobj.perform_merge()
+
+
+_JoinKeys = namedtuple("JoinKeys", ["left", "right"])
+
 
 class Merge(object):
+    # A namedtuple of indexers representing the left and right keys
+    _keys: _JoinKeys
+
+    # The joiner function must have the following signature:
+    #
+    #     def joiner(
+    #         lhs: Frame,
+    #         rhs: Frame
+    #     ) -> Tuple[Optional[Column], Optional[Column]]:
+    #          ...
+    #
+    # where `lhs` and `rhs` are Frames composed of the left and right
+    # join key. The `joiner` returns a tuple of two Columns
+    # representing the rows to gather from the left- and right- side
+    # tables respectively.
+    _joiner: Callable
+
     def __init__(
         self,
         lhs,
         rhs,
+        *,
         on,
         left_on,
         right_on,
@@ -24,8 +87,6 @@ def __init__(
         right_index,
         how,
         sort,
-        lsuffix,
-        rsuffix,
         method,
         indicator,
         suffixes,
@@ -60,140 +121,252 @@ def __init__(
         sort : bool
             Boolean flag indicating if the output Frame is to be
             sorted on the output's join keys, in left to right order.
-        lsuffix : string
-            The suffix to be appended to left hand column names that
-            are found to exist in the right frame, but are not specified
-            as join keys themselves.
-        rsuffix : string
-            The suffix to be appended to right hand column names that
-            are found to exist in the left frame, but are not specified
-            as join keys themselves.
         suffixes : list like
             Left and right suffixes specified together, unpacked into lsuffix
             and rsuffix.
         """
+        self._validate_merge_params(
+            lhs,
+            rhs,
+            on=on,
+            left_on=left_on,
+            right_on=right_on,
+            left_index=left_index,
+            right_index=right_index,
+            how=how,
+            suffixes=suffixes,
+        )
+        self._joiner = functools.partial(libcudf.join.join, how=how)
+
         self.lhs = lhs
         self.rhs = rhs
+        self.on = on
+        self.left_on = left_on
+        self.right_on = right_on
         self.left_index = left_index
         self.right_index = right_index
-        self.method = method
-        self.sort = sort
-
-        # check that the merge is valid
-
-        self.validate_merge_cfg(
-            lhs,
-            rhs,
-            on,
-            left_on,
-            right_on,
-            left_index,
-            right_index,
-            how,
-            lsuffix,
-            rsuffix,
-            suffixes,
-        )
         self.how = how
-        self.preprocess_merge_params(
-            on, left_on, right_on, lsuffix, rsuffix, suffixes
-        )
-
-    def perform_merge(self):
-        """
-        Call libcudf to perform a merge between the operands. If
-        necessary, cast the input key columns to compatible types.
-        Potentially also cast the output back to categorical.
-        """
-        output_dtypes = self.compute_output_dtypes()
-        self.typecast_input_to_libcudf()
-        libcudf_result = libcudf.join.join(
-            self.lhs,
-            self.rhs,
-            self.how,
-            self.method,
-            left_on=self.left_on,
-            right_on=self.right_on,
-            left_index=self.left_index,
-            right_index=self.right_index,
-        )
-        result = self.out_class._from_table(libcudf_result)
-        result = self.typecast_libcudf_to_output(result, output_dtypes)
-        if isinstance(result, cudf.Index):
-            return result
-        else:
-            return result[
-                compute_result_col_names(self.lhs, self.rhs, self.how)
-            ]
+        self.sort = sort
+        if suffixes:
+            self.lsuffix, self.rsuffix = suffixes
+        self._compute_join_keys()
 
-    def preprocess_merge_params(
-        self, on, left_on, right_on, lsuffix, rsuffix, suffixes
-    ):
-        """
-        Translate a valid configuration of user input parameters into
-        the subset of input configurations handled by the cython layer.
-        Apply suffixes to columns.
-        """
+    @property
+    def _out_class(self):
+        # type of the result
+        out_class = cudf.DataFrame
 
-        self.out_class = cudf.DataFrame
         if isinstance(self.lhs, cudf.MultiIndex) or isinstance(
             self.rhs, cudf.MultiIndex
         ):
-            self.out_class = cudf.MultiIndex
+            out_class = cudf.MultiIndex
         elif isinstance(self.lhs, cudf.Index):
-            self.out_class = self.lhs.__class__
+            out_class = self.lhs.__class__
+        return out_class
 
-        if on:
-            on = [on] if isinstance(on, str) else list(on)
-            left_on = right_on = on
-        else:
-            if left_on:
-                left_on = (
-                    [left_on] if isinstance(left_on, str) else list(left_on)
-                )
-            if right_on:
-                right_on = (
-                    [right_on] if isinstance(right_on, str) else list(right_on)
-                )
+    def perform_merge(self) -> Frame:
+        lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs)
 
-        same_named_columns = set(self.lhs._data.keys()) & set(
-            self.rhs._data.keys()
+        left_table = _frame_select_by_indexers(lhs, self._keys.left)
+        right_table = _frame_select_by_indexers(rhs, self._keys.right)
+
+        left_rows, right_rows = self._joiner(
+            left_table, right_table, how=self.how,
         )
-        if not (left_on or right_on) and not (
-            self.left_index and self.right_index
-        ):
-            left_on = right_on = list(same_named_columns)
-
-        no_suffix_cols = []
-        if left_on and right_on:
-            no_suffix_cols = [
-                left_name
-                for left_name, right_name in zip(left_on, right_on)
-                if left_name == right_name and left_name in same_named_columns
-            ]
+        lhs, rhs = self._restore_categorical_keys(lhs, rhs)
 
-        if suffixes:
-            lsuffix, rsuffix = suffixes
-        for name in same_named_columns:
-            if name not in no_suffix_cols:
-                self.lhs.rename(
-                    {name: f"{name}{lsuffix}"}, inplace=True, axis=1
+        left_result = cudf.core.frame.Frame()
+        right_result = cudf.core.frame.Frame()
+
+        gather_index = self.left_index or self.right_index
+        if left_rows is not None:
+            left_result = lhs._gather(
+                left_rows, nullify=True, keep_index=gather_index
+            )
+        if right_rows is not None:
+            right_result = rhs._gather(
+                right_rows, nullify=True, keep_index=gather_index
+            )
+
+        result = self._merge_results(left_result, right_result)
+
+        if self.sort:
+            result = self._sort_result(result)
+        return result
+
+    def _compute_join_keys(self):
+        # Computes self._keys
+        if (
+            self.left_index
+            or self.right_index
+            or self.left_on
+            or self.right_on
+        ):
+            left_keys = []
+            right_keys = []
+            if self.left_index:
+                left_keys.extend(
+                    [
+                        _Indexer(name=on, index=True)
+                        for on in self.lhs.index.names
+                    ]
                 )
-                self.rhs.rename(
-                    {name: f"{name}{rsuffix}"}, inplace=True, axis=1
+            if self.left_on:
+                # TODO: require left_on or left_index to be specified
+                left_keys.extend(
+                    [
+                        _Indexer(name=on, column=True)
+                        for on in _coerce_to_tuple(self.left_on)
+                    ]
                 )
-                if left_on and name in left_on:
-                    left_on[left_on.index(name)] = f"{name}{lsuffix}"
-                if right_on and name in right_on:
-                    right_on[right_on.index(name)] = f"{name}{rsuffix}"
+            if self.right_index:
+                right_keys.extend(
+                    [
+                        _Indexer(name=on, index=True)
+                        for on in self.rhs.index.names
+                    ]
+                )
+            if self.right_on:
+                # TODO: require right_on or right_index to be specified
+                right_keys.extend(
+                    [
+                        _Indexer(name=on, column=True)
+                        for on in _coerce_to_tuple(self.right_on)
+                    ]
+                )
+        else:
+            # Use `on` if provided. Otherwise,
+            # implicitly use identically named columns as the key columns:
+            on_names = (
+                _coerce_to_tuple(self.on)
+                if self.on is not None
+                else set(self.lhs._data) & set(self.rhs._data)
+            )
+            left_keys = [_Indexer(name=on, column=True) for on in on_names]
+            right_keys = [_Indexer(name=on, column=True) for on in on_names]
+
+        if len(left_keys) != len(right_keys):
+            raise ValueError(
+                "Merge operands must have same number of join key columns"
+            )
+
+        self._keys = _JoinKeys(left=left_keys, right=right_keys)
+
+    def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame:
+        # Merge the Frames `left_result` and `right_result` into a single
+        # `Frame`, suffixing column names if necessary.
+
+        # If two key columns have the same name, a single output column appears
+        # in the result. For non-outer joins, the key column from the rhs is
+        # simply dropped (further below). For outer joins, the two key columns
+        # are first combined by filling nulls in the left key column with the
+        # corresponding values from the right key column:
+        if self.how == "outer":
+            for lkey, rkey in zip(*self._keys):
+                if lkey.name == rkey.name:
+                    # fill nulls in lhs from values in the rhs
+                    lkey.set(
+                        left_result,
+                        lkey.get(left_result).fillna(rkey.get(right_result)),
+                        validate=False,
+                    )
+
+        # Compute the result column names:
+        # left_names and right_names will be mappings of input column names
+        # to the corresponding names in the final result.
+        left_names = dict(zip(left_result._data, left_result._data))
+        right_names = dict(zip(right_result._data, right_result._data))
+
+        # For any columns from left_result and right_result that have the same
+        # name:
+        # - if they are key columns, keep only the left column
+        # - if they are not key columns, use suffixes to differentiate them
+        #   in the final result
+        common_names = set(left_names) & set(right_names)
+
+        if self.on:
+            key_columns_with_same_name = _coerce_to_tuple(self.on)
+        else:
+            key_columns_with_same_name = [
+                lkey.name
+                for lkey, rkey in zip(*self._keys)
+                if (
+                    (lkey.index, rkey.index) == (False, False)
+                    and lkey.name == rkey.name
+                )
+            ]
+        for name in common_names:
+            if name not in key_columns_with_same_name:
+                left_names[name] = f"{name}{self.lsuffix}"
+                right_names[name] = f"{name}{self.rsuffix}"
+            else:
+                del right_names[name]
+
+        # Assemble the data columns of the result:
+        data = left_result._data.__class__()
+
+        for lcol in left_names:
+            data.set_by_label(
+                left_names[lcol], left_result._data[lcol], validate=False
+            )
+        for rcol in right_names:
+            data.set_by_label(
+                right_names[rcol], right_result._data[rcol], validate=False
+            )
+
+        # Index of the result:
+        if self.left_index and self.right_index:
+            index = left_result._index
+        elif self.left_index:
+            # left_index and right_on
+            index = right_result._index
+        elif self.right_index:
+            # right_index and left_on
+            index = left_result._index
+        else:
+            index = None
 
-        self.left_on = left_on if left_on is not None else []
-        self.right_on = right_on if right_on is not None else []
-        self.lsuffix = lsuffix
-        self.rsuffix = rsuffix
+        # Construct result from data and index:
+        result = self._out_class._from_data(data=data, index=index)
+
+        return result
+
+    def _sort_result(self, result: Frame) -> Frame:
+        # Pandas sorts on the key columns in the
+        # same order as given in 'on'. If the indices are used as
+        # keys, the index will be sorted. If one index is specified,
+        # the key columns on the other side will be used to sort.
+        if self.on:
+            if isinstance(result, cudf.Index):
+                sort_order = result._get_sorted_inds()
+            else:
+                # need a list instead of a tuple here because
+                # _get_sorted_inds calls down to ColumnAccessor.get_by_label
+                # which handles lists and tuples differently
+                sort_order = result._get_sorted_inds(
+                    list(_coerce_to_tuple(self.on))
+                )
+            return result._gather(sort_order, keep_index=False)
+        by = []
+        if self.left_index and self.right_index:
+            if result._index is not None:
+                by.extend(result._index._data.columns)
+        if self.left_on:
+            by.extend(
+                [result._data[col] for col in _coerce_to_tuple(self.left_on)]
+            )
+        if self.right_on:
+            by.extend(
+                [result._data[col] for col in _coerce_to_tuple(self.right_on)]
+            )
+        if by:
+            to_sort = cudf.DataFrame._from_columns(by)
+            sort_order = to_sort.argsort()
+            result = result._gather(sort_order)
+        return result
 
     @staticmethod
-    def validate_merge_cfg(
+    def _validate_merge_params(
         lhs,
         rhs,
         on,
@@ -202,14 +375,11 @@ def validate_merge_cfg(
         left_index,
         right_index,
         how,
-        lsuffix,
-        rsuffix,
         suffixes,
     ):
         """
         Error for various invalid combinations of merge input parameters
         """
-
         # must actually support the requested merge type
         if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}:
             raise NotImplementedError(f"{how} merge not supported yet")
@@ -227,52 +397,8 @@ def validate_merge_cfg(
         ):
             raise ValueError("Can not merge on unnamed Series")
 
-        # Keys need to be in their corresponding operands
-        if on:
-            if isinstance(on, str):
-                on_keys = [on]
-            elif isinstance(on, tuple):
-                on_keys = list(on)
-            else:
-                on_keys = on
-            for key in on_keys:
-                if not (key in lhs._data.keys() and key in rhs._data.keys()):
-                    raise KeyError(f"on key {on} not in both operands")
-        elif left_on and right_on:
-            left_on_keys = (
-                [left_on] if not isinstance(left_on, list) else left_on
-            )
-            right_on_keys = (
-                [right_on] if not isinstance(right_on, list) else right_on
-            )
-
-            for key in left_on_keys:
-                if key not in lhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in left operand')
-            for key in right_on_keys:
-                if key not in rhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in right operand')
-
-        # Require same total number of columns to join on in both operands
-        len_left_on = 0
-        len_right_on = 0
-        if left_on:
-            len_left_on += (
-                len(left_on) if pd.api.types.is_list_like(left_on) else 1
-            )
-        if right_on:
-            len_right_on += (
-                len(right_on) if pd.api.types.is_list_like(right_on) else 1
-            )
-        if not (len_left_on + left_index * lhs._num_indices) == (
-            len_right_on + right_index * rhs._num_indices
-        ):
-            raise ValueError(
-                "Merge operands must have same number of join key columns"
-            )
-
         # If nothing specified, must have common cols to use implicitly
-        same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys())
+        same_named_columns = set(lhs._data) & set(rhs._data)
         if (
             not (left_index or right_index)
             and not (left_on or right_on)
@@ -280,8 +406,7 @@ def validate_merge_cfg(
         ):
             raise ValueError("No common columns to perform merge on")
 
-        if suffixes:
-            lsuffix, rsuffix = suffixes
+        lsuffix, rsuffix = suffixes
         for name in same_named_columns:
             if name == left_on == right_on:
                 continue
@@ -297,134 +422,59 @@ def validate_merge_cfg(
                         "lsuffix and rsuffix are not defined"
                     )
 
-    def typecast_input_to_libcudf(self):
-        """
-        Check each pair of join keys in the left and right hand
-        operands and apply casting rules to match their types
-        before passing the result to libcudf.
-        """
-        lhs_keys, rhs_keys, lhs_cols, rhs_cols = [], [], [], []
-        if self.left_index:
-            lhs_keys.append(self.lhs.index._data.keys())
-            lhs_cols.append(self.lhs.index)
-        if self.right_index:
-            rhs_keys.append(self.rhs.index._data.keys())
-            rhs_cols.append(self.rhs.index)
-        if self.left_on:
-            lhs_keys.append(self.left_on)
-            lhs_cols.append(self.lhs)
-        if self.right_on:
-            rhs_keys.append(self.right_on)
-            rhs_cols.append(self.rhs)
-
-        for l_key_grp, r_key_grp, l_col_grp, r_col_grp in zip(
-            lhs_keys, rhs_keys, lhs_cols, rhs_cols
-        ):
-            for l_key, r_key in zip(l_key_grp, r_key_grp):
-                to_dtype = _input_to_libcudf_castrules_any(
-                    l_col_grp._data[l_key], r_col_grp._data[r_key], self.how
-                )
-                l_col_grp._data[l_key] = l_col_grp._data[l_key].astype(
-                    to_dtype
-                )
-                r_col_grp._data[r_key] = r_col_grp._data[r_key].astype(
-                    to_dtype
-                )
-
-    def compute_output_dtypes(self):
-        """
-        Determine what datatypes should be applied to the result
-        of a libcudf join, baesd on the original left and right
-        frames.
-        """
-
-        index_dtypes = {}
-        l_data_join_cols = {}
-        r_data_join_cols = {}
-
-        data_dtypes = {
-            name: col.dtype
-            for name, col in itertools.chain(
-                self.lhs._data.items(), self.rhs._data.items()
+    def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]:
+        # Match the dtypes of the key columns from lhs and rhs
+        out_lhs = lhs.copy(deep=False)
+        out_rhs = rhs.copy(deep=False)
+        for left_key, right_key in zip(*self._keys):
+            lcol, rcol = left_key.get(lhs), right_key.get(rhs)
+            lcol_casted, rcol_casted = _match_join_keys(
+                lcol, rcol, how=self.how
             )
-        }
-
-        if self.left_index and self.right_index:
-            l_idx_join_cols = list(self.lhs.index._data.values())
-            r_idx_join_cols = list(self.rhs.index._data.values())
-        elif self.left_on and self.right_index:
-            # Keep the orignal dtypes in the LEFT index if possible
-            # should trigger a bunch of no-ops
-            l_idx_join_cols = list(self.lhs.index._data.values())
-            r_idx_join_cols = list(self.lhs.index._data.values())
-            for i, name in enumerate(self.left_on):
-                l_data_join_cols[name] = self.lhs._data[name]
-                r_data_join_cols[name] = list(self.rhs.index._data.values())[i]
-
-        elif self.left_index and self.right_on:
-            # see above
-            l_idx_join_cols = list(self.rhs.index._data.values())
-            r_idx_join_cols = list(self.rhs.index._data.values())
-            for i, name in enumerate(self.right_on):
-                l_data_join_cols[name] = list(self.lhs.index._data.values())[i]
-                r_data_join_cols[name] = self.rhs._data[name]
-
-        if self.left_on and self.right_on:
-            l_data_join_cols = self.lhs._data
-            r_data_join_cols = self.rhs._data
-
-        if self.left_index or self.right_index:
-            for i in range(len(self.lhs.index._data.items())):
-                index_dtypes[i] = _libcudf_to_output_castrules(
-                    l_idx_join_cols[i], r_idx_join_cols[i], self.how
-                )
-
-        for name in itertools.chain(self.left_on, self.right_on):
-            if name in self.left_on and name in self.right_on:
-                data_dtypes[name] = _libcudf_to_output_castrules(
-                    l_data_join_cols[name], r_data_join_cols[name], self.how
-                )
-        return (index_dtypes, data_dtypes)
+            if lcol is not lcol_casted:
+                left_key.set(out_lhs, lcol_casted, validate=False)
+            if rcol is not rcol_casted:
+                right_key.set(out_rhs, rcol_casted, validate=False)
+        return out_lhs, out_rhs
+
+    def _restore_categorical_keys(
+        self, lhs: Frame, rhs: Frame
+    ) -> Tuple[Frame, Frame]:
+        # For inner joins, any categorical keys in `self.lhs` and `self.rhs`
+        # were cast to their category type to produce `lhs` and `rhs`.
+        # Here, we cast them back.
+        out_lhs = lhs.copy(deep=False)
+        out_rhs = rhs.copy(deep=False)
+        if self.how == "inner":
+            for left_key, right_key in zip(*self._keys):
+                if isinstance(
+                    left_key.get(self.lhs).dtype, cudf.CategoricalDtype
+                ) and isinstance(
+                    right_key.get(self.rhs).dtype, cudf.CategoricalDtype
+                ):
+                    left_key.set(
+                        out_lhs,
+                        left_key.get(out_lhs).astype("category"),
+                        validate=False,
+                    )
+                    right_key.set(
+                        out_rhs,
+                        right_key.get(out_rhs).astype("category"),
+                        validate=False,
+                    )
+        return out_lhs, out_rhs
 
-    def typecast_libcudf_to_output(self, output, output_dtypes):
-        """
-        Apply precomputed output index and data column data types
-        to the output of a libcudf join.
-        """
 
-        index_dtypes, data_dtypes = output_dtypes
-        if output._index and len(index_dtypes) > 0:
-            for index_dtype, index_col_lbl, index_col in zip(
-                index_dtypes.values(),
-                output._index._data.keys(),
-                output._index._data.values(),
-            ):
-                if index_dtype:
-                    output._index._data[
-                        index_col_lbl
-                    ] = self._build_output_col(index_col, index_dtype)
-            # reconstruct the Index object as the underlying data types
-            # have changed:
-            output._index = cudf.core.index.Index._from_table(output._index)
-
-        for data_col_lbl, data_col in output._data.items():
-            data_dtype = data_dtypes[data_col_lbl]
-            if data_dtype:
-                output._data[data_col_lbl] = self._build_output_col(
-                    data_col, data_dtype
-                )
-        return output
+class MergeSemi(Merge):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._joiner = functools.partial(
+            libcudf.join.semi_join, how=kwargs["how"]
+        )
 
-    def _build_output_col(self, col, dtype):
-        if isinstance(
-            dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype)
-        ):
-            outcol = cudf.core.column.build_categorical_column(
-                categories=dtype.categories,
-                codes=col.set_mask(None),
-                mask=col.base_mask,
-                ordered=dtype.ordered,
-            )
+    def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame:
+        # semi-join result includes only lhs columns
+        if issubclass(self._out_class, cudf.Index):
+            return self._out_class._from_data(lhs._data)
         else:
-            outcol = col.astype(dtype)
-        return outcol
+            return self._out_class._from_data(lhs._data, index=lhs._index)
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 82e89bb00f4..1c1e48e7372 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import itertools
 import numbers
@@ -18,6 +19,7 @@
 from cudf._typing import DataFrameOrSeries
 from cudf.core._compat import PANDAS_GE_120
 from cudf.core.column import column
+from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import Index, as_index
 
@@ -188,6 +190,19 @@ def names(self):
     def names(self, value):
         value = [None] * self.nlevels if value is None else value
         assert len(value) == self.nlevels
+
+        if len(value) == len(set(value)):
+            # IMPORTANT: if the provided names are unique,
+            # we reconstruct self._data with the names as keys.
+            # If they are not unique, the keys of self._data
+            # and self._names will be different, which can lead
+            # to unexpected behaviour in some cases. This is
+            # definitely buggy, but we can't disallow non-unique
+            # names either...
+            self._data = self._data.__class__._create_unsafe(
+                dict(zip(value, self._data.values())),
+                level_names=self._data.level_names,
+            )
         self._names = pd.core.indexes.frozen.FrozenList(value)
 
     def rename(self, names, inplace=False):
@@ -234,7 +249,6 @@ def rename(self, names, inplace=False):
         ValueError: Length of names must match number of levels in MultiIndex.
 
         """
-
         return self.set_names(names, level=None, inplace=inplace)
 
     def set_names(self, names, level=None, inplace=False):
@@ -278,6 +292,10 @@ def set_names(self, names, level=None, inplace=False):
 
         return self._set_names(names=names, inplace=inplace)
 
+    @classmethod
+    def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex:
+        return cls.from_frame(cudf.DataFrame._from_data(data))
+
     @classmethod
     def _from_table(cls, table, names=None):
         df = cudf.DataFrame(table._data)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index a664c4fb182..71a4a48a07a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -6299,17 +6299,24 @@ def merge(
         method="hash",
         suffixes=("_x", "_y"),
     ):
-
         if left_on not in (self.name, None):
             raise ValueError(
                 "Series to other merge uses series name as key implicitly"
             )
 
-        lhs = self.copy(deep=False)
-        rhs = other.copy(deep=False)
+        if lsuffix or rsuffix:
+            raise ValueError(
+                "The lsuffix and rsuffix keywords have been replaced with the "
+                "``suffixes=`` keyword.  "
+                "Please provide the following instead: \n\n"
+                "    suffixes=('%s', '%s')"
+                % (lsuffix or "_x", rsuffix or "_y")
+            )
+        else:
+            lsuffix, rsuffix = suffixes
 
-        result = super(Series, lhs)._merge(
-            rhs,
+        result = super()._merge(
+            other,
             on=on,
             left_on=left_on,
             right_on=right_on,
@@ -6317,8 +6324,6 @@ def merge(
             right_index=right_index,
             how=how,
             sort=sort,
-            lsuffix=lsuffix,
-            rsuffix=rsuffix,
             method=method,
             indicator=False,
             suffixes=suffixes,
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 969cf1bf549..9164bfe98d1 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -14,11 +14,13 @@
     assert_exceptions_equal,
 )
 
+_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
+
 
 def make_params():
     np.random.seed(0)
 
-    hows = "left,inner,outer,right,leftanti,leftsemi".split(",")
+    hows = _JOIN_TYPES
     methods = "hash,sort".split(",")
 
     # Test specific cases (1)
@@ -69,6 +71,37 @@ def pd_odd_joins(left, right, join_type):
         return left[left.index.isin(right.index)][left.columns]
 
 
+def assert_join_results_equal(expect, got, how, **kwargs):
+    if how not in _JOIN_TYPES:
+        raise ValueError(f"Unrecognized join type {how}")
+    if how == "right":
+        got = got[expect.columns]
+
+    if isinstance(expect, (pd.Series, cudf.Series)):
+        return assert_eq(
+            expect.sort_values().reset_index(drop=True),
+            got.sort_values().reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
+        if not len(
+            expect.columns
+        ):  # can't sort_values() on a df without columns
+            return assert_eq(expect, got, **kwargs)
+
+        return assert_eq(
+            expect.sort_values(expect.columns.to_list()).reset_index(
+                drop=True
+            ),
+            got.sort_values(got.columns.to_list()).reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.Index, cudf.Index)):
+        return assert_eq(expect.sort_values(), got.sort_values(), **kwargs)
+    else:
+        raise ValueError(f"Not a join result: {type(expect).__name__}")
+
+
 @pytest.mark.parametrize("aa,bb,how,method", make_params())
 def test_dataframe_join_how(aa, bb, how, method):
     df = cudf.DataFrame()
@@ -113,12 +146,7 @@ def work_gdf(df):
             # TODO: What is the less hacky way?
             expect.index.name = "bob"
             got.index.name = "mary"
-            assert_eq(
-                got.sort_values(got.columns.to_list()).reset_index(drop=True),
-                expect.sort_values(expect.columns.to_list()).reset_index(
-                    drop=True
-                ),
-            )
+            assert_join_results_equal(expect, got, how=how)
         # if(how=='right'):
         #     _sorted_check_series(expect['a'], expect['b'],
         #                          got['a'], got['b'])
@@ -187,10 +215,7 @@ def test_dataframe_join_cats():
     expect = lhs.to_pandas().join(rhs.to_pandas())
 
     # Note: pandas make an object Index after joining
-    assert_eq(
-        got.sort_values(by="b").sort_index().reset_index(drop=True),
-        expect.reset_index(drop=True),
-    )
+    assert_join_results_equal(expect, got, how="inner")
 
     # Just do some rough checking here.
     assert list(got.columns) == ["b", "c"]
@@ -264,7 +289,7 @@ def test_dataframe_join_mismatch_cats(how):
     expect.data_col_right = expect.data_col_right.astype(np.int64)
     expect.data_col_left = expect.data_col_left.astype(np.int64)
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how, check_categorical=False)
 
 
 @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None])
@@ -323,7 +348,7 @@ def test_dataframe_merge_on(on):
         list(pddf_joined.columns)
     ).reset_index(drop=True)
 
-    assert_eq(cdf_result, pdf_result, check_like=True)
+    assert_join_results_equal(cdf_result, pdf_result, how="left")
 
     merge_func_result_cdf = (
         join_result_cudf.to_pandas()
@@ -331,7 +356,7 @@ def test_dataframe_merge_on(on):
         .reset_index(drop=True)
     )
 
-    assert_eq(merge_func_result_cdf, cdf_result, check_like=True)
+    assert_join_results_equal(merge_func_result_cdf, cdf_result, how="left")
 
 
 def test_dataframe_merge_on_unknown_column():
@@ -383,7 +408,7 @@ def test_dataframe_empty_merge():
     expect = cudf.DataFrame({"a": [], "b": [], "c": []})
     got = gdf1.merge(gdf2, how="left", on=["a"])
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 def test_dataframe_merge_order():
@@ -408,7 +433,7 @@ def test_dataframe_merge_order():
     df2["a"] = [7, 8]
 
     df = df1.merge(df2, how="left", on=["id", "a"])
-    assert_eq(gdf, df)
+    assert_join_results_equal(df, gdf, how="left")
 
 
 @pytest.mark.parametrize(
@@ -550,7 +575,7 @@ def test_merge_left_index_zero():
     pd_merge = left.merge(right, left_on="x", right_on="y")
     gd_merge = gleft.merge(gright, left_on="x", right_on="y")
 
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 @pytest.mark.parametrize(
@@ -571,7 +596,7 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs):
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, **kwargs)
     gd_merge = gleft.merge(gright, **kwargs)
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 @pytest.mark.parametrize(
@@ -592,7 +617,7 @@ def test_merge_left_right_index_left_right_on_kwargs(kwargs):
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, **kwargs)
     gd_merge = gleft.merge(gright, **kwargs)
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 def test_indicator():
@@ -608,9 +633,10 @@ def test_indicator():
 def test_merge_suffixes():
     pdf = cudf.DataFrame({"x": [1, 2, 1]})
     gdf = cudf.DataFrame({"x": [1, 2, 1]})
-    assert_eq(
+    assert_join_results_equal(
         gdf.merge(gdf, suffixes=("left", "right")),
         pdf.merge(pdf, suffixes=("left", "right")),
+        how="left",
     )
 
     assert_exceptions_equal(
@@ -628,11 +654,14 @@ def test_merge_left_on_right_on():
     gleft = cudf.from_pandas(left)
     gright = cudf.from_pandas(right)
 
-    assert_eq(left.merge(right, on="xx"), gleft.merge(gright, on="xx"))
+    assert_join_results_equal(
+        left.merge(right, on="xx"), gleft.merge(gright, on="xx"), how="left"
+    )
 
-    assert_eq(
+    assert_join_results_equal(
         left.merge(right, left_on="xx", right_on="xx"),
         gleft.merge(gright, left_on="xx", right_on="xx"),
+        how="left",
     )
 
 
@@ -708,7 +737,9 @@ def test_merge_sort(ons, hows):
     pd_merge = left.merge(right, **kwargs)
     # require the join keys themselves to be sorted correctly
     # the non-key columns will NOT match pandas ordering
-    assert_eq(pd_merge[kwargs["on"]], gd_merge[kwargs["on"]])
+    assert_join_results_equal(
+        pd_merge[kwargs["on"]], gd_merge[kwargs["on"]], how="left"
+    )
     pd_merge = pd_merge.drop(kwargs["on"], axis=1)
     gd_merge = gd_merge.drop(kwargs["on"], axis=1)
     if not pd_merge.empty:
@@ -720,7 +751,7 @@ def test_merge_sort(ons, hows):
             drop=True
         )
 
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 @pytest.mark.parametrize(
@@ -781,7 +812,7 @@ def test_join_datetimes_index(dtype):
 
     assert gdf["d"].dtype == np.dtype(dtype)
 
-    assert_eq(pdf, gdf)
+    assert_join_results_equal(pdf, gdf, how="inner")
 
 
 def test_join_with_different_names():
@@ -791,7 +822,7 @@ def test_join_with_different_names():
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"])
     gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"])
-    assert_eq(pd_merge, gd_merge.sort_values(by=["a"]).reset_index(drop=True))
+    assert_join_results_equal(pd_merge, gd_merge, how="outer")
 
 
 def test_join_same_name_different_order():
@@ -801,9 +832,7 @@ def test_join_same_name_different_order():
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"])
     gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"])
-    assert_eq(
-        pd_merge, gd_merge.sort_values(by=["a_x"]).reset_index(drop=True)
-    )
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 def test_join_empty_table_dtype():
@@ -874,10 +903,7 @@ def test_join_multi(how, column_a, column_b, column_c):
     gdf_result = gdf_result[columns]
     pdf_result = pdf_result[columns]
 
-    assert_eq(
-        gdf_result.reset_index(drop=True).fillna(-1),
-        pdf_result.sort_index().reset_index(drop=True).fillna(-1),
-    )
+    assert_join_results_equal(pdf_result, gdf_result, how="inner")
 
 
 @pytest.mark.parametrize(
@@ -967,7 +993,7 @@ def test_merge_multi(kwargs):
     expect.index = range(len(expect))
     got.index = range(len(got))
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize("dtype_l", INTEGER_TYPES)
@@ -997,7 +1023,7 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize("dtype_l", ["float32", "float64"])
@@ -1032,7 +1058,7 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize("dtype_l", NUMERIC_TYPES)
@@ -1068,7 +1094,7 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def test_typecast_on_join_no_float_round():
@@ -1092,7 +1118,7 @@ def test_typecast_on_join_no_float_round():
 
     got = gdf_l.merge(gdf_r, on="join_col", how="left")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize(
@@ -1121,10 +1147,7 @@ def test_typecast_on_join_overflow_unsafe(dtypes):
 
     with pytest.warns(
         UserWarning,
-        match=(
-            f"can't safely cast column"
-            f" from right with type {dtype_r} to {dtype_l}"
-        ),
+        match=(f"Can't safely cast column" f" from {dtype_r} to {dtype_l}"),
     ):
         merged = lhs.merge(rhs, on="a", how="left")  # noqa: F841
 
@@ -1165,7 +1188,7 @@ def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize("dtype_l", ["category", "str", "int32", "float32"])
@@ -1200,7 +1223,7 @@ def test_typecast_on_join_categorical(dtype_l, dtype_r):
     )
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def make_categorical_dataframe(categories, ordered=False):
@@ -1220,7 +1243,7 @@ def test_categorical_typecast_inner():
     expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False)
     expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key")
 
-    assert_eq(expect_data, result["key"])
+    assert_eq(expect_data, result["key"], check_categorical=False)
 
     # Equal categories, unequal ordering -> error
     left = make_categorical_dataframe([1, 2, 3], ordered=False)
@@ -1238,7 +1261,7 @@ def test_categorical_typecast_inner():
 
     expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False)
     expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key")
-    assert_eq(expect_data, result["key"])
+    assert_eq(expect_data, result["key"], check_categorical=False)
 
     # One is ordered -> error
     left = make_categorical_dataframe([1, 2, 3], ordered=False)
@@ -1427,20 +1450,10 @@ def test_index_join(lhs, rhs, how, level):
     g_lhs = l_df.set_index(lhs).index
     g_rhs = r_df.set_index(rhs).index
 
-    expected = (
-        p_lhs.join(p_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
-    got = (
-        g_lhs.join(g_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
+    expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False)
+    got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
 
 def test_index_join_corner_cases():
@@ -1461,20 +1474,10 @@ def test_index_join_corner_cases():
     p_rhs = r_pdf.set_index(rhs).index
     g_lhs = l_df.set_index(lhs).index
     g_rhs = r_df.set_index(rhs).index
-    expected = (
-        p_lhs.join(p_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
-    got = (
-        g_lhs.join(g_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
+    expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False)
+    got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
     # sort is supported only in case of two non-MultiIndex join
     # Join when column name doesn't match with level
@@ -1490,7 +1493,7 @@ def test_index_join_corner_cases():
     expected = p_lhs.join(p_rhs, how=how, sort=True)
     got = g_lhs.join(g_rhs, how=how, sort=True)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
     # Pandas Index.join on categorical column returns generic column
     # but cudf will be returning a categorical column itself.
@@ -1504,22 +1507,12 @@ def test_index_join_corner_cases():
     p_rhs = r_pdf.set_index(rhs).index
     g_lhs = l_df.set_index(lhs).index
     g_rhs = r_df.set_index(rhs).index
-    expected = (
-        p_lhs.join(p_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
-    got = (
-        g_lhs.join(g_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
+    expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False)
+    got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False)
 
     got["a"] = got["a"].astype(expected["a"].dtype)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
 
 def test_index_join_exception_cases():
@@ -1573,7 +1566,7 @@ def test_typecast_on_join_indexes():
 
     got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def test_typecast_on_join_multiindices():
@@ -1624,7 +1617,7 @@ def test_typecast_on_join_multiindices():
     expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"])
     got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def test_typecast_on_join_indexes_matching_categorical():
@@ -1651,7 +1644,7 @@ def test_typecast_on_join_indexes_matching_categorical():
     expect = expect.set_index("join_col")
     got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize(
@@ -1703,9 +1696,10 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs):
     expect = check_lhs.merge(check_rhs, how=how, **kwargs)
     got = lhs.merge(rhs, how=how, **kwargs)
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how)
 
 
+@pytest.mark.xfail(reason="Cannot sort values of list dtype")
 @pytest.mark.parametrize(
     "how", ["left", "inner", "right", "leftanti", "leftsemi"]
 )
@@ -1730,4 +1724,17 @@ def test_merge_with_lists(how):
     expect = pd_left.merge(pd_right, on="a")
     got = gd_left.merge(gd_right, on="a")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how)
+
+
+def test_join_renamed_index():
+    df = cudf.DataFrame(
+        {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]}
+    ).set_index([0, 1])
+    df.index.names = ["a", "b"]  # unique names also re-key df._index._data
+
+    expect = df.to_pandas().merge(
+        df.to_pandas(), left_index=True, right_index=True
+    )
+    got = df.merge(df, left_index=True, right_index=True, how="inner")
+    assert_join_results_equal(expect, got, how="inner")
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 8b1ad696f04..2ca6bc622be 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -17,6 +17,7 @@
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.column.string import StringColumn
 from cudf.core.index import StringIndex, as_index
+from cudf.tests.test_joining import assert_join_results_equal
 from cudf.tests.utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -919,16 +920,12 @@ def test_string_split(data, pat, n, expand):
 
 
 @pytest.mark.parametrize(
-    "str_data,str_data_raise",
-    [
-        ([], 0),
-        (["a", "b", "c", "d", "e"], 0),
-        ([None, None, None, None, None], 1),
-    ],
+    "str_data",
+    [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]],
 )
 @pytest.mark.parametrize("num_keys", [1, 2, 3])
 @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
-def test_string_join_key(str_data, str_data_raise, num_keys, how):
+def test_string_join_key(str_data, num_keys, how):
     other_data = [1, 2, 3, 4, 5][: len(str_data)]
 
     pdf = pd.DataFrame()
@@ -942,19 +939,17 @@ def test_string_join_key(str_data, str_data_raise, num_keys, how):
     pdf2 = pdf.copy()
     gdf2 = gdf.copy()
 
-    expectation = raise_builder(
-        [0 if how == "right" else str_data_raise], (AssertionError)
-    )
+    expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how)
+    got = gdf.merge(gdf2, on=list(range(num_keys)), how=how)
 
-    with expectation:
-        expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how)
-        got = gdf.merge(gdf2, on=list(range(num_keys)), how=how)
+    if len(expect) == 0 and len(got) == 0:
+        expect = expect.reset_index(drop=True)
+        got = got[expect.columns]  # reorder columns
 
-        if len(expect) == 0 and len(got) == 0:
-            expect = expect.reset_index(drop=True)
-            got = got[expect.columns]
+    if how == "right":
+        got = got[expect.columns]  # reorder columns
 
-        assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how)
 
 
 @pytest.mark.parametrize(
@@ -998,7 +993,7 @@ def test_string_join_key_nulls(str_data_nulls):
 
     expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize(
@@ -1027,7 +1022,10 @@ def test_string_join_non_key(str_data, num_cols, how):
         expect = expect.reset_index(drop=True)
         got = got[expect.columns]
 
-    assert_eq(expect, got)
+    if how == "right":
+        got = got[expect.columns]  # reorder columns
+
+    assert_join_results_equal(expect, got, how=how)
 
 
 @pytest.mark.parametrize(
@@ -1068,7 +1066,7 @@ def test_string_join_non_key_nulls(str_data_nulls):
         expect = expect.reset_index(drop=True)
         got = got[expect.columns]
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 def test_string_join_values_nulls():
@@ -1108,7 +1106,7 @@ def test_string_join_values_nulls():
     expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
     got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize(

From 8188ddbf2837caea731f7c4833945dfa9598b4b5 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Mon, 29 Mar 2021 19:16:38 -0500
Subject: [PATCH 16/20] Add Java bindings to join gather map APIs (#7751)

Adds Java bindings for the libcudf join APIs that return gather maps.  Depends upon #7454.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Robert (Bobby) Evans (@revans2)

URL: https://github.com/rapidsai/cudf/pull/7751
---
 .../main/java/ai/rapids/cudf/ColumnView.java  |  43 ++
 .../main/java/ai/rapids/cudf/GatherMap.java   |  85 ++++
 java/src/main/java/ai/rapids/cudf/Table.java  | 139 ++++++
 java/src/main/native/include/jni_utils.hpp    |  18 +-
 java/src/main/native/src/TableJni.cpp         | 376 ++++++++++++-----
 .../java/ai/rapids/cudf/GatherMapTest.java    | 100 +++++
 .../test/java/ai/rapids/cudf/TableTest.java   | 396 ++++++++++++++++++
 7 files changed, 1061 insertions(+), 96 deletions(-)
 create mode 100644 java/src/main/java/ai/rapids/cudf/GatherMap.java
 create mode 100644 java/src/test/java/ai/rapids/cudf/GatherMapTest.java

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 90fe3553abc..5d869ab75fb 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -2523,6 +2523,49 @@ public static ColumnView makeStructView(ColumnView... columns) {
     return makeStructView(columns[0].rows, columns);
   }
 
+  /**
+   * Create a new column view from a raw device buffer. Note that this will NOT copy
+   * the contents of the buffer but only creates a view. The view MUST NOT outlive
+   * the underlying device buffer. The column view will be created without a validity
+   * vector, so it is not possible to create a view containing null elements. Additionally
+   * only fixed-width primitive types are supported.
+   *
+   * @param buffer device memory that will back the column view
+   * @param startOffset byte offset into the device buffer where the column data starts
+   * @param type type of data in the column view
+   * @param rows number of data elements in the column view
+   * @return new column view instance that must not outlive the backing device buffer
+   */
+  public static ColumnView fromDeviceBuffer(BaseDeviceMemoryBuffer buffer,
+                                            long startOffset,
+                                            DType type,
+                                            int rows) {
+    if (buffer == null) {
+      throw new NullPointerException("buffer is null");
+    }
+    int typeSize = type.getSizeInBytes();
+    if (typeSize <= 0) {
+      throw new IllegalArgumentException("Unsupported type: " + type);
+    }
+    if (startOffset < 0) {
+      throw new IllegalArgumentException("Invalid start offset: " + startOffset);
+    }
+    if (rows < 0) {
+      throw new IllegalArgumentException("Invalid row count: " + rows);
+    }
+    long dataSize = (long) typeSize * rows; // widen first: int * int can overflow
+    if (dataSize > buffer.length - startOffset) { // overflow-safe form of the range check
+      throw new IllegalArgumentException("View extends beyond buffer range");
+    }
+    long dataAddress = buffer.getAddress() + startOffset;
+    if (dataAddress % typeSize != 0) {
+      throw new IllegalArgumentException("Data address " + Long.toHexString(dataAddress) +
+          " is misaligned relative to type size of " + typeSize + " bytes");
+    }
+    return new ColumnView(makeCudfColumnView(type.typeId.getNativeId(), type.getScale(),
+        dataAddress, dataSize, 0, 0, 0, rows, null));
+  }
+
   /**
    * Create a column of bool values indicating whether the specified scalar
    * is an element of each row of a list column.
diff --git a/java/src/main/java/ai/rapids/cudf/GatherMap.java b/java/src/main/java/ai/rapids/cudf/GatherMap.java
new file mode 100644
index 00000000000..12ff741bb69
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/GatherMap.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+/**
+ * This class tracks the data associated with a gather map, a buffer of INT32 elements that index
+ * a source table and can be passed to a table gather operation.
+ */
+public class GatherMap implements AutoCloseable {
+  private DeviceMemoryBuffer buffer;
+
+  /**
+   * Construct a gather map instance from a device buffer. The buffer length must be a multiple of
+   * the {@link DType#INT32} size, as each row of the gather map is an INT32.
+   * @param buffer device buffer backing the gather map data
+   */
+  public GatherMap(DeviceMemoryBuffer buffer) {
+    if (buffer.getLength() % DType.INT32.getSizeInBytes() != 0) {
+      throw new IllegalArgumentException("buffer length not a multiple of 4");
+    }
+    this.buffer = buffer;
+  }
+
+  /** Return the number of rows in the gather map */
+  public long getRowCount() {
+    ensureOpen();
+    return buffer.getLength() / 4;
+  }
+
+  /**
+   * Create a column view that can be used to perform a gather operation. Note that the resulting
+   * column view MUST NOT outlive the underlying device buffer within this instance!
+   * @param startRow row offset where the resulting gather map will start
+   * @param numRows number of rows in the resulting gather map
+   * @return column view of gather map data
+   */
+  public ColumnView toColumnView(long startRow, int numRows) {
+    ensureOpen();
+    return ColumnView.fromDeviceBuffer(buffer, startRow * 4, DType.INT32, numRows);
+  }
+
+  /**
+   * Release the underlying device buffer instance. After this is called, closing this instance
+   * will not close the underlying device buffer. It is the responsibility of the caller to close
+   * the returned device buffer.
+   * @return device buffer backing gather map data or null if the buffer has already been released
+   */
+  public DeviceMemoryBuffer releaseBuffer() {
+    DeviceMemoryBuffer result = buffer;
+    buffer = null;
+    return result;
+  }
+
+  /** Close the device buffer backing the gather map data. */
+  @Override
+  public void close() {
+    if (buffer != null) {
+      buffer.close();
+      buffer = null;
+    }
+  }
+
+  private void ensureOpen() {
+    if (buffer == null) {
+      throw new IllegalStateException("instance is closed");
+    }
+    if (buffer.closed) {
+      throw new IllegalStateException("buffer is closed");
+    }
+  }
+}
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 6e0b7d3bb94..fc6ad55044a 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -482,18 +482,33 @@ private static native long[] merge(long[] tableHandles, int[] sortKeyIndexes,
   private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                         int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
+  private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys,
+                                                  boolean compareNullsEqual) throws CudfException;
+
   private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                          int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
+  private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys,
+                                                   boolean compareNullsEqual) throws CudfException;
+
   private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable,
                                          int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
+  private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys,
+                                                  boolean compareNullsEqual) throws CudfException;
+
   private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable,
       int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
+  private static native long[] leftSemiJoinGatherMap(long leftKeys, long rightKeys,
+                                                     boolean compareNullsEqual) throws CudfException;
+
   private static native long[] leftAntiJoin(long leftTable, int[] leftJoinCols, long rightTable,
       int[] rightJoinCols, boolean compareNullsEqual) throws CudfException;
 
+  private static native long[] leftAntiJoinGatherMap(long leftKeys, long rightKeys,
+                                                     boolean compareNullsEqual) throws CudfException;
+
   private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException;
 
   private static native long[] concatenate(long[] cudfTablePointers) throws CudfException;
@@ -1925,6 +1940,130 @@ public Table gather(ColumnVector gatherMap, boolean checkBounds) {
     return new Table(gather(nativeHandle, gatherMap.getNativeView(), checkBounds));
   }
 
+  private GatherMap[] buildJoinGatherMaps(long[] gatherMapData) {
+    long bufferSize = gatherMapData[0];
+    long leftAddr = gatherMapData[1];
+    long leftHandle = gatherMapData[2];
+    long rightAddr = gatherMapData[3];
+    long rightHandle = gatherMapData[4];
+    GatherMap[] maps = new GatherMap[2];
+    maps[0] = new GatherMap(DeviceMemoryBuffer.fromRmm(leftAddr, bufferSize, leftHandle));
+    maps[1] = new GatherMap(DeviceMemoryBuffer.fromRmm(rightAddr, bufferSize, rightHandle));
+    return maps;
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a left equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the table argument represents the key columns from the right table. Two {@link GatherMap}
+   * instances will be returned that can be used to gather the left and right tables,
+   * respectively, to produce the result of the left join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightKeys join key columns from the right table
+   * @param compareNullsEqual true if null key values should match otherwise false
+   * @return left and right table gather maps
+   */
+  public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual) {
+    if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          ", rightKeys: " + rightKeys.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftJoinGatherMaps(getNativeView(), rightKeys.getNativeView(), compareNullsEqual);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of an inner equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the table argument represents the key columns from the right table. Two {@link GatherMap}
+   * instances will be returned that can be used to gather the left and right tables,
+   * respectively, to produce the result of the inner join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightKeys join key columns from the right table
+   * @param compareNullsEqual true if null key values should match otherwise false
+   * @return left and right table gather maps
+   */
+  public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqual) {
+    if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          ", rightKeys: " + rightKeys.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        innerJoinGatherMaps(getNativeView(), rightKeys.getNativeView(), compareNullsEqual);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  /**
+   * Computes the gather maps that can be used to manifest the result of a full equi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the table argument represents the key columns from the right table. Two {@link GatherMap}
+   * instances will be returned that can be used to gather the left and right tables,
+   * respectively, to produce the result of the full join.
+   * It is the responsibility of the caller to close the resulting gather map instances.
+   * @param rightKeys join key columns from the right table
+   * @param compareNullsEqual true if null key values should match otherwise false
+   * @return left and right table gather maps
+   */
+  public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual) {
+    if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          ", rightKeys: " + rightKeys.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        fullJoinGatherMaps(getNativeView(), rightKeys.getNativeView(), compareNullsEqual);
+    return buildJoinGatherMaps(gatherMapData);
+  }
+
+  private GatherMap buildSemiJoinGatherMap(long[] gatherMapData) {
+    long bufferSize = gatherMapData[0];
+    long leftAddr = gatherMapData[1];
+    long leftHandle = gatherMapData[2];
+    return new GatherMap(DeviceMemoryBuffer.fromRmm(leftAddr, bufferSize, leftHandle));
+  }
+
+  /**
+   * Computes the gather map that can be used to manifest the result of a left semi-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the table argument represents the key columns from the right table. The {@link GatherMap}
+   * instance returned can be used to gather the left table to produce the result of the
+   * left semi-join.
+   * It is the responsibility of the caller to close the resulting gather map instance.
+   * @param rightKeys join key columns from the right table
+   * @param compareNullsEqual true if null key values should match otherwise false
+   * @return left table gather map
+   */
+  public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqual) {
+    if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          ", rightKeys: " + rightKeys.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftSemiJoinGatherMap(getNativeView(), rightKeys.getNativeView(), compareNullsEqual);
+    return buildSemiJoinGatherMap(gatherMapData);
+  }
+
+  /**
+   * Computes the gather map that can be used to manifest the result of a left anti-join between
+   * two tables. It is assumed this table instance holds the key columns from the left table, and
+   * the table argument represents the key columns from the right table. The {@link GatherMap}
+   * instance returned can be used to gather the left table to produce the result of the
+   * left anti-join.
+   * It is the responsibility of the caller to close the resulting gather map instance.
+   * @param rightKeys join key columns from the right table
+   * @param compareNullsEqual true if null key values should match otherwise false
+   * @return left table gather map
+   */
+  public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqual) {
+    if (getNumberOfColumns() != rightKeys.getNumberOfColumns()) {
+      throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() +
+          ", rightKeys: " + rightKeys.getNumberOfColumns());
+    }
+    long[] gatherMapData =
+        leftAntiJoinGatherMap(getNativeView(), rightKeys.getNativeView(), compareNullsEqual);
+    return buildSemiJoinGatherMap(gatherMapData);
+  }
+
   /**
    * Convert this table of columns into a row major format that is useful for interacting with other
    * systems that do row major processing of the data. Currently only fixed-width column types are
diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index 84694c177a1..3ce136dda19 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -243,6 +243,22 @@ template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR> class nativ
     return data_ptr;
   }
 
+  const N_TYPE *const begin() const {
+    return data();
+  }
+
+  N_TYPE *begin() {
+    return data();
+  }
+
+  const N_TYPE *const end() const {
+    return data() + size();
+  }
+
+  N_TYPE *end() {
+    return data() + size();
+  }
+
   const J_ARRAY_TYPE get_jArray() const { return orig; }
 
   J_ARRAY_TYPE get_jArray() { return orig; }
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 6beedf54f5a..0e66cde3ee1 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -30,6 +30,7 @@
 #include <cudf/lists/explode.hpp>
 #include <cudf/merge.hpp>
 #include <cudf/partitioning.hpp>
+#include <cudf/replace.hpp>
 #include <cudf/reshape.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/search.hpp>
@@ -41,6 +42,8 @@
 #include "dtype_utils.hpp"
 #include "row_conversion.hpp"
 
+#include <algorithm>
+
 namespace cudf {
 namespace jni {
 
@@ -620,6 +623,116 @@ bool valid_window_parameters(native_jintArray const &values,
          values.size() == preceding.size() && values.size() == following.size();
 }
 
+// Generate gather maps needed to manifest the result of a join between two tables.
+// The resulting Java long array contains the following at each index:
+//   0: Size of each gather map in bytes
+//   1: Device address of the gather map for the left table
+//   2: Host address of the rmm::device_buffer instance that owns the left gather map data
+//   3: Device address of the gather map for the right table
+//   4: Host address of the rmm::device_buffer instance that owns the right gather map data
+template <typename T>
+jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
+                            jboolean compare_nulls_equal, T join_func) {
+  JNI_NULL_CHECK(env, j_left_keys, "left_table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_keys, "right_table is null", NULL);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
+    auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
+              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
+        join_maps = join_func(*left_keys, *right_keys, nulleq);
+
+    // release the underlying device buffer to Java
+    auto left_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.first->release());
+    auto right_map_buffer = std::make_unique<rmm::device_buffer>(join_maps.second->release());
+    cudf::jni::native_jlongArray result(env, 5);
+    result[0] = static_cast<jlong>(left_map_buffer->size());
+    result[1] = reinterpret_cast<jlong>(left_map_buffer->data());
+    result[2] = reinterpret_cast<jlong>(left_map_buffer.release());
+    result[3] = reinterpret_cast<jlong>(right_map_buffer->data());
+    result[4] = reinterpret_cast<jlong>(right_map_buffer.release());
+    return result.get_jArray();
+  }
+  CATCH_STD(env, NULL);
+}
+
+// Generate the single gather map needed to manifest the result of a join between two tables.
+// The resulting Java long array contains the following at each index:
+//   0: Size of the gather map in bytes
+//   1: Device address of the gather map
+//   2: Host address of the rmm::device_buffer instance that owns the gather map data
+template <typename T>
+jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
+                                  jboolean compare_nulls_equal, T join_func) {
+  JNI_NULL_CHECK(env, j_left_keys, "left_table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_keys, "right_table is null", NULL);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
+    auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    std::unique_ptr<rmm::device_uvector<cudf::size_type>> join_map =
+        join_func(*left_keys, *right_keys, nulleq);
+
+    // release the underlying device buffer to Java
+    auto gather_map_buffer = std::make_unique<rmm::device_buffer>(join_map->release());
+    cudf::jni::native_jlongArray result(env, 3);
+    result[0] = static_cast<jlong>(gather_map_buffer->size());
+    result[1] = reinterpret_cast<jlong>(gather_map_buffer->data());
+    result[2] = reinterpret_cast<jlong>(gather_map_buffer.release());
+    return result.get_jArray();
+  }
+  CATCH_STD(env, NULL);
+}
+
+// Returns a table view containing only the columns at the specified indices
+cudf::table_view const get_keys_table(cudf::table_view const *t,
+                                      native_jintArray const &key_indices) {
+  std::vector<cudf::column_view> key_cols;
+  key_cols.reserve(key_indices.size());
+  std::transform(key_indices.begin(), key_indices.end(), std::back_inserter(key_cols),
+                 [t](int idx) { return t->column(idx); });
+  return table_view(key_cols);
+}
+
+// Returns a table view containing only the columns that are NOT at the specified indices
+cudf::table_view const get_non_keys_table(cudf::table_view const *t,
+                                          native_jintArray const &key_indices) {
+  std::vector<int> non_key_indices;
+  for (int i = 0; i < t->num_columns(); ++i) {
+    if (std::find(key_indices.begin(), key_indices.end(), i) == key_indices.end()) {
+      non_key_indices.push_back(i);
+    }
+  }
+  std::vector<cudf::column_view> cols;
+  std::transform(non_key_indices.begin(), non_key_indices.end(), std::back_inserter(cols),
+                 [&t](int idx) { return t->column(idx); });
+  return table_view(cols);
+}
+
+// Combine left and right join results into a column pointer array that can be returned to the JVM.
+jlongArray combine_join_results(JNIEnv *env, std::vector<std::unique_ptr<cudf::column>> left_cols,
+                                std::vector<std::unique_ptr<cudf::column>> right_cols) {
+  cudf::jni::native_jlongArray outcol_handles(env, left_cols.size() + right_cols.size());
+  auto iter = std::transform(
+      left_cols.begin(), left_cols.end(), outcol_handles.begin(),
+      [](std::unique_ptr<cudf::column> &col) { return reinterpret_cast<jlong>(col.release()); });
+  std::transform(
+      right_cols.begin(), right_cols.end(), iter,
+      [](std::unique_ptr<cudf::column> &col) { return reinterpret_cast<jlong>(col.release()); });
+  return outcol_handles.get_jArray();
+}
+
+// Combine left and right join results into a column pointer array that can be returned to the JVM.
+jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results,
+                                cudf::table &right_results) {
+  std::vector<std::unique_ptr<cudf::column>> left_cols = left_results.release();
+  std::vector<std::unique_ptr<cudf::column>> right_cols = right_results.release();
+  return combine_join_results(env, std::move(left_cols), std::move(right_cols));
+}
+
 } // namespace
 
 } // namespace jni
@@ -1455,109 +1568,143 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jc
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin(
-    JNIEnv *env, jclass, jlong left_table, jintArray left_col_join_indices, jlong right_table,
-    jintArray right_col_join_indices, jboolean compare_nulls_equal) {
-  JNI_NULL_CHECK(env, left_table, "left_table is null", NULL);
-  JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL);
-  JNI_NULL_CHECK(env, right_table, "right_table is null", NULL);
-  JNI_NULL_CHECK(env, right_col_join_indices, "right_col_join_indices is null", NULL);
+    JNIEnv *env, jclass, jlong j_left_table, jintArray j_left_key_indices, jlong j_right_table,
+    jintArray j_right_key_indices, jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL);
+  JNI_NULL_CHECK(env, j_left_key_indices, "left_col_join_indices is null", NULL);
+  JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_key_indices, "right_col_join_indices is null", NULL);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_left_table = reinterpret_cast<cudf::table_view *>(left_table);
-    cudf::table_view *n_right_table = reinterpret_cast<cudf::table_view *>(right_table);
-    cudf::jni::native_jintArray left_join_cols_arr(env, left_col_join_indices);
-    std::vector<cudf::size_type> left_join_cols(
-        left_join_cols_arr.data(), left_join_cols_arr.data() + left_join_cols_arr.size());
-    cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices);
-    std::vector<cudf::size_type> right_join_cols(
-        right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size());
-
-    int dedupe_size = left_join_cols.size();
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> dedupe(dedupe_size);
-    for (int i = 0; i < dedupe_size; i++) {
-      dedupe[i].first = left_join_cols[i];
-      dedupe[i].second = right_join_cols[i];
-    }
-
-    std::unique_ptr<cudf::table> result =
-        cudf::left_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe,
-                        static_cast<bool>(compare_nulls_equal) ? cudf::null_equality::EQUAL :
-                                                                 cudf::null_equality::UNEQUAL);
-
-    return cudf::jni::convert_table_for_return(env, result);
+    auto left_in_table = reinterpret_cast<cudf::table_view *>(j_left_table);
+    auto right_in_table = reinterpret_cast<cudf::table_view *>(j_right_table);
+    cudf::jni::native_jintArray left_key_indices(env, j_left_key_indices);
+    auto left_keys_table = cudf::jni::get_keys_table(left_in_table, left_key_indices);
+    left_key_indices.cancel();
+    cudf::jni::native_jintArray right_key_indices(env, j_right_key_indices);
+    auto right_keys_table = cudf::jni::get_keys_table(right_in_table, right_key_indices);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+
+    // compute gather maps for the left and right tables that can produce the join result rows
+    auto join_maps = cudf::left_join(left_keys_table, right_keys_table, nulleq);
+    CUDF_EXPECTS(join_maps.first->size() <= std::numeric_limits<cudf::size_type>::max(),
+                 "join result exceeds maximum column length");
+    auto num_join_rows = static_cast<cudf::size_type>(join_maps.first->size());
+
+    // compute the join result rows for the left table columns
+    auto left_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows,
+                                             join_maps.first->data());
+    auto left_out_table =
+        cudf::gather(*left_in_table, left_gather_col, cudf::out_of_bounds_policy::DONT_CHECK);
+
+    // compute the join result rows for the right table columns
+    auto right_non_keys_table = cudf::jni::get_non_keys_table(right_in_table, right_key_indices);
+    right_key_indices.cancel();
+    auto right_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows,
+                                              join_maps.second->data());
+    auto right_out_table =
+        cudf::gather(right_non_keys_table, right_gather_col, cudf::out_of_bounds_policy::NULLIFY);
+
+    return cudf::jni::combine_join_results(env, *left_out_table, *right_out_table);
   }
   CATCH_STD(env, NULL);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin(
-    JNIEnv *env, jclass, jlong left_table, jintArray left_col_join_indices, jlong right_table,
-    jintArray right_col_join_indices, jboolean compare_nulls_equal) {
-  JNI_NULL_CHECK(env, left_table, "left_table is null", NULL);
-  JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL);
-  JNI_NULL_CHECK(env, right_table, "right_table is null", NULL);
-  JNI_NULL_CHECK(env, right_col_join_indices, "right_col_join_indices is null", NULL);
+    JNIEnv *env, jclass, jlong j_left_table, jintArray j_left_key_indices, jlong j_right_table,
+    jintArray j_right_key_indices, jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL);
+  JNI_NULL_CHECK(env, j_left_key_indices, "left_col_join_indices is null", NULL);
+  JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_key_indices, "right_col_join_indices is null", NULL);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_left_table = reinterpret_cast<cudf::table_view *>(left_table);
-    cudf::table_view *n_right_table = reinterpret_cast<cudf::table_view *>(right_table);
-    cudf::jni::native_jintArray left_join_cols_arr(env, left_col_join_indices);
-    std::vector<cudf::size_type> left_join_cols(
-        left_join_cols_arr.data(), left_join_cols_arr.data() + left_join_cols_arr.size());
-    cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices);
-    std::vector<cudf::size_type> right_join_cols(
-        right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size());
-
-    int dedupe_size = left_join_cols.size();
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> dedupe(dedupe_size);
-    for (int i = 0; i < dedupe_size; i++) {
-      dedupe[i].first = left_join_cols[i];
-      dedupe[i].second = right_join_cols[i];
-    }
-
-    std::unique_ptr<cudf::table> result =
-        cudf::inner_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe,
-                         static_cast<bool>(compare_nulls_equal) ? cudf::null_equality::EQUAL :
-                                                                  cudf::null_equality::UNEQUAL);
-
-    return cudf::jni::convert_table_for_return(env, result);
+    auto left_in_table = reinterpret_cast<cudf::table_view *>(j_left_table);
+    auto right_in_table = reinterpret_cast<cudf::table_view *>(j_right_table);
+    cudf::jni::native_jintArray left_key_indices(env, j_left_key_indices);
+    auto left_keys_table = cudf::jni::get_keys_table(left_in_table, left_key_indices);
+    left_key_indices.cancel();
+    cudf::jni::native_jintArray right_key_indices(env, j_right_key_indices);
+    auto right_keys_table = cudf::jni::get_keys_table(right_in_table, right_key_indices);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+
+    // compute gather maps for the left and right tables that can produce the join result rows
+    auto join_maps = cudf::inner_join(left_keys_table, right_keys_table, nulleq);
+    CUDF_EXPECTS(join_maps.first->size() <= std::numeric_limits<cudf::size_type>::max(),
+                 "join result exceeds maximum column length");
+    auto num_join_rows = static_cast<cudf::size_type>(join_maps.first->size());
+
+    // compute the join result rows for the left table columns
+    auto left_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows,
+                                             join_maps.first->data());
+    auto left_out_table =
+        cudf::gather(*left_in_table, left_gather_col, cudf::out_of_bounds_policy::DONT_CHECK);
+
+    // compute the join result rows for the right table columns
+    auto right_non_keys_table = cudf::jni::get_non_keys_table(right_in_table, right_key_indices);
+    right_key_indices.cancel();
+    auto right_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows,
+                                              join_maps.second->data());
+    auto right_out_table = cudf::gather(right_non_keys_table, right_gather_col,
+                                        cudf::out_of_bounds_policy::DONT_CHECK);
+
+    return cudf::jni::combine_join_results(env, *left_out_table, *right_out_table);
   }
   CATCH_STD(env, NULL);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin(
-    JNIEnv *env, jclass, jlong left_table, jintArray left_col_join_indices, jlong right_table,
-    jintArray right_col_join_indices, jboolean compare_nulls_equal) {
-  JNI_NULL_CHECK(env, left_table, "left_table is null", NULL);
-  JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL);
-  JNI_NULL_CHECK(env, right_table, "right_table is null", NULL);
-  JNI_NULL_CHECK(env, right_col_join_indices, "right_col_join_indices is null", NULL);
+    JNIEnv *env, jclass, jlong j_left_table, jintArray j_left_key_indices, jlong j_right_table,
+    jintArray j_right_key_indices, jboolean compare_nulls_equal) {
+  JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL);
+  JNI_NULL_CHECK(env, j_left_key_indices, "left_col_join_indices is null", NULL);
+  JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL);
+  JNI_NULL_CHECK(env, j_right_key_indices, "right_col_join_indices is null", NULL);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_left_table = reinterpret_cast<cudf::table_view *>(left_table);
-    cudf::table_view *n_right_table = reinterpret_cast<cudf::table_view *>(right_table);
-    cudf::jni::native_jintArray left_join_cols_arr(env, left_col_join_indices);
-    std::vector<cudf::size_type> left_join_cols(
-        left_join_cols_arr.data(), left_join_cols_arr.data() + left_join_cols_arr.size());
-    cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices);
-    std::vector<cudf::size_type> right_join_cols(
-        right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size());
-
-    int dedupe_size = left_join_cols.size();
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> dedupe(dedupe_size);
-    for (int i = 0; i < dedupe_size; i++) {
-      dedupe[i].first = left_join_cols[i];
-      dedupe[i].second = right_join_cols[i];
-    }
-
-    std::unique_ptr<cudf::table> result =
-        cudf::full_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe,
-                        static_cast<bool>(compare_nulls_equal) ? cudf::null_equality::EQUAL :
-                                                                 cudf::null_equality::UNEQUAL);
-
-    return cudf::jni::convert_table_for_return(env, result);
+    auto left_in_table = reinterpret_cast<cudf::table_view *>(j_left_table);
+    auto right_in_table = reinterpret_cast<cudf::table_view *>(j_right_table);
+    cudf::jni::native_jintArray left_key_indices(env, j_left_key_indices);
+    auto left_keys_table = cudf::jni::get_keys_table(left_in_table, left_key_indices);
+    cudf::jni::native_jintArray right_key_indices(env, j_right_key_indices);
+    auto right_keys_table = cudf::jni::get_keys_table(right_in_table, right_key_indices);
+    auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+
+    // compute gather maps for the left and right tables that can produce the join result rows
+    auto join_maps = cudf::full_join(left_keys_table, right_keys_table, nulleq);
+    CUDF_EXPECTS(join_maps.first->size() <= std::numeric_limits<cudf::size_type>::max(),
+                 "join result exceeds maximum column length");
+    auto num_join_rows = static_cast<cudf::size_type>(join_maps.first->size());
+
+    // compute the join result rows for the left table columns
+    auto left_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows,
+                                             join_maps.first->data());
+    auto left_out_table =
+        cudf::gather(*left_in_table, left_gather_col, cudf::out_of_bounds_policy::NULLIFY);
+    // Replace any nulls in the left key column results with the right key column results.
+    std::vector<std::unique_ptr<cudf::column>> result_cols = left_out_table->release();
+    auto right_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows,
+                                              join_maps.second->data());
+    for (int i = 0; i < left_key_indices.size(); ++i) {
+      std::unique_ptr<cudf::column> &colptr = result_cols[left_key_indices[i]];
+      auto right_key_col = right_in_table->column(right_key_indices[i]);
+      auto gathered = cudf::gather(cudf::table_view{{right_key_col}}, right_gather_col,
+                                   cudf::out_of_bounds_policy::NULLIFY);
+      auto replaced_col = cudf::replace_nulls(*colptr, gathered->get_column(0));
+      colptr.reset(replaced_col.release());
+    }
+    left_key_indices.cancel();
+
+    // compute the join result rows for the right table columns
+    auto right_non_keys_table = cudf::jni::get_non_keys_table(right_in_table, right_key_indices);
+    right_key_indices.cancel();
+    auto right_out_table =
+        cudf::gather(right_non_keys_table, right_gather_col, cudf::out_of_bounds_policy::NULLIFY);
+
+    return cudf::jni::combine_join_results(env, std::move(result_cols), right_out_table->release());
   }
   CATCH_STD(env, NULL);
 }
@@ -1580,13 +1727,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin(
     cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices);
     std::vector<cudf::size_type> right_join_cols(
         right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size());
-    std::vector<cudf::size_type> return_cols(n_left_table->num_columns());
-    for (cudf::size_type i = 0; i < n_left_table->num_columns(); ++i) {
-      return_cols[i] = i;
-    }
 
     std::unique_ptr<cudf::table> result = cudf::left_semi_join(
-        *n_left_table, *n_right_table, left_join_cols, right_join_cols, return_cols,
+        *n_left_table, *n_right_table, left_join_cols, right_join_cols,
         static_cast<bool>(compare_nulls_equal) ? cudf::null_equality::EQUAL :
                                                  cudf::null_equality::UNEQUAL);
 
@@ -1613,13 +1756,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin(
     cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices);
     std::vector<cudf::size_type> right_join_cols(
         right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size());
-    std::vector<cudf::size_type> return_cols(n_left_table->num_columns());
-    for (cudf::size_type i = 0; i < n_left_table->num_columns(); ++i) {
-      return_cols[i] = i;
-    }
 
     std::unique_ptr<cudf::table> result = cudf::left_anti_join(
-        *n_left_table, *n_right_table, left_join_cols, right_join_cols, return_cols,
+        *n_left_table, *n_right_table, left_join_cols, right_join_cols,
         static_cast<bool>(compare_nulls_equal) ? cudf::null_equality::EQUAL :
                                                  cudf::null_equality::UNEQUAL);
 
@@ -1628,6 +1767,51 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin(
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  return cudf::jni::join_gather_maps(
+      env, j_left_keys, j_right_keys, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
+        return cudf::left_join(left, right, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  return cudf::jni::join_gather_maps(
+      env, j_left_keys, j_right_keys, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
+        return cudf::inner_join(left, right, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps(
+    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  return cudf::jni::join_gather_maps(
+      env, j_left_keys, j_right_keys, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
+        return cudf::full_join(left, right, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoinGatherMap(
+    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  return cudf::jni::join_gather_single_map(
+      env, j_left_keys, j_right_keys, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
+        return cudf::left_semi_join(left, right, nulleq);
+      });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap(
+    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  return cudf::jni::join_gather_single_map(
+      env, j_left_keys, j_right_keys, compare_nulls_equal,
+      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
+        return cudf::left_anti_join(left, right, nulleq);
+      });
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass,
                                                                  jlong left_table,
                                                                  jlong right_table) {
@@ -1859,7 +2043,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas
     cudf::jni::auto_set_device(env);
     cudf::table_view *input = reinterpret_cast<cudf::table_view *>(j_input);
     cudf::column_view *map = reinterpret_cast<cudf::column_view *>(j_map);
-    std::unique_ptr<cudf::table> result = cudf::gather(*input, *map);
+    auto bounds_policy =
+        check_bounds ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK;
+    std::unique_ptr<cudf::table> result = cudf::gather(*input, *map, bounds_policy);
     return cudf::jni::convert_table_for_return(env, result);
   }
   CATCH_STD(env, 0);
diff --git a/java/src/test/java/ai/rapids/cudf/GatherMapTest.java b/java/src/test/java/ai/rapids/cudf/GatherMapTest.java
new file mode 100644
index 00000000000..b0e78a2c2cd
--- /dev/null
+++ b/java/src/test/java/ai/rapids/cudf/GatherMapTest.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class GatherMapTest {
+  @Test
+  void testInvalidBuffer() {
+    try (DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(707)) {
+      assertThrows(IllegalArgumentException.class, () -> new GatherMap(buffer));
+    }
+  }
+
+  @Test
+  void testRowCount() {
+    try (GatherMap map = new GatherMap(DeviceMemoryBuffer.allocate(700))) {
+      assertEquals(175, map.getRowCount());
+    }
+  }
+
+  @Test
+  void testClose() {
+    DeviceMemoryBuffer mockBuffer = Mockito.mock(DeviceMemoryBuffer.class);
+    GatherMap map = new GatherMap(mockBuffer);
+    map.close();
+    Mockito.verify(mockBuffer).close();
+  }
+
+  @Test
+  void testReleaseBuffer() {
+    DeviceMemoryBuffer mockBuffer = Mockito.mock(DeviceMemoryBuffer.class);
+    GatherMap map = new GatherMap(mockBuffer);
+    DeviceMemoryBuffer buffer = map.releaseBuffer();
+    assertSame(mockBuffer, buffer);
+    map.close();
+    Mockito.verify(mockBuffer, Mockito.never()).close();
+  }
+
+  @Test
+  void testInvalidColumnView() {
+    try (GatherMap map = new GatherMap(DeviceMemoryBuffer.allocate(1024))) {
+      assertThrows(IllegalArgumentException.class, () -> map.toColumnView(0, 257));
+      assertThrows(IllegalArgumentException.class, () -> map.toColumnView(257, 0));
+      assertThrows(IllegalArgumentException.class, () -> map.toColumnView(-4, 253));
+      assertThrows(IllegalArgumentException.class, () -> map.toColumnView(4, -2));
+    }
+  }
+
+  @Test
+  void testToColumnView() {
+    try (HostMemoryBuffer hostBuffer = HostMemoryBuffer.allocate(8 * 4)) {
+      hostBuffer.setInts(0, new int[]{10, 11, 12, 13, 14, 15, 16, 17}, 0, 8);
+      try (DeviceMemoryBuffer devBuffer = DeviceMemoryBuffer.allocate(8*4)) {
+        devBuffer.copyFromHostBuffer(hostBuffer);
+        devBuffer.incRefCount();  // the GatherMap closes the buffer too, so hold a second ref
+        try (GatherMap map = new GatherMap(devBuffer)) {
+          try (ColumnView view = map.toColumnView(0, 8);  // full view; closed to free its handle
+               HostMemoryBuffer viewHostBuffer = HostMemoryBuffer.allocate(8 * 4)) {
+            assertEquals(DType.INT32, view.getType());
+            assertEquals(0, view.getNullCount());
+            assertEquals(8, view.getRowCount());
+            viewHostBuffer.copyFromDeviceBuffer(view.getData());
+            for (int i = 0; i < 8; i++) {
+              assertEquals(i + 10, viewHostBuffer.getInt(4*i));
+            }
+          }
+          try (ColumnView view = map.toColumnView(3, 2);  // sub-range view: rows 3 and 4
+               HostMemoryBuffer viewHostBuffer = HostMemoryBuffer.allocate(8)) {
+            assertEquals(DType.INT32, view.getType());
+            assertEquals(0, view.getNullCount());
+            assertEquals(2, view.getRowCount());
+            viewHostBuffer.copyFromDeviceBuffer(view.getData());
+            assertEquals(13, viewHostBuffer.getInt(0));
+            assertEquals(14, viewHostBuffer.getInt(4));
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index b6350a207c1..ac71f96d3c3 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -929,6 +929,51 @@ void testLeftJoin() {
     }
   }
 
+  @Test
+  void testLeftJoinLeftEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder()
+        .column(emptyInts)
+        .column(emptyInts)
+        .build();
+         Table rightTable = new Table.TestBuilder()
+             .column(306, 301, 360, 109, 335, 254, 317, 361, 251, 326)
+             .column( 20,  21,  22,  23,  24,  25,  26,  27,  28,  29)
+             .build();
+         Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true);
+         Table expected = new Table.TestBuilder()
+             .column(emptyInts) // common
+             .column(emptyInts) // left
+             .column(emptyInts) // right
+             .build()) {
+      assertTablesAreEqual(expected, joinedTable);
+    }
+  }
+
+  @Test
+  void testLeftJoinRightEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    final Integer[] nullInts = new Integer[10];
+    // A freshly allocated Integer[] is all nulls: no left row has a match in the empty right table.
+    try (Table leftTable = new Table.TestBuilder()
+        .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317)
+        .column( 10,  11,  12,  13,  14,  15,  16,  17,  18,  19)
+        .build();
+         Table rightTable = new Table.TestBuilder()
+             .column(emptyInts)
+             .column(emptyInts)
+             .build();
+         Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true);
+         Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true));
+         Table expected = new Table.TestBuilder()
+             .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common
+             .column( 10,  11,  12,  13,  14,  15,  16,  17,  18,  19) // left
+             .column(nullInts) // right
+             .build()) {
+      assertTablesAreEqual(expected, orderedJoinedTable);
+    }
+  }
+
   @Test
   void testFullJoinWithNonCommonKeys() {
     try (Table leftTable = new Table.TestBuilder()
@@ -950,6 +995,46 @@ void testFullJoinWithNonCommonKeys() {
     }
   }
 
+  @Test
+  void testFullJoinLeftEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    final Integer[] nullInts = new Integer[6];
+    try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table rightTable = new Table.TestBuilder()
+             .column(  6,   5,   9,   8,  10,  32)
+             .column(200, 201, 202, 203, 204, 205)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(   5,    6,    8,    9,   10,   32) // common
+             .column(nullInts) // left
+             .column( 201,  200,  203,  202,  204,  205) // right
+             .build();
+         Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), true);
+         Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true))) {
+      assertTablesAreEqual(expected, orderedJoinedTable);
+    }
+  }
+
+  @Test
+  void testFullJoinRightEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    final Integer[] nullInts = new Integer[10];
+    try (Table leftTable = new Table.TestBuilder()
+        .column(  2,   3,   9,   0,   1,   7,   4,   6,   5,   8)
+        .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109)
+        .build();
+         Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table expected = new Table.TestBuilder()
+             .column(   0,    1,    2,    3,    4,   5,   6,    7,   8,   9) // common
+             .column( 103,  104,  100,  101,  106, 108, 107,  105, 109, 102) // left
+             .column(nullInts) // right
+             .build();
+         Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), true);
+         Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true))) {
+      assertTablesAreEqual(expected, orderedJoinedTable);
+    }
+  }
+
   @Test
   void testFullJoinOnNullKeys() {
     try (Table leftTable = new Table.TestBuilder()
@@ -1028,6 +1113,36 @@ void testInnerJoinWithNonCommonKeys() {
     }
   }
 
+  @Test
+  void testInnerJoinLeftEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table rightTable = new Table.TestBuilder()  // only the left side is empty in this case
+             .column(  2,   3,   9,   0,   1,   7,   4,   6,   5,   8)
+             .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(emptyInts).column(emptyInts).column(emptyInts).build();
+         Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), true)) {
+      assertTablesAreEqual(expected, joinedTable);
+    }
+  }
+
+  @Test
+  void testInnerJoinRightEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder()
+        .column(  2,   3,   9,   0,   1,   7,   4,   6,   5,   8)
+        .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109)
+        .build();
+         Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table expected = new Table.TestBuilder()
+             .column(emptyInts).column(emptyInts).column(emptyInts).build();
+         Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), true)) {
+      assertTablesAreEqual(expected, joinedTable);
+    }
+  }
+
   @Test
   void testInnerJoinOnNullKeys() {
     try (Table leftTable = new Table.TestBuilder()
@@ -1104,6 +1219,32 @@ void testLeftSemiJoin() {
     }
   }
 
+  @Test
+  void testLeftSemiJoinLeftEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table rightTable = new Table.TestBuilder()
+             .column(  6,   5,   9,   8,  10,  32)
+             .column(201, 202, 203, 204, 205, 206)
+             .build();
+         Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), true)) {
+      assertTablesAreEqual(leftTable, joinedTable);
+    }
+  }
+
+  @Test
+  void testLeftSemiJoinRightEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder()
+        .column(  2,   3,   9,   0,   1,   7,   4,   6,   5,   8)
+        .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109).build();
+         Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table expected = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), true)) {
+      assertTablesAreEqual(expected, joinedTable);  // no left row can match an empty right side
+    }
+  }
+
   @Test
   void testLeftSemiJoinWithNulls() {
     try (Table leftTable = new Table.TestBuilder()
@@ -1179,6 +1320,32 @@ void testLeftAntiJoin() {
     }
   }
 
+  @Test
+  void testLeftAntiJoinLeftEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table rightTable = new Table.TestBuilder()
+             .column(  6,   5,   9,   8,  10,  32)
+             .column(201, 202, 203, 204, 205, 206)
+             .build();
+         Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), true)) {
+      assertTablesAreEqual(leftTable, joinedTable);
+    }
+  }
+
+  @Test
+  void testLeftAntiJoinRightEmpty() {
+    final Integer[] emptyInts = new Integer[0];
+    try (Table leftTable = new Table.TestBuilder()
+        .column(  2,   3,   9,   0,   1,   7,   4,   6,   5,   8)
+        .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109)
+        .build();
+         Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build();
+         Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), true)) {
+      assertTablesAreEqual(leftTable, joinedTable);
+    }
+  }
+
   @Test
   void testLeftAntiJoinOnNullKeys() {
     try (Table leftTable = new Table.TestBuilder()
@@ -1255,6 +1422,215 @@ void testCrossJoin() {
     }
   }
 
+  private void verifyJoinGatherMaps(GatherMap[] maps, Table expected) {
+    assertEquals(2, maps.length);
+    int numRows = (int) expected.getRowCount();
+    assertEquals(numRows, maps[0].getRowCount());
+    assertEquals(numRows, maps[1].getRowCount());
+    try (ColumnVector leftMap = maps[0].toColumnView(0, numRows).copyToColumnVector();
+         ColumnVector rightMap = maps[1].toColumnView(0, numRows).copyToColumnVector();
+         Table result = new Table(leftMap, rightMap);
+         Table orderedResult = result.orderBy(OrderByArg.asc(0, true))) {
+      assertTablesAreEqual(expected, orderedResult);
+    }
+  }
+
+  private void verifySemiJoinGatherMap(GatherMap map, Table expected) {
+    int numRows = (int) expected.getRowCount();
+    assertEquals(numRows, map.getRowCount());
+    try (ColumnVector leftMap = map.toColumnView(0, numRows).copyToColumnVector();
+         Table result = new Table(leftMap);
+         Table orderedResult = result.orderBy(OrderByArg.asc(0, true))) {
+      assertTablesAreEqual(expected, orderedResult);
+    }
+  }
+
+  @Test
+  void testLeftJoinGatherMaps() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 8, 9)
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3)
+             .build()) {
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightKeys, false);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testLeftJoinGatherMapsNulls() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+            .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+            .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(  0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightKeys, true);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerJoinGatherMaps() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .column(2, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightKeys, false);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testInnerJoinGatherMapsNulls() {
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 7, 8, 8, 9) // left
+             .column(2, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightKeys, true);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullJoinGatherMaps() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build();
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightKeys, false);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testFullJoinGatherMapsNulls() {
+    final int inv = Integer.MIN_VALUE;
+    try (Table leftKeys = new Table.TestBuilder()
+             .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+             .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(inv, inv,   0,   1, 2,   3,   4,   5,   6, 7, 7, 8, 8, 9) // left
+             .column(  4,   5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right
+             .build()) {
+      GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightKeys, true);
+      try {
+        verifyJoinGatherMaps(maps, expected);
+      } finally {
+        for (GatherMap map : maps) {
+          map.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void testLeftSemiJoinGatherMap() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .build();
+         GatherMap map = leftKeys.leftSemiJoinGatherMap(rightKeys, false)) {
+      verifySemiJoinGatherMap(map, expected);
+    }
+  }
+
+  @Test
+  void testLeftSemiJoinGatherMapNulls() {
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(2, 7, 8, 9) // left
+             .build();
+         GatherMap map = leftKeys.leftSemiJoinGatherMap(rightKeys, true)) {
+      verifySemiJoinGatherMap(map, expected);
+    }
+  }
+
+  @Test
+  void testAntiSemiJoinGatherMap() {
+    try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();
+         Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build();
+         Table expected = new Table.TestBuilder()
+             .column(0, 1, 3, 4, 5, 6) // left
+             .build();
+         GatherMap map = leftKeys.leftAntiJoinGatherMap(rightKeys, false)) {
+      verifySemiJoinGatherMap(map, expected);
+    }
+  }
+
+  @Test
+  void testAntiSemiJoinGatherMapNulls() {
+    try (Table leftKeys = new Table.TestBuilder()
+        .column(2, 3, 9, 0, 1, 7, 4, null, null, 8)
+        .build();
+         Table rightKeys = new Table.TestBuilder()
+             .column(null, null, 9, 8, 10, 32)
+             .build();
+         Table expected = new Table.TestBuilder()
+             .column(0, 1, 3, 4, 5, 6) // left
+             .build();
+         GatherMap map = leftKeys.leftAntiJoinGatherMap(rightKeys, true)) {
+      verifySemiJoinGatherMap(map, expected);
+    }
+  }
+
   @Test
   void testBoundsNulls() {
     boolean[] descFlags = new boolean[1];
@@ -3988,6 +4364,26 @@ void testSimpleGather() {
     }
   }
 
+  @Test
+  void testBoundsCheckedGather() {
+    try (Table testTable = new Table.TestBuilder()
+            .column(1, 2, 3, 4, 5)
+            .column("A", "AA", "AAA", "AAAA", "AAAAA")
+            .decimal32Column(-3, 1, 2, 3, 4, 5)
+            .decimal64Column(-8, 100001L, 200002L, 300003L, 400004L, 500005L)
+            .build();
+         ColumnVector gatherMap = ColumnVector.fromInts(0, 100, 4, -2);
+         Table expected = new Table.TestBuilder()
+                 .column(1, null, 5, 4)
+                 .column("A", null, "AAAAA", "AAAA")
+                 .decimal32Column(-3, 1, null, 5, 4)
+                 .decimal64Column(-8, 100001L, null, 500005L, 400004L)
+                 .build();
+         Table found = testTable.gather(gatherMap)) {
+      assertTablesAreEqual(expected, found);
+    }
+  }
+
   @Test
   void testMaskWithoutValidity() {
     try (ColumnVector mask = ColumnVector.fromBoxedBooleans(true, false, true, false, true);

From 2d24a9b0060025ebbefaaa102c1fcb8e3ea6a978 Mon Sep 17 00:00:00 2001
From: Jake Hemstad <jhemstad@nvidia.com>
Date: Mon, 29 Mar 2021 20:33:29 -0500
Subject: [PATCH 17/20] Disable column_view data accessors for unsupported
 types (#7725)

Fixes https://github.com/rapidsai/cudf/issues/7712

`column_view` provides data accessors like `column_view::data<T>` and `column_view::begin<T>`. These accessors are only valid for fixed-width primitive types that can be constructed by simply casting the underlying `void*` to `T*`.

However, the accessors never actually enforced this rule, e.g., `column_view::data<struct_view>` should fail to compile.

This PR disables these accessors for invalid types.

This uncovered a number of places that were erroneously instantiating `column_view` accessors, which would lead to silent failures (e.g., `scatter` was failing silently for `struct` columns).

I added a few new things to aid me in this effort:

- `CUDF_ENABLE_IF` macro to make it easier to SFINAE.
- `is_rep_layout_compatible<T>()` to identify types that are layout compatible with their rep (e.g., `duration_ns` is layout compatible with its `int64_t` rep, whereas the `decimal32` type is _not_ layout compatible with its `int32_t` rep).
- `column_device_view::has_element_accessor<T>()` identifies if `column_device_view::element<T>()` has a valid overload.

Authors:
  - Jake Hemstad (@jrhemstad)

Approvers:
  - Christopher Harris (@cwharris)
  - Conor Hoekstra (@codereport)
  - Vyas Ramasubramani (@vyasr)

URL: https://github.com/rapidsai/cudf/pull/7725
---
 .../type_dispatcher_benchmark.cu              |  17 +-
 cpp/include/cudf/ast/detail/transform.cuh     |  26 +-
 .../cudf/column/column_device_view.cuh        | 362 +++++++++++-------
 cpp/include/cudf/column/column_view.hpp       |  46 ++-
 cpp/include/cudf/detail/gather.cuh            | 193 +++++-----
 cpp/include/cudf/detail/scatter.cuh           |  56 +--
 cpp/include/cudf/lists/detail/scatter.cuh     |  36 +-
 cpp/include/cudf/table/row_operators.cuh      |  40 +-
 cpp/include/cudf/utilities/traits.hpp         |  30 ++
 cpp/src/copying/copy.cu                       |  30 +-
 cpp/src/copying/copy_range.cu                 |  44 ++-
 cpp/src/filling/fill.cu                       |  12 +-
 cpp/src/interop/dlpack.cpp                    |   9 +-
 cpp/src/interop/from_arrow.cpp                |  14 +-
 cpp/src/interop/to_arrow.cpp                  |   9 +-
 cpp/src/jit/type.cpp                          |  21 +-
 cpp/src/merge/merge.cu                        |  15 +-
 cpp/src/replace/nulls.cu                      |   5 +-
 cpp/tests/copying/copy_tests.cu               |   5 +-
 19 files changed, 617 insertions(+), 353 deletions(-)

diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
index df3a373c576..18ef5a1168e 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu
@@ -27,6 +27,7 @@
 
 #include <cudf/detail/utilities/cuda.cuh>
 
+#include <cudf/utilities/traits.hpp>
 #include <random>
 #include <type_traits>
 #include "../fixture/benchmark_fixture.hpp"
@@ -87,7 +88,7 @@ __global__ void host_dispatching_kernel(mutable_column_device_view source_column
 
 template <FunctorType functor_type>
 struct ColumnHandle {
-  template <typename ColumnType>
+  template <typename ColumnType, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<ColumnType>())>
   void operator()(mutable_column_device_view source_column, int work_per_thread)
   {
     cudf::detail::grid_1d grid_config{source_column.size(), block_size};
@@ -95,6 +96,12 @@ struct ColumnHandle {
     // Launch the kernel.
     host_dispatching_kernel<functor_type, ColumnType><<<grid_size, block_size>>>(source_column);
   }
+
+  template <typename ColumnType, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<ColumnType>())>
+  void operator()(mutable_column_device_view source_column, int work_per_thread)
+  {
+    CUDF_FAIL("Invalid type to benchmark.");
+  }
 };
 
 // The following is for DEVICE_DISPATCHING:
@@ -104,12 +111,18 @@ struct ColumnHandle {
 // n_rows * n_cols.
 template <FunctorType functor_type>
 struct RowHandle {
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
   __device__ void operator()(mutable_column_device_view source, cudf::size_type index)
   {
     using F                 = Functor<T, functor_type>;
     source.data<T>()[index] = F::f(source.data<T>()[index]);
   }
+
+  template <typename T, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<T>())>
+  __device__ void operator()(mutable_column_device_view source, cudf::size_type index)
+  {
+    cudf_assert(false && "Unsupported type.");
+  }
 };
 
 // This is for DEVICE_DISPATCHING
diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh
index 2719a8b5077..da15ac07c63 100644
--- a/cpp/include/cudf/ast/detail/transform.cuh
+++ b/cpp/include/cudf/ast/detail/transform.cuh
@@ -20,10 +20,12 @@
 #include <cudf/ast/operators.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -55,10 +57,19 @@ struct row_output {
    * @param row_index Row index of data column.
    * @param result Value to assign to output.
    */
-  template <typename Element>
+  template <typename Element, CUDF_ENABLE_IF(is_rep_layout_compatible<Element>())>
   __device__ void resolve_output(detail::device_data_reference device_data_reference,
                                  cudf::size_type row_index,
                                  Element result) const;
+  // Definition below after row_evaluator is a complete type
+
+  template <typename Element, CUDF_ENABLE_IF(not is_rep_layout_compatible<Element>())>
+  __device__ void resolve_output(detail::device_data_reference device_data_reference,
+                                 cudf::size_type row_index,
+                                 Element result) const
+  {
+    cudf_assert(false && "Invalid type in resolve_output.");
+  }
 
  private:
   row_evaluator const& evaluator;
@@ -167,7 +178,7 @@ struct row_evaluator {
    * @param row_index Row index of data column.
    * @return Element
    */
-  template <typename Element>
+  template <typename Element, CUDF_ENABLE_IF(column_device_view::has_element_accessor<Element>())>
   __device__ Element resolve_input(detail::device_data_reference device_data_reference,
                                    cudf::size_type row_index) const
   {
@@ -187,6 +198,15 @@ struct row_evaluator {
     }
   }
 
+  template <typename Element,
+            CUDF_ENABLE_IF(not column_device_view::has_element_accessor<Element>())>
+  __device__ Element resolve_input(detail::device_data_reference device_data_reference,
+                                   cudf::size_type row_index) const
+  {
+    cudf_assert(false && "Unsupported type in resolve_input.");
+    return {};
+  }
+
   /**
    * @brief Callable to perform a unary operation.
    *
@@ -249,7 +269,7 @@ struct row_evaluator {
   mutable_column_device_view* output_column;
 };
 
-template <typename Element>
+template <typename Element, std::enable_if_t<is_rep_layout_compatible<Element>()>*>
 __device__ void row_output::resolve_output(detail::device_data_reference device_data_reference,
                                            cudf::size_type row_index,
                                            Element result) const
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 14d44b77fad..a842e51c94a 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -71,10 +71,14 @@ class alignas(16) column_device_view_base {
    * a column, and instead, accessing the elements should be done via
    *`data<T>()`.
    *
+   * This function will only participate in overload resolution if `is_rep_layout_compatible<T>()`
+   * or `std::is_same<T,void>::value` are true.
+   *
    * @tparam The type to cast to
    * @return T const* Typed pointer to underlying data
    */
-  template <typename T = void>
+  template <typename T = void,
+            CUDF_ENABLE_IF(std::is_same<T, void>::value or is_rep_layout_compatible<T>())>
   __host__ __device__ T const* head() const noexcept
   {
     return static_cast<T const*>(_data);
@@ -89,10 +93,13 @@ class alignas(16) column_device_view_base {
    * For columns with children, the pointer returned is undefined
    * and should not be used.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
+   *
    * @tparam T The type to cast to
    * @return T const* Typed pointer to underlying data, including the offset
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   __host__ __device__ T const* data() const noexcept
   {
     return head<T>() + _offset;
@@ -235,6 +242,18 @@ class alignas(16) column_device_view_base {
     : _type{type}, _size{size}, _data{data}, _null_mask{null_mask}, _offset{offset}
   {
   }
+
+  template <typename C, typename T, typename = void>
+  struct has_element_accessor_impl : std::false_type {
+  };
+
+  template <typename C, typename T>
+  struct has_element_accessor_impl<
+    C,
+    T,
+    void_t<decltype(std::declval<C>().template element<T>(std::declval<size_type>()))>>
+    : std::true_type {
+  };
 };
 
 // Forward declaration
@@ -283,15 +302,145 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    *
    * This function accounts for the offset.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false. Specializations of this function may exist for types `T` where
+   *`is_rep_layout_compatible<T>` is false.
+   *
    * @tparam T The element type
    * @param element_index Position of the desired element
    */
-  template <typename T>
-  __device__ T const element(size_type element_index) const noexcept
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
+  __device__ T element(size_type element_index) const noexcept
   {
     return data<T>()[element_index];
   }
 
+  /**
+   * @brief Returns `string_view` to the string element at the specified index.
+   *
+   * If the element at the specified index is NULL, i.e., `is_null(element_index)
+   * == true`, then any attempt to use the result will lead to undefined behavior.
+   *
+   * This function accounts for the offset.
+   *
+   * @param element_index Position of the desired string element
+   * @return string_view instance representing this element at this index
+   */
+  template <typename T, CUDF_ENABLE_IF(std::is_same<T, string_view>::value)>
+  __device__ T element(size_type element_index) const noexcept
+  {
+    size_type index = element_index + offset();  // account for this view's _offset
+    const int32_t* d_offsets =
+      d_children[strings_column_view::offsets_column_index].data<int32_t>();
+    const char* d_strings = d_children[strings_column_view::chars_column_index].data<char>();
+    size_type offset      = d_offsets[index];
+    return string_view{d_strings + offset, d_offsets[index + 1] - offset};
+  }
+
+ private:
+  /**
+   * @brief Dispatch functor for resolving the index value for a dictionary element.
+   *
+   * The basic dictionary elements are the indices which can be any index type.
+   */
+  struct index_element_fn {
+    template <typename IndexType,
+              CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_unsigned<IndexType>::value)>
+    __device__ size_type operator()(column_device_view const& indices, size_type index)
+    {
+      return static_cast<size_type>(indices.element<IndexType>(index));
+    }
+
+    template <typename IndexType,
+              typename... Args,
+              CUDF_ENABLE_IF(not(is_index_type<IndexType>() and
+                                 std::is_unsigned<IndexType>::value))>
+    __device__ size_type operator()(Args&&... args)
+    {
+      cudf_assert(false and "dictionary indices must be an unsigned integral type");
+      return 0;
+    }
+  };
+
+ public:
+  /**
+   * @brief Returns `dictionary32` element at the specified index for a
+   * dictionary column.
+   *
+   * `dictionary32` is a strongly typed wrapper around an `int32_t` value that holds the
+   * offset into the dictionary keys for the specified element.
+   *
+   * For example, given a dictionary column `d` with:
+   * ```c++
+   * keys: {"foo", "bar", "baz"}
+   * indices: {2, 0, 2, 1, 0}
+   *
+   * d.element<dictionary32>(0) == dictionary32{2};
+   * d.element<dictionary32>(1) == dictionary32{0};
+   * ```
+   *
+   * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
+   * then any attempt to use the result will lead to undefined behavior.
+   *
+   * This function accounts for the offset.
+   *
+   * @param element_index Position of the desired element
+   * @return dictionary32 instance representing this element at this index
+   */
+  template <typename T, CUDF_ENABLE_IF(std::is_same<T, dictionary32>::value)>
+  __device__ T element(size_type element_index) const noexcept
+  {
+    size_type index    = element_index + offset();  // account for this view's _offset
+    auto const indices = d_children[0];
+    return dictionary32{type_dispatcher(indices.type(), index_element_fn{}, indices, index)};
+  }
+
+  /**
+   * @brief Returns a `numeric::decimal32` element at the specified index for a `fixed_point`
+   * column.
+   *
+   * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
+   * then any attempt to use the result will lead to undefined behavior.
+   *
+   * @param element_index Position of the desired element
+   * @return numeric::decimal32 representing the element at this index
+   */
+  template <typename T, CUDF_ENABLE_IF(std::is_same<T, numeric::decimal32>::value)>
+  __device__ T element(size_type element_index) const noexcept
+  {
+    using namespace numeric;
+    auto const scale = scale_type{_type.scale()};
+    return decimal32{scaled_integer<int32_t>{data<int32_t>()[element_index], scale}};
+  }
+
+  /**
+   * @brief Returns a `numeric::decimal64` element at the specified index for a `fixed_point`
+   * column.
+   *
+   * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
+   * then any attempt to use the result will lead to undefined behavior.
+   *
+   * @param element_index Position of the desired element
+   * @return numeric::decimal64 representing the element at this index
+   */
+  template <typename T, CUDF_ENABLE_IF(std::is_same<T, numeric::decimal64>::value)>
+  __device__ T element(size_type element_index) const noexcept
+  {
+    using namespace numeric;
+    auto const scale = scale_type{_type.scale()};
+    return decimal64{scaled_integer<int64_t>{data<int64_t>()[element_index], scale}};
+  }
+
+  /**
+   * @brief For a given `T`, indicates if `column_device_view::element<T>()` has a valid overload.
+   *
+   */
+  template <typename T>
+  static constexpr bool has_element_accessor()
+  {
+    return has_element_accessor_impl<column_device_view, T>::value;
+  }
+
   /**
    * @brief Iterator for navigating this column
    */
@@ -306,9 +455,12 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * with columns where `has_nulls() == true` will result in undefined behavior
    * when accessing null elements.
    *
+   * This function does not participate in overload resolution if
+   * `column_device_view::has_element_accessor<T>()` is false.
+   *
    * For columns with null elements, use `make_null_replacement_iterator`.
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   const_iterator<T> begin() const
   {
     return const_iterator<T>{count_it{0}, detail::value_accessor<T>{*this}};
@@ -321,9 +473,12 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * with columns where `has_nulls() == true` will result in undefined behavior
    * when accessing null elements.
    *
+   * This function does not participate in overload resolution if
+   * `column_device_view::has_element_accessor<T>()` is false.
+   *
    * For columns with null elements, use `make_null_replacement_iterator`.
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   const_iterator<T> end() const
   {
     return const_iterator<T>{count_it{size()}, detail::value_accessor<T>{*this}};
@@ -357,11 +512,16 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * Else, if the element at `i` is null, then the value of `p.first` is
    * undefined and `p.second == false`.
    *
+   * This function does not participate in overload resolution if
+   * `column_device_view::has_element_accessor<T>()` is false.
+   *
    * @throws cudf::logic_error if tparam `has_nulls == true` and
    * `nullable() == false`
    * @throws cudf::logic_error if column datatype and Element type mismatch.
    */
-  template <typename T, bool has_nulls>
+  template <typename T,
+            bool has_nulls,
+            CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   const_pair_iterator<T, has_nulls> pair_begin() const
   {
     return const_pair_iterator<T, has_nulls>{count_it{0},
@@ -382,11 +542,16 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * Else, if the element at `i` is null, then the value of `p.first` is
    * undefined and `p.second == false`.
    *
+   * This function does not participate in overload resolution if
+   * `column_device_view::has_element_accessor<T>()` is false.
+   *
    * @throws cudf::logic_error if tparam `has_nulls == true` and
    * `nullable() == false`
    * @throws cudf::logic_error if column datatype and Element type mismatch.
    */
-  template <typename T, bool has_nulls>
+  template <typename T,
+            bool has_nulls,
+            CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   const_pair_rep_iterator<T, has_nulls> pair_rep_begin() const
   {
     return const_pair_rep_iterator<T, has_nulls>{count_it{0},
@@ -397,11 +562,16 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @brief Return a pair iterator to the element following the last element of
    * the column.
    *
+   * This function does not participate in overload resolution if
+   * `column_device_view::has_element_accessor<T>()` is false.
+   *
    * @throws cudf::logic_error if tparam `has_nulls == true` and
    * `nullable() == false`
    * @throws cudf::logic_error if column datatype and Element type mismatch.
    */
-  template <typename T, bool has_nulls>
+  template <typename T,
+            bool has_nulls,
+            CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   const_pair_iterator<T, has_nulls> pair_end() const
   {
     return const_pair_iterator<T, has_nulls>{count_it{size()},
@@ -412,11 +582,16 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @brief Return a pair iterator to the element following the last element of
    * the column.
    *
+   * This function does not participate in overload resolution if
+   * `column_device_view::has_element_accessor<T>()` is false.
+   *
    * @throws cudf::logic_error if tparam `has_nulls == true` and
    * `nullable() == false`
    * @throws cudf::logic_error if column datatype and Element type mismatch.
    */
-  template <typename T, bool has_nulls>
+  template <typename T,
+            bool has_nulls,
+            CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
   const_pair_rep_iterator<T, has_nulls> pair_rep_end() const
   {
     return const_pair_rep_iterator<T, has_nulls>{count_it{size()},
@@ -549,6 +724,9 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @brief Returns pointer to the base device memory allocation casted to
    * the specified type.
    *
+   * This function will only participate in overload resolution if `is_rep_layout_compatible<T>()`
+   * or `std::is_same<T,void>::value` are true.
+   *
    * @note If `offset() == 0`, then `head<T>() == data<T>()`
    *
    * @note It should be rare to need to access the `head<T>()` allocation of
@@ -558,7 +736,8 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @tparam The type to cast to
    * @return T* Typed pointer to underlying data
    */
-  template <typename T = void>
+  template <typename T = void,
+            CUDF_ENABLE_IF(std::is_same<T, void>::value or is_rep_layout_compatible<T>())>
   __host__ __device__ T* head() const noexcept
   {
     return const_cast<T*>(detail::column_device_view_base::head<T>());
@@ -568,14 +747,15 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @brief Returns the underlying data casted to the specified type, plus the
    * offset.
    *
-   * @note If `offset() == 0`, then `head<T>() == data<T>()`
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
    *
-   * This pointer is undefined for columns with children.
+   * @note If `offset() == 0`, then `head<T>() == data<T>()`
    *
    * @tparam T The type to cast to
    * @return T* Typed pointer to underlying data, including the offset
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   __host__ __device__ T* data() const noexcept
   {
     return const_cast<T*>(detail::column_device_view_base::data<T>());
@@ -586,15 +766,31 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    *
    * This function accounts for the offset.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false. Specializations of this function may exist for types `T` where
+   *`is_rep_layout_compatible<T>` is false.
+   *
+   *
    * @tparam T The element type
    * @param element_index Position of the desired element
    */
-  template <typename T>
-  __device__ T& element(size_type element_index) noexcept
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
+  __device__ T& element(size_type element_index) const noexcept
   {
     return data<T>()[element_index];
   }
 
+  /**
+   * @brief For a given `T`, indicates if `mutable_column_device_view::element<T>()` has a valid
+   * overload.
+   *
+   */
+  template <typename T>
+  static constexpr bool has_element_accessor()
+  {
+    return has_element_accessor_impl<mutable_column_device_view, T>::value;
+  }
+
   /**
    * @brief Returns raw pointer to the underlying bitmask allocation.
    *
@@ -618,11 +814,14 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @brief Return first element (accounting for offset) after underlying data
    * is casted to the specified type.
    *
+   * This function does not participate in overload resolution if
+   * `mutable_column_device_view::has_element_accessor<T>()` is false.
+   *
    * @tparam T The desired type
    * @return T* Pointer to the first element after casting
    */
-  template <typename T>
-  std::enable_if_t<is_fixed_width<T>(), iterator<T>> begin()
+  template <typename T, CUDF_ENABLE_IF(mutable_column_device_view::has_element_accessor<T>())>
+  iterator<T> begin()
   {
     return iterator<T>{count_it{0}, detail::mutable_value_accessor<T>{*this}};
   }
@@ -631,11 +830,14 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @brief Return one past the last element after underlying data is casted to
    * the specified type.
    *
+   * This function does not participate in overload resolution if
+   * `mutable_column_device_view::has_element_accessor<T>()` is false.
+   *
    * @tparam T The desired type
    * @return T const* Pointer to one past the last element after casting
    */
-  template <typename T>
-  std::enable_if_t<is_fixed_width<T>(), iterator<T>> end()
+  template <typename T, CUDF_ENABLE_IF(mutable_column_device_view::has_element_accessor<T>())>
+  iterator<T> end()
   {
     return iterator<T>{count_it{size()}, detail::mutable_value_accessor<T>{*this}};
   }
@@ -740,121 +942,6 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
   mutable_column_device_view(mutable_column_view source);
 };
 
-/**
- * @brief Returns `string_view` to the string element at the specified index.
- *
- * If the element at the specified index is NULL, i.e., `is_null(element_index)
- * == true`, then any attempt to use the result will lead to undefined behavior.
- *
- * This function accounts for the offset.
- *
- * @param element_index Position of the desired string element
- * @return string_view instance representing this element at this index
- */
-template <>
-__device__ inline string_view const column_device_view::element<string_view>(
-  size_type element_index) const noexcept
-{
-  size_type index          = element_index + offset();  // account for this view's _offset
-  const int32_t* d_offsets = d_children[strings_column_view::offsets_column_index].data<int32_t>();
-  const char* d_strings    = d_children[strings_column_view::chars_column_index].data<char>();
-  size_type offset         = d_offsets[index];
-  return string_view{d_strings + offset, d_offsets[index + 1] - offset};
-}
-
-/**
- * @brief Dispatch functor for resolving the index value for a dictionary element.
- *
- * The basic dictionary elements are the indices which can be any index type.
- */
-struct index_element_fn {
-  template <
-    typename IndexType,
-    std::enable_if_t<is_index_type<IndexType>() and std::is_unsigned<IndexType>::value>* = nullptr>
-  __device__ size_type operator()(column_device_view const& input, size_type index)
-  {
-    return static_cast<size_type>(input.element<IndexType>(index));
-  }
-  template <typename IndexType,
-            typename... Args,
-            std::enable_if_t<not(is_index_type<IndexType>() and
-                                 std::is_unsigned<IndexType>::value)>* = nullptr>
-  __device__ size_type operator()(Args&&... args)
-  {
-    cudf_assert(false and "dictionary indices must be an unsigned integral type");
-    return 0;
-  }
-};
-
-/**
- * @brief Returns `dictionary32` element at the specified index for a
- * dictionary column.
- *
- * `dictionary32` is a strongly typed wrapper around an `int32_t` value that holds the
- * offset into the dictionary keys for the specified element.
- *
- * For example, given a dictionary column `d` with:
- * ```c++
- * keys: {"foo", "bar", "baz"}
- * indices: {2, 0, 2, 1, 0}
- *
- * d.element<dictionary32>(0) == dictionary32{2};
- * d.element<dictionary32>(1) == dictionary32{0};
- * ```
- *
- * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
- * then any attempt to use the result will lead to undefined behavior.
- *
- * This function accounts for the offset.
- *
- * @param element_index Position of the desired element
- * @return dictionary32 instance representing this element at this index
- */
-template <>
-__device__ inline dictionary32 const column_device_view::element<dictionary32>(
-  size_type element_index) const noexcept
-{
-  size_type index    = element_index + offset();  // account for this view's _offset
-  auto const indices = d_children[0];
-  return dictionary32{type_dispatcher(indices.type(), index_element_fn{}, indices, index)};
-}
-
-/**
- * @brief Returns a `numeric::decimal32` element at the specified index for a `fixed_point` column.
- *
- * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
- * then any attempt to use the result will lead to undefined behavior.
- *
- * @param element_index Position of the desired element
- * @return numeric::decimal32 representing the element at this index
- */
-template <>
-__device__ inline numeric::decimal32 const column_device_view::element<numeric::decimal32>(
-  size_type element_index) const noexcept
-{
-  using namespace numeric;
-  auto const scale = scale_type{_type.scale()};
-  return decimal32{scaled_integer<int32_t>{data<int32_t>()[element_index], scale}};
-}
-
-/**
- * @brief Returns a `numeric::decimal64` element at the specified index for a `fixed_point` column.
- *
- * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`,
- * then any attempt to use the result will lead to undefined behavior.
- *
- * @param element_index Position of the desired element
- * @return numeric::decimal64 representing the element at this index
- */
-template <>
-__device__ inline numeric::decimal64 const column_device_view::element<numeric::decimal64>(
-  size_type element_index) const noexcept
-{
-  using namespace numeric;
-  auto const scale = scale_type{_type.scale()};
-  return decimal64{scaled_integer<int64_t>{data<int64_t>()[element_index], scale}};
-}
-
 namespace detail {
 
 #ifdef __CUDACC__  // because set_bit in bit.hpp is wrapped with __CUDACC__
@@ -896,7 +983,6 @@ __device__ inline bitmask_type get_mask_offset_word(bitmask_type const* __restri
  *
  * @tparam T The type of elements in the column
  */
-
 template <typename T>
 struct value_accessor {
   column_device_view const col;  ///< column view of column in device
@@ -1023,8 +1109,8 @@ struct mutable_value_accessor {
 };
 
 /**
- * @brief Helper function for use by column_device_view and mutable_column_device_view constructors
- * to build device_views from views.
+ * @brief Helper function for use by column_device_view and mutable_column_device_view
+ * constructors to build device_views from views.
  *
  * It is used to build the array of child columns in device memory. Since child columns can
  * also have child columns, this uses recursion to build up the flat device buffer to contain
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 168db61f672..82326a21d7d 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -16,6 +16,8 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/traits.hpp>
+
 #include <vector>
 
 /**
@@ -55,10 +57,14 @@ class column_view_base {
    *a column, and instead, accessing the elements should be done via
    *`data<T>()`.
    *
+   * This function will only participate in overload resolution if `is_rep_layout_compatible<T>()`
+   * or `std::is_same<T,void>::value` is true.
+   *
    * @tparam The type to cast to
    * @return T const* Typed pointer to underlying data
    */
-  template <typename T = void>
+  template <typename T = void,
+            CUDF_ENABLE_IF(std::is_same<T, void>::value or is_rep_layout_compatible<T>())>
   T const* head() const noexcept
   {
     return static_cast<T const*>(_data);
@@ -70,12 +76,13 @@ class column_view_base {
    *
    * @note If `offset() == 0`, then `head<T>() == data<T>()`
    *
-   * @TODO Clarify behavior for variable-width types.
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
    *
    * @tparam T The type to cast to
    * @return T const* Typed pointer to underlying data, including the offset
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   T const* data() const noexcept
   {
     return head<T>() + _offset;
@@ -85,10 +92,13 @@ class column_view_base {
    * @brief Return first element (accounting for offset) after underlying data
    * is casted to the specified type.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
+   *
    * @tparam T The desired type
    * @return T const* Pointer to the first element after casting
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   T const* begin() const noexcept
   {
     return data<T>();
@@ -98,10 +108,13 @@ class column_view_base {
    * @brief Return one past the last element after underlying data is casted to
    * the specified type.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
+   *
    * @tparam T The desired type
    * @return T const* Pointer to one past the last element after casting
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   T const* end() const noexcept
   {
     return begin<T>() + size();
@@ -438,6 +451,9 @@ class mutable_column_view : public detail::column_view_base {
    * @brief Returns pointer to the base device memory allocation casted to
    * the specified type.
    *
+   * This function will only participate in overload resolution if `is_rep_layout_compatible<T>()`
+   * or `std::is_same<T,void>::value` is true.
+   *
    * @note If `offset() == 0`, then `head<T>() == data<T>()`
    *
    * @note It should be rare to need to access the `head<T>()` allocation of a
@@ -446,7 +462,8 @@ class mutable_column_view : public detail::column_view_base {
    * @tparam The type to cast to
    * @return T* Typed pointer to underlying data
    */
-  template <typename T = void>
+  template <typename T = void,
+            CUDF_ENABLE_IF(std::is_same<T, void>::value or is_rep_layout_compatible<T>())>
   T* head() const noexcept
   {
     return const_cast<T*>(detail::column_view_base::head<T>());
@@ -456,14 +473,15 @@ class mutable_column_view : public detail::column_view_base {
    * @brief Returns the underlying data casted to the specified type, plus the
    * offset.
    *
-   * @note If `offset() == 0`, then `head<T>() == data<T>()`
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
    *
-   * @TODO Clarify behavior for variable-width types.
+   * @note If `offset() == 0`, then `head<T>() == data<T>()`
    *
    * @tparam T The type to cast to
    * @return T* Typed pointer to underlying data, including the offset
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   T* data() const noexcept
   {
     return const_cast<T*>(detail::column_view_base::data<T>());
@@ -473,10 +491,13 @@ class mutable_column_view : public detail::column_view_base {
    * @brief Return first element (accounting for offset) when underlying data is
    * casted to the specified type.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
+   *
    * @tparam T The desired type
    * @return T* Pointer to the first element after casting
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   T* begin() const noexcept
   {
     return const_cast<T*>(detail::column_view_base::begin<T>());
@@ -486,10 +507,13 @@ class mutable_column_view : public detail::column_view_base {
    * @brief Return one past the last element after underlying data is casted to
    * the specified type.
    *
+   * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
+   * false.
+   *
    * @tparam T The desired type
    * @return T* Pointer to one past the last element after casting
    */
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   T* end() const noexcept
   {
     return const_cast<T*>(detail::column_view_base::end<T>());
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 73647ac2292..bf488621d52 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -139,6 +139,46 @@ void gather_helper(InputItr source_itr,
   }
 }
 
+// Error case when no other overload or specialization is available
+template <typename Element, typename Enable = void>
+struct column_gatherer_impl {
+  std::unique_ptr<column> operator()(...) { CUDF_FAIL("Unsupported type in gather."); }
+};
+
+/**
+ * @brief Function object for gathering a type-erased
+ * column. To be used with the cudf::type_dispatcher.
+ */
+struct column_gatherer {
+  /**
+   * @brief Type-dispatched function to gather from one column to another based
+   * on a `gather_map`.
+   *
+   * @tparam Element Dispatched type for the column being gathered
+   * @tparam MapIterator Iterator type for the gather map
+   * @param source_column View into the column to gather from
+   * @param gather_map_begin Beginning of iterator range of integral values representing the gather
+   * map
+   * @param gather_map_end End of iterator range of integral values representing the gather map
+   * @param nullify_out_of_bounds Nullify values in `gather_map` that are out of bounds
+   * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource used to allocate the returned column's device memory
+   */
+  template <typename Element, typename MapIterator>
+  std::unique_ptr<column> operator()(column_view const& source_column,
+                                     MapIterator gather_map_begin,
+                                     MapIterator gather_map_end,
+                                     bool nullify_out_of_bounds,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    column_gatherer_impl<Element> gatherer{};
+
+    return gatherer(
+      source_column, gather_map_begin, gather_map_end, nullify_out_of_bounds, stream, mr);
+  }
+};
+
 /**
  * @brief Function object for gathering a type-erased column.
  *
@@ -148,8 +188,8 @@ void gather_helper(InputItr source_itr,
  * @tparam Element Dispatched type for the column being gathered
  * @tparam MapIterator Iterator type for the gather map
  */
-template <typename Element, typename MapIterator>
-struct column_gatherer_impl {
+template <typename Element>
+struct column_gatherer_impl<Element, std::enable_if_t<is_rep_layout_compatible<Element>()>> {
   /**
    * @brief Type-dispatched function to gather from one column to another based
    * on a `gather_map`.
@@ -164,6 +204,7 @@ struct column_gatherer_impl {
    * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned column's device memory
    */
+  template <typename MapIterator>
   std::unique_ptr<column> operator()(column_view const& source_column,
                                      MapIterator gather_map_begin,
                                      MapIterator gather_map_end,
@@ -195,8 +236,8 @@ struct column_gatherer_impl {
  *
  * @tparam MapIterator Iterator type for the gather map
  */
-template <typename MapItType>
-struct column_gatherer_impl<string_view, MapItType> {
+template <>
+struct column_gatherer_impl<string_view> {
   /**
    * @brief Type-dispatched function to gather from one column to another based
    * on a `gather_map`. This handles string_view type column_views only.
@@ -209,6 +250,7 @@ struct column_gatherer_impl<string_view, MapItType> {
    * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned column's device memory
    */
+  template <typename MapItType>
   std::unique_ptr<column> operator()(column_view const& source_column,
                                      MapItType gather_map_begin,
                                      MapItType gather_map_end,
@@ -234,8 +276,8 @@ struct column_gatherer_impl<string_view, MapItType> {
  * This functor is invoked only on the root column of a hierarchy of list
  * columns. Recursion is handled internally.
  */
-template <typename MapItRoot>
-struct column_gatherer_impl<list_view, MapItRoot> {
+template <>
+struct column_gatherer_impl<list_view> {
   /**
    * @brief Gather a list column from a hierarchy of list columns.
    *
@@ -282,6 +324,7 @@ struct column_gatherer_impl<list_view, MapItRoot> {
    * @returns column with elements gathered based on the gather map
    *
    */
+  template <typename MapItRoot>
   std::unique_ptr<column> operator()(column_view const& column,
                                      MapItRoot gather_map_begin,
                                      MapItRoot gather_map_end,
@@ -326,45 +369,11 @@ struct column_gatherer_impl<list_view, MapItRoot> {
   }
 };
 
-/**
- * @brief Function object for gathering a type-erased
- * column. To be used with the cudf::type_dispatcher.
- */
-struct column_gatherer {
-  /**
-   * @brief Type-dispatched function to gather from one column to another based
-   * on a `gather_map`.
-   *
-   * @tparam Element Dispatched type for the column being gathered
-   * @tparam MapIterator Iterator type for the gather map
-   * @param source_column View into the column to gather from
-   * @param gather_map_begin Beginning of iterator range of integral values representing the gather
-   * map
-   * @param gather_map_end End of iterator range of integral values representing the gather map
-   * @param nullify_out_of_bounds Nullify values in `gather_map` that are out of bounds
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   * @param mr Device memory resource used to allocate the returned column's device memory
-   */
-  template <typename Element, typename MapIterator>
-  std::unique_ptr<column> operator()(column_view const& source_column,
-                                     MapIterator gather_map_begin,
-                                     MapIterator gather_map_end,
-                                     bool nullify_out_of_bounds,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
-  {
-    column_gatherer_impl<Element, MapIterator> gatherer{};
-
-    return gatherer(
-      source_column, gather_map_begin, gather_map_end, nullify_out_of_bounds, stream, mr);
-  }
-};
-
 /**
  * @brief Column gather specialization for dictionary column type.
  */
-template <typename MapItType>
-struct column_gatherer_impl<dictionary32, MapItType> {
+template <>
+struct column_gatherer_impl<dictionary32> {
   /**
    * @brief Type-dispatched function to gather from one column to another based
    * on a `gather_map`.
@@ -378,6 +387,7 @@ struct column_gatherer_impl<dictionary32, MapItType> {
    * @param mr Device memory resource used to allocate the returned column's device memory
    * @return New dictionary column with gathered rows.
    */
+  template <typename MapItType>
   std::unique_ptr<column> operator()(column_view const& source_column,
                                      MapItType gather_map_begin,
                                      MapItType gather_map_end,
@@ -426,6 +436,56 @@ struct column_gatherer_impl<dictionary32, MapItType> {
   }
 };
 
+template <>
+struct column_gatherer_impl<struct_view> {
+  template <typename MapItRoot>
+  std::unique_ptr<column> operator()(column_view const& column,
+                                     MapItRoot gather_map_begin,
+                                     MapItRoot gather_map_end,
+                                     bool nullify_out_of_bounds,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    structs_column_view structs_column(column);
+    auto gather_map_size{std::distance(gather_map_begin, gather_map_end)};
+    if (gather_map_size == 0) { return empty_like(column); }
+
+    std::vector<std::unique_ptr<cudf::column>> output_struct_members;
+    std::transform(structs_column.child_begin(),
+                   structs_column.child_end(),
+                   std::back_inserter(output_struct_members),
+                   [&gather_map_begin, &gather_map_end, nullify_out_of_bounds, stream, mr](
+                     cudf::column_view const& col) {
+                     return cudf::type_dispatcher<dispatch_storage_type>(col.type(),
+                                                                         column_gatherer{},
+                                                                         col,
+                                                                         gather_map_begin,
+                                                                         gather_map_end,
+                                                                         nullify_out_of_bounds,
+                                                                         stream,
+                                                                         mr);
+                   });
+
+    gather_bitmask(
+      // Table view of struct column.
+      cudf::table_view{
+        std::vector<cudf::column_view>{structs_column.child_begin(), structs_column.child_end()}},
+      gather_map_begin,
+      output_struct_members,
+      nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK,
+      stream,
+      mr);
+
+    return cudf::make_structs_column(
+      gather_map_size,
+      std::move(output_struct_members),
+      0,
+      rmm::device_buffer{0, stream, mr},  // Null mask will be fixed up in cudf::gather().
+      stream,
+      mr);
+  }
+};
+
 /**
  * @brief Function object for applying a transformation on the gathermap
  * that converts negative indices to positive indices
@@ -538,55 +598,6 @@ void gather_bitmask(table_view const& source,
   }
 }
 
-template <typename MapItRoot>
-struct column_gatherer_impl<struct_view, MapItRoot> {
-  std::unique_ptr<column> operator()(column_view const& column,
-                                     MapItRoot gather_map_begin,
-                                     MapItRoot gather_map_end,
-                                     bool nullify_out_of_bounds,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
-  {
-    structs_column_view structs_column(column);
-    auto gather_map_size{std::distance(gather_map_begin, gather_map_end)};
-    if (gather_map_size == 0) { return empty_like(column); }
-
-    std::vector<std::unique_ptr<cudf::column>> output_struct_members;
-    std::transform(structs_column.child_begin(),
-                   structs_column.child_end(),
-                   std::back_inserter(output_struct_members),
-                   [&gather_map_begin, &gather_map_end, nullify_out_of_bounds, stream, mr](
-                     cudf::column_view const& col) {
-                     return cudf::type_dispatcher<dispatch_storage_type>(col.type(),
-                                                                         column_gatherer{},
-                                                                         col,
-                                                                         gather_map_begin,
-                                                                         gather_map_end,
-                                                                         nullify_out_of_bounds,
-                                                                         stream,
-                                                                         mr);
-                   });
-
-    gather_bitmask(
-      // Table view of struct column.
-      cudf::table_view{
-        std::vector<cudf::column_view>{structs_column.child_begin(), structs_column.child_end()}},
-      gather_map_begin,
-      output_struct_members,
-      nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK,
-      stream,
-      mr);
-
-    return cudf::make_structs_column(
-      gather_map_size,
-      std::move(output_struct_members),
-      0,
-      rmm::device_buffer{0, stream, mr},  // Null mask will be fixed up in cudf::gather().
-      stream,
-      mr);
-  }
-};
-
 /**
  * @brief Gathers the specified rows of a set of columns according to a gather map.
  *
diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
index 2cb1cbffc68..30764b9b89f 100644
--- a/cpp/include/cudf/detail/scatter.cuh
+++ b/cpp/include/cudf/detail/scatter.cuh
@@ -79,8 +79,14 @@ auto scatter_to_gather(MapIterator scatter_map_begin,
   return gather_map;
 }
 
-template <typename Element, typename MapIterator>
+template <typename Element, typename Enable = void>
 struct column_scatterer_impl {
+  std::unique_ptr<column> operator()(...) const { CUDF_FAIL("Unsupported type for scatter."); }
+};
+
+template <typename Element>
+struct column_scatterer_impl<Element, std::enable_if_t<cudf::is_fixed_width<Element>()>> {
+  template <typename MapIterator>
   std::unique_ptr<column> operator()(column_view const& source,
                                      MapIterator scatter_map_begin,
                                      MapIterator scatter_map_end,
@@ -103,8 +109,9 @@ struct column_scatterer_impl {
   }
 };
 
-template <typename MapIterator>
-struct column_scatterer_impl<string_view, MapIterator> {
+template <>
+struct column_scatterer_impl<string_view> {
+  template <typename MapIterator>
   std::unique_ptr<column> operator()(column_view const& source,
                                      MapIterator scatter_map_begin,
                                      MapIterator scatter_map_end,
@@ -119,8 +126,9 @@ struct column_scatterer_impl<string_view, MapIterator> {
   }
 };
 
-template <typename MapIterator>
-struct column_scatterer_impl<list_view, MapIterator> {
+template <>
+struct column_scatterer_impl<list_view> {
+  template <typename MapIterator>
   std::unique_ptr<column> operator()(column_view const& source,
                                      MapIterator scatter_map_begin,
                                      MapIterator scatter_map_end,
@@ -133,23 +141,9 @@ struct column_scatterer_impl<list_view, MapIterator> {
   }
 };
 
-template <typename MapIterator>
-struct column_scatterer {
-  template <typename Element>
-  std::unique_ptr<column> operator()(column_view const& source,
-                                     MapIterator scatter_map_begin,
-                                     MapIterator scatter_map_end,
-                                     column_view const& target,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
-  {
-    column_scatterer_impl<Element, MapIterator> scatterer{};
-    return scatterer(source, scatter_map_begin, scatter_map_end, target, stream, mr);
-  }
-};
-
-template <typename MapIterator>
-struct column_scatterer_impl<dictionary32, MapIterator> {
+template <>
+struct column_scatterer_impl<dictionary32> {
+  template <typename MapIterator>
   std::unique_ptr<column> operator()(column_view const& source_in,
                                      MapIterator scatter_map_begin,
                                      MapIterator scatter_map_end,
@@ -206,6 +200,20 @@ struct column_scatterer_impl<dictionary32, MapIterator> {
   }
 };
 
+struct column_scatterer {
+  template <typename Element, typename MapIterator>
+  std::unique_ptr<column> operator()(column_view const& source,
+                                     MapIterator scatter_map_begin,
+                                     MapIterator scatter_map_end,
+                                     column_view const& target,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr) const
+  {
+    column_scatterer_impl<Element> scatterer{};
+    return scatterer(source, scatter_map_begin, scatter_map_end, target, stream, mr);
+  }
+};
+
 /**
  * @brief Scatters the rows of the source table into a copy of the target table
  * according to a scatter map.
@@ -276,15 +284,13 @@ std::unique_ptr<table> scatter(
 
   auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
 
-  auto scatter_functor = column_scatterer<decltype(updated_scatter_map_begin)>{};
-
   std::transform(source.begin(),
                  source.end(),
                  target.begin(),
                  result.begin(),
                  [=](auto const& source_col, auto const& target_col) {
                    return type_dispatcher<dispatch_storage_type>(source_col.type(),
-                                                                 scatter_functor,
+                                                                 column_scatterer{},
                                                                  source_col,
                                                                  updated_scatter_map_begin,
                                                                  updated_scatter_map_end,
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index 5dd3db1117c..8e2ecdf49a7 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -577,15 +577,15 @@ struct list_child_constructor {
     auto child_offsets = cudf::strings::detail::make_offsets_child_column(
       begin, begin + child_list_views.size(), stream, mr);
 
-    auto child_column =
-      cudf::type_dispatcher(source_lists_column_view.child().child(1).type(),
-                            list_child_constructor{},
-                            child_list_views,
-                            child_offsets->view(),
-                            cudf::lists_column_view(source_lists_column_view.child()),
-                            cudf::lists_column_view(target_lists_column_view.child()),
-                            stream,
-                            mr);
+    auto child_column = cudf::type_dispatcher<dispatch_storage_type>(
+      source_lists_column_view.child().child(1).type(),
+      list_child_constructor{},
+      child_list_views,
+      child_offsets->view(),
+      cudf::lists_column_view(source_lists_column_view.child()),
+      cudf::lists_column_view(target_lists_column_view.child()),
+      stream,
+      mr);
 
     auto child_null_mask =
       source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable()
@@ -672,7 +672,7 @@ struct list_child_constructor {
       iter_target_member_as_list,
       std::back_inserter(child_columns),
       [&](auto source_struct_member_as_list, auto target_struct_member_as_list) {
-        return cudf::type_dispatcher(
+        return cudf::type_dispatcher<dispatch_storage_type>(
           source_struct_member_as_list->child(cudf::lists_column_view::child_column_index).type(),
           list_child_constructor{},
           list_vector,
@@ -780,14 +780,14 @@ std::unique_ptr<column> scatter(
   auto offsets_column = cudf::strings::detail::make_offsets_child_column(
     list_size_begin, list_size_begin + target.size(), stream, mr);
 
-  auto child_column = cudf::type_dispatcher(child_column_type,
-                                            list_child_constructor{},
-                                            target_vector,
-                                            offsets_column->view(),
-                                            source_lists_column_view,
-                                            target_lists_column_view,
-                                            stream,
-                                            mr);
+  auto child_column = cudf::type_dispatcher<dispatch_storage_type>(child_column_type,
+                                                                   list_child_constructor{},
+                                                                   target_vector,
+                                                                   offsets_column->view(),
+                                                                   source_lists_column_view,
+                                                                   target_lists_column_view,
+                                                                   stream,
+                                                                   mr);
 
   auto null_mask =
     target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr};
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 5af3c29a3d9..decd2879f54 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -28,6 +28,8 @@
 #include <thrust/swap.h>
 #include <thrust/transform_reduce.h>
 
+#include <limits>
+
 namespace cudf {
 
 /**
@@ -407,39 +409,47 @@ class row_lexicographic_comparator {
 template <template <typename> class hash_function, bool has_nulls = true>
 class element_hasher {
  public:
-  template <typename T>
-  __device__ inline hash_value_type operator()(column_device_view col, size_type row_index)
+  template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+  __device__ hash_value_type operator()(column_device_view col, size_type row_index) const
   {
     if (has_nulls && col.is_null(row_index)) { return std::numeric_limits<hash_value_type>::max(); }
-
     return hash_function<T>{}(col.element<T>(row_index));
   }
+
+  template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+  __device__ hash_value_type operator()(column_device_view col, size_type row_index) const
+  {
+    cudf_assert(false && "Unsupported type in hash.");
+    return {};
+  }
 };
 
 template <template <typename> class hash_function, bool has_nulls = true>
 class element_hasher_with_seed {
  public:
-  __device__ element_hasher_with_seed()
-    : _seed{0}, _null_hash(std::numeric_limits<hash_value_type>::max())
-  {
-  }
-  __device__ element_hasher_with_seed(
-    uint32_t seed = 0, hash_value_type null_hash = std::numeric_limits<hash_value_type>::max())
+  element_hasher_with_seed() = default;
+  __device__ element_hasher_with_seed(uint32_t seed, hash_value_type null_hash)
     : _seed{seed}, _null_hash(null_hash)
   {
   }
-  // seed, null_hash, byte endianness
-  template <typename T>
-  __device__ inline hash_value_type operator()(column_device_view col, size_type row_index)
+
+  template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+  __device__ hash_value_type operator()(column_device_view col, size_type row_index) const
   {
     if (has_nulls && col.is_null(row_index)) { return _null_hash; }
-
     return hash_function<T>{_seed}(col.element<T>(row_index));
   }
 
+  template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+  __device__ hash_value_type operator()(column_device_view col, size_type row_index) const
+  {
+    cudf_assert(false && "Unsupported type in hash.");
+    return {};
+  }
+
  private:
-  uint32_t _seed;
-  hash_value_type _null_hash;
+  uint32_t _seed{0};
+  hash_value_type _null_hash{std::numeric_limits<hash_value_type>::max()};
 };
 
 /**
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 1e0d45d081d..aa5f554ad40 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -34,6 +34,19 @@ namespace cudf {
 template <typename...>
 using void_t = void;
 
+/**
+ * @brief Convenience macro for SFINAE as an unnamed template parameter.
+ *
+ * Example:
+ * \code{cpp}
+ * // This function will participate in overload resolution only if T is an integral type
+ * template <typename T, CUDF_ENABLE_IF(std::is_integral<T>::value)>
+ * void foo();
+ * \endcode
+ *
+ */
+#define CUDF_ENABLE_IF(...) std::enable_if_t<(__VA_ARGS__)>* = nullptr
+
 template <typename L, typename R, typename = void>
 struct is_relationally_comparable_impl : std::false_type {
 };
@@ -443,6 +456,23 @@ constexpr inline bool is_chrono(data_type type)
   return cudf::type_dispatcher(type, is_chrono_impl{});
 }
 
+/**
+ * @brief Indicates whether `T` is layout compatible with its "representation" type.
+ *
+ * For example, in a column, a `decimal32` is concretely represented by a single `int32_t`, but the
+ * `decimal32` type itself contains both the integer representation and the scale. Therefore,
+ * `decimal32` is _not_ layout compatible with `int32_t`.
+ *
+ * As a further example, `duration_ns` is distinct from its concrete `int64_t` representation type,
+ * but they are layout compatible.
+ *
+ */
+template <typename T>
+constexpr bool is_rep_layout_compatible()
+{
+  return cudf::is_numeric<T>() or cudf::is_chrono<T>() or cudf::is_boolean<T>();
+}
+
 /**
  * @brief Indicates whether the type `T` is a dictionary type.
  *
diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
index 422fc0821a0..e6adc027acc 100644
--- a/cpp/src/copying/copy.cu
+++ b/cpp/src/copying/copy.cu
@@ -22,16 +22,21 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/strings/string_view.cuh>
 
+#include <cudf/utilities/traits.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
 namespace detail {
 namespace {
-/**
- * @brief Specialization of copy_if_else_functor for string_views.
- */
-template <typename T, typename Left, typename Right, typename Filter>
+
+template <typename T, typename Enable = void>
 struct copy_if_else_functor_impl {
+  std::unique_ptr<column> operator()(...) { CUDF_FAIL("Unsupported type for copy_if_else."); }
+};
+
+template <typename T>
+struct copy_if_else_functor_impl<T, std::enable_if_t<is_rep_layout_compatible<T>()>> {
+  template <typename Left, typename Right, typename Filter>
   std::unique_ptr<column> operator()(Left const& lhs,
                                      Right const& rhs,
                                      size_type size,
@@ -69,8 +74,9 @@ struct copy_if_else_functor_impl {
 /**
  * @brief Specialization of copy_if_else_functor for string_views.
  */
-template <typename Left, typename Right, typename Filter>
-struct copy_if_else_functor_impl<string_view, Left, Right, Filter> {
+template <>
+struct copy_if_else_functor_impl<string_view> {
+  template <typename Left, typename Right, typename Filter>
   std::unique_ptr<column> operator()(Left const& lhs,
                                      Right const& rhs,
                                      size_type size,
@@ -107,8 +113,9 @@ struct copy_if_else_functor_impl<string_view, Left, Right, Filter> {
 /**
  * @brief Specialization of copy_if_else_functor for list_views.
  */
-template <typename Left, typename Right, typename Filter>
-struct copy_if_else_functor_impl<list_view, Left, Right, Filter> {
+template <>
+struct copy_if_else_functor_impl<list_view> {
+  template <typename Left, typename Right, typename Filter>
   std::unique_ptr<column> operator()(Left const& lhs,
                                      Right const& rhs,
                                      size_type size,
@@ -122,8 +129,9 @@ struct copy_if_else_functor_impl<list_view, Left, Right, Filter> {
   }
 };
 
-template <typename Left, typename Right, typename Filter>
-struct copy_if_else_functor_impl<struct_view, Left, Right, Filter> {
+template <>
+struct copy_if_else_functor_impl<struct_view> {
+  template <typename Left, typename Right, typename Filter>
   std::unique_ptr<column> operator()(Left const& lhs,
                                      Right const& rhs,
                                      size_type size,
@@ -152,7 +160,7 @@ struct copy_if_else_functor {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
-    copy_if_else_functor_impl<T, Left, Right, Filter> copier{};
+    copy_if_else_functor_impl<T> copier{};
     return copier(lhs, rhs, size, left_nullable, right_nullable, filter, stream, mr);
   }
 };
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index 31a8796f950..f4ce9ea27ac 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -70,22 +70,19 @@ struct in_place_copy_range_dispatch {
   cudf::column_view const& source;
   cudf::mutable_column_view& target;
 
-  template <typename T>
-  std::enable_if_t<cudf::is_fixed_width<T>(), void> operator()(cudf::size_type source_begin,
-                                                               cudf::size_type source_end,
-                                                               cudf::size_type target_begin,
-                                                               rmm::cuda_stream_view stream)
+  template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
+  void operator()(cudf::size_type source_begin,
+                  cudf::size_type source_end,
+                  cudf::size_type target_begin,
+                  rmm::cuda_stream_view stream)
   {
     in_place_copy_range<T>(source, target, source_begin, source_end, target_begin, stream);
   }
 
-  template <typename T>
-  std::enable_if_t<not cudf::is_fixed_width<T>(), void> operator()(cudf::size_type source_begin,
-                                                                   cudf::size_type source_end,
-                                                                   cudf::size_type target_begin,
-                                                                   rmm::cuda_stream_view stream)
+  template <typename T, typename... Args>
+  void operator()(Args&&...)
   {
-    CUDF_FAIL("in-place copy does not work for variable width types.");
+    CUDF_FAIL("Unsupported type for in-place copy.");
   }
 };
 
@@ -93,7 +90,18 @@ struct out_of_place_copy_range_dispatch {
   cudf::column_view const& source;
   cudf::column_view const& target;
 
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<T>())>
+  std::unique_ptr<cudf::column> operator()(
+    cudf::size_type source_begin,
+    cudf::size_type source_end,
+    cudf::size_type target_begin,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  {
+    CUDF_FAIL("Unsupported type for out of place copy.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
   std::unique_ptr<cudf::column> operator()(
     cudf::size_type source_begin,
     cudf::size_type source_end,
@@ -237,12 +245,12 @@ void copy_range_in_place(column_view const& source,
                "target should be nullable if source has null values.");
 
   if (source_end != source_begin) {  // otherwise no-op
-    cudf::type_dispatcher(target.type(),
-                          in_place_copy_range_dispatch{source, target},
-                          source_begin,
-                          source_end,
-                          target_begin,
-                          stream);
+    cudf::type_dispatcher<dispatch_storage_type>(target.type(),
+                                                 in_place_copy_range_dispatch{source, target},
+                                                 source_begin,
+                                                 source_end,
+                                                 target_begin,
+                                                 stream);
   }
 }
 
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index e7ae51ded9e..9fc64cd15a4 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -94,7 +94,17 @@ struct out_of_place_fill_range_dispatch {
   cudf::scalar const& value;
   cudf::column_view const& input;
 
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<T>())>
+  std::unique_ptr<cudf::column> operator()(
+    cudf::size_type begin,
+    cudf::size_type end,
+    rmm::cuda_stream_view stream,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  {
+    CUDF_FAIL("Unsupported type in fill.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
   std::unique_ptr<cudf::column> operator()(
     cudf::size_type begin,
     cudf::size_type end,
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 84dd41907d2..571c695e66e 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -25,11 +25,18 @@
 #include <dlpack/dlpack.h>
 
 #include <algorithm>
+#include <cudf/utilities/traits.hpp>
 
 namespace cudf {
 namespace {
 struct get_column_data_impl {
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  void const* operator()(column_view const& col)
+  {
+    CUDF_FAIL("Unsupported type to convert to dlpack.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   void const* operator()(column_view const& col)
   {
     return col.data<T>();
diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp
index 612e2111b66..99c9b386a15 100644
--- a/cpp/src/interop/from_arrow.cpp
+++ b/cpp/src/interop/from_arrow.cpp
@@ -111,7 +111,17 @@ struct dispatch_to_cudf_column {
     return mask;
   }
 
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  std::unique_ptr<column> operator()(arrow::Array const& array,
+                                     data_type type,
+                                     bool skip_mask,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    CUDF_FAIL("Unsupported type in from_arrow.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   std::unique_ptr<column> operator()(arrow::Array const& array,
                                      data_type type,
                                      bool skip_mask,
@@ -124,7 +134,7 @@ struct dispatch_to_cudf_column {
     auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
     auto mutable_column_view = col->mutable_view();
     CUDA_TRY(cudaMemcpyAsync(
-      mutable_column_view.data<void*>(),
+      mutable_column_view.data<T>(),
       reinterpret_cast<const uint8_t*>(data_buffer->address()) + array.offset() * sizeof(T),
       sizeof(T) * num_rows,
       cudaMemcpyDefault,
diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp
index 7daffc1a3c3..4bc50b21718 100644
--- a/cpp/src/interop/to_arrow.cpp
+++ b/cpp/src/interop/to_arrow.cpp
@@ -113,7 +113,14 @@ struct dispatch_to_arrow {
     return child_arrays;
   }
 
-  template <typename T>
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  std::shared_ptr<arrow::Array> operator()(
+    column_view, cudf::type_id, column_metadata const&, arrow::MemoryPool*, rmm::cuda_stream_view)
+  {
+    CUDF_FAIL("Unsupported type for to_arrow.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   std::shared_ptr<arrow::Array> operator()(column_view input_view,
                                            cudf::type_id id,
                                            column_metadata const& metadata,
diff --git a/cpp/src/jit/type.cpp b/cpp/src/jit/type.cpp
index 6b1e8c57c3d..e833a6fa10f 100644
--- a/cpp/src/jit/type.cpp
+++ b/cpp/src/jit/type.cpp
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <string>
 
@@ -25,15 +26,15 @@ struct get_data_ptr_functor {
   /**
    * @brief Gets the data pointer from a column_view
    */
-  template <typename T>
-  std::enable_if_t<is_fixed_width<T>(), const void*> operator()(column_view const& view)
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
+  void const* operator()(column_view const& view)
   {
     return static_cast<const void*>(view.template data<T>());
   }
 
   // TODO: both the failing operators can be combined into single template
-  template <typename T>
-  std::enable_if_t<not is_fixed_width<T>(), const void*> operator()(column_view const& view)
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  void const* operator()(column_view const& view)
   {
     CUDF_FAIL("Invalid data type for JIT operation");
   }
@@ -41,16 +42,16 @@ struct get_data_ptr_functor {
   /**
    * @brief Gets the data pointer from a scalar
    */
-  template <typename T>
-  std::enable_if_t<is_fixed_width<T>(), const void*> operator()(scalar const& s)
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
+  void const* operator()(scalar const& s)
   {
     using ScalarType = scalar_type_t<T>;
     auto s1          = static_cast<ScalarType const*>(&s);
     return static_cast<const void*>(s1->data());
   }
 
-  template <typename T>
-  std::enable_if_t<not is_fixed_width<T>(), const void*> operator()(scalar const& s)
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  void const* operator()(scalar const& s)
   {
     CUDF_FAIL("Invalid data type for JIT operation");
   }
@@ -58,12 +59,12 @@ struct get_data_ptr_functor {
 
 const void* get_data_ptr(column_view const& view)
 {
-  return type_dispatcher(view.type(), get_data_ptr_functor{}, view);
+  return type_dispatcher<dispatch_storage_type>(view.type(), get_data_ptr_functor{}, view);
 }
 
 const void* get_data_ptr(scalar const& s)
 {
-  return type_dispatcher(s.type(), get_data_ptr_functor{}, s);
+  return type_dispatcher<dispatch_storage_type>(s.type(), get_data_ptr_functor{}, s);
 }
 
 std::string get_type_name(data_type type)
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index be13405b469..24c0af12938 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -35,6 +35,7 @@
 
 #include <queue>
 #include <vector>
+#include <cudf/utilities/traits.hpp>
 
 namespace cudf {
 namespace detail {
@@ -235,10 +236,20 @@ rmm::device_vector<index_type> generate_merged_indices(
 struct column_merger {
   explicit column_merger(index_vector const& row_order) : row_order_(row_order) {}
 
+  // Fallback overload: chosen when `is_rep_layout_compatible<Element>()` is false.
+  // Arguments are intentionally unnamed because they are never read; the call always fails.
+  template <typename Element, CUDF_ENABLE_IF(not is_rep_layout_compatible<Element>())>
+  std::unique_ptr<column> operator()(
+    column_view const&,
+    column_view const&,
+    rmm::cuda_stream_view,
+    rmm::mr::device_memory_resource* = rmm::mr::get_current_device_resource()) const
+  { CUDF_FAIL("Unsupported type for merge."); }
+
   // column merger operator;
   //
-  template <typename Element>  // required: column type
-  std::unique_ptr<column> operator()(
+  template <typename Element>
+  std::enable_if_t<is_rep_layout_compatible<Element>(), std::unique_ptr<column>> operator()(
     column_view const& lcol,
     column_view const& rcol,
     rmm::cuda_stream_view stream,
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 50b1b7062d9..afc2bbb37bd 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -36,6 +36,7 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -153,7 +154,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
  *        `replace_nulls` with the appropriate data types.
  */
 struct replace_nulls_column_kernel_forwarder {
-  template <typename col_type, std::enable_if_t<cudf::is_fixed_width<col_type>()>* = nullptr>
+  template <typename col_type, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<col_type>())>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            cudf::column_view const& replacement,
                                            rmm::cuda_stream_view stream,
@@ -192,7 +193,7 @@ struct replace_nulls_column_kernel_forwarder {
     return output;
   }
 
-  template <typename col_type, std::enable_if_t<not cudf::is_fixed_width<col_type>()>* = nullptr>
+  template <typename col_type, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<col_type>())>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            cudf::column_view const& replacement,
                                            rmm::cuda_stream_view stream,
diff --git a/cpp/tests/copying/copy_tests.cu b/cpp/tests/copying/copy_tests.cu
index e2f5d1eec5c..e9249a6bd0e 100644
--- a/cpp/tests/copying/copy_tests.cu
+++ b/cpp/tests/copying/copy_tests.cu
@@ -26,6 +26,7 @@
 #include <cudf/detail/copy_if_else.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -66,7 +67,7 @@ TYPED_TEST(CopyTest, CopyIfElseTestManyNulls)
 }
 
 struct copy_if_else_tiny_grid_functor {
-  template <typename T, typename Filter, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
+  template <typename T, typename Filter, CUDF_ENABLE_IF(cudf::is_rep_layout_compatible<T>())>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& lhs,
                                            cudf::column_view const& rhs,
                                            Filter filter,
@@ -91,7 +92,7 @@ struct copy_if_else_tiny_grid_functor {
     return out;
   }
 
-  template <typename T, typename Filter, std::enable_if_t<not cudf::is_fixed_width<T>()>* = nullptr>
+  template <typename T, typename Filter, CUDF_ENABLE_IF(not cudf::is_rep_layout_compatible<T>())>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& lhs,
                                            cudf::column_view const& rhs,
                                            Filter filter,

From fdcdb969614451ad895bd8818e848a8d98065824 Mon Sep 17 00:00:00 2001
From: ChrisJar <chris.jarrett.0@gmail.com>
Date: Mon, 29 Mar 2021 20:38:44 -0500
Subject: [PATCH 18/20] Enable typecasting between decimal and int (#7691)

This enables type conversions between `decimal` types and `int` types.

Closes #7440

Authors:
  - @ChrisJar

Approvers:
  - GALI PREM SAGAR (@galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/7691
---
 python/cudf/cudf/_lib/unary.pyx           | 25 ++-----
 python/cudf/cudf/core/column/decimal.py   | 11 +--
 python/cudf/cudf/core/column/numerical.py | 10 +--
 python/cudf/cudf/tests/test_decimal.py    | 82 +++++++++++++++++------
 4 files changed, 71 insertions(+), 57 deletions(-)

diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx
index 6e20dcaf299..3bac0cde9c6 100644
--- a/python/cudf/cudf/_lib/unary.pyx
+++ b/python/cudf/cudf/_lib/unary.pyx
@@ -25,7 +25,7 @@ from cudf._lib.cpp.unary cimport (
     unary_operator,
 )
 
-from cudf._lib.types cimport underlying_type_t_type_id
+from cudf._lib.types cimport underlying_type_t_type_id, dtype_to_data_type
 
 cimport cudf._lib.cpp.unary as libcudf_unary
 cimport cudf._lib.cpp.types as libcudf_types
@@ -95,30 +95,17 @@ def is_valid(Column input):
 
 def cast(Column input, object dtype=np.float64):
     cdef column_view c_input = input.view()
-    cdef type_id tid
-    cdef data_type c_dtype
-
-    # TODO: Use dtype_to_data_type when it becomes available
-    # to simplify this conversion
-    if is_decimal_dtype(dtype):
-        tid = libcudf_types.type_id.DECIMAL64
-        c_dtype = data_type(tid, -dtype.scale)
-    else:
-        tid = (
-            <type_id> (
-                <underlying_type_t_type_id> (
-                    np_to_cudf_types[np.dtype(dtype)]
-                )
-            )
-        )
-        c_dtype = data_type(tid)
+    cdef data_type c_dtype = dtype_to_data_type(dtype)
 
     cdef unique_ptr[column] c_result
 
     with nogil:
         c_result = move(libcudf_unary.cast(c_input, c_dtype))
 
-    return Column.from_unique_ptr(move(c_result))
+    result = Column.from_unique_ptr(move(c_result))
+    if is_decimal_dtype(result.dtype):
+        result.dtype.precision = dtype.precision
+    return result
 
 
 def is_nan(Column input):
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 4ba675516ae..96e09a5abb5 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -4,7 +4,6 @@
 import cupy as cp
 import numpy as np
 import pyarrow as pa
-from pandas.api.types import is_integer_dtype
 from typing import cast
 
 from cudf import _lib as libcudf
@@ -80,19 +79,11 @@ def as_decimal_column(
     ) -> "cudf.core.column.DecimalColumn":
         if dtype == self.dtype:
             return self
-        result = libcudf.unary.cast(self, dtype)
-        if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-            result.dtype.precision = dtype.precision
-        return result
+        return libcudf.unary.cast(self, dtype)
 
     def as_numerical_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.NumericalColumn":
-        if is_integer_dtype(dtype):
-            raise NotImplementedError(
-                "Casting from decimal types to integer "
-                "types not currently supported"
-            )
         return libcudf.unary.cast(self, dtype)
 
     def as_string_column(
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index da77517c75d..f58a47a918c 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -208,15 +208,7 @@ def as_timedelta_column(
     def as_decimal_column(
         self, dtype: Dtype, **kwargs
     ) -> "cudf.core.column.DecimalColumn":
-        if is_integer_dtype(self.dtype):
-            raise NotImplementedError(
-                "Casting from integer types to decimal "
-                "types not currently supported"
-            )
-        result = libcudf.unary.cast(self, dtype)
-        if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-            result.dtype.precision = dtype.precision
-        return result
+        return libcudf.unary.cast(self, dtype)
 
     def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
         dtype = np.dtype(dtype)
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index ddf56828c3d..80ff9d5734c 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -11,7 +11,9 @@
 from cudf.core.column import DecimalColumn, NumericalColumn
 
 from cudf.tests.utils import (
+    NUMERIC_TYPES,
     FLOAT_TYPES,
+    INTEGER_TYPES,
     assert_eq,
 )
 
@@ -75,18 +77,59 @@ def test_from_arrow_max_precision():
     "to_dtype",
     [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)],
 )
-def test_typecast_to_decimal(data, from_dtype, to_dtype):
-    actual = data.astype(from_dtype)
-    expected = actual
+def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype):
+    got = data.astype(from_dtype)
 
-    actual = actual.astype(to_dtype)
-    pa_arr = expected.to_arrow().cast(
+    pa_arr = got.to_arrow().cast(
         pa.decimal128(to_dtype.precision, to_dtype.scale)
     )
     expected = cudf.Series(DecimalColumn.from_arrow(pa_arr))
 
-    assert_eq(actual, expected)
-    assert_eq(actual.dtype, expected.dtype)
+    got = got.astype(to_dtype)
+
+    assert_eq(got, expected)
+    assert_eq(got.dtype, expected.dtype)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.Series(
+            [
+                14.12302,
+                38.2,
+                np.nan,
+                0.0,
+                -8.302014,
+                np.nan,
+                94.31304,
+                np.nan,
+                -112.2314,
+                0.3333333,
+                np.nan,
+            ]
+        ),
+    ],
+)
+@pytest.mark.parametrize("from_dtype", INTEGER_TYPES)
+@pytest.mark.parametrize(
+    "to_dtype",
+    [Decimal64Dtype(9, 3), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)],
+)
+def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype):
+    got = data.astype(from_dtype)
+
+    pa_arr = (
+        got.to_arrow()
+        .cast("float64")
+        .cast(pa.decimal128(to_dtype.precision, to_dtype.scale))
+    )
+    expected = cudf.Series(DecimalColumn.from_arrow(pa_arr))
+
+    got = got.astype(to_dtype)
+
+    assert_eq(got, expected)
+    assert_eq(got.dtype, expected.dtype)
 
 
 @pytest.mark.parametrize(
@@ -117,17 +160,17 @@ def test_typecast_to_decimal(data, from_dtype, to_dtype):
     [Decimal64Dtype(7, 2), Decimal64Dtype(18, 10), Decimal64Dtype(11, 4)],
 )
 def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
-    actual = data.astype(from_dtype)
-    expected = actual
+    got = data.astype(from_dtype)
 
-    actual = actual.astype(to_dtype)
-    pa_arr = expected.to_arrow().cast(
+    pa_arr = got.to_arrow().cast(
         pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False
     )
     expected = cudf.Series(DecimalColumn.from_arrow(pa_arr))
 
-    assert_eq(actual, expected)
-    assert_eq(actual.dtype, expected.dtype)
+    got = got.astype(to_dtype)
+
+    assert_eq(got, expected)
+    assert_eq(got.dtype, expected.dtype)
 
 
 @pytest.mark.parametrize(
@@ -151,14 +194,15 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
 )
 @pytest.mark.parametrize(
     "from_dtype",
-    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)],
+    [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(17, 10)],
 )
-@pytest.mark.parametrize("to_dtype", FLOAT_TYPES)
+@pytest.mark.parametrize("to_dtype", NUMERIC_TYPES)
 def test_typecast_from_decimal(data, from_dtype, to_dtype):
-    actual = data.astype(from_dtype)
-    pa_arr = actual.to_arrow().cast(to_dtype, safe=False)
+    got = data.astype(from_dtype)
+    pa_arr = got.to_arrow().cast(to_dtype, safe=False)
 
-    actual = actual.astype(to_dtype)
+    got = got.astype(to_dtype)
     expected = cudf.Series(NumericalColumn.from_arrow(pa_arr))
 
-    assert_eq(actual, expected)
+    assert_eq(got, expected)
+    assert_eq(got.dtype, expected.dtype)

From 599f62d1f3aea59fd6429911bfeb394349428c83 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Mon, 29 Mar 2021 21:10:12 -0500
Subject: [PATCH 19/20] Add Java bindings for row_bit_count (#7749)

Adds Java bindings for `cudf::row_bit_count`.  This depends on #7534.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Robert (Bobby) Evans (@revans2)

URL: https://github.com/rapidsai/cudf/pull/7749
---
 java/src/main/java/ai/rapids/cudf/Table.java  | 24 +++++++++++++++++
 java/src/main/native/src/TableJni.cpp         | 11 ++++++++
 .../test/java/ai/rapids/cudf/TableTest.java   | 26 +++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index fc6ad55044a..8f256987dd2 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -529,6 +529,8 @@ private static native long[] repeatColumnCount(long tableHandle,
                                                  long columnHandle,
                                                  boolean checkCount);
 
+  private static native long rowBitCount(long tableHandle) throws CudfException;
+
   private static native long[] explode(long tableHandle, int index);
 
   private static native long[] explodePosition(long tableHandle, int index);
@@ -1906,6 +1908,28 @@ public Table explodeOuterPosition(int index) {
     return new Table(explodeOuterPosition(nativeHandle, index));
   }
 
+  /**
+   * Returns an approximate cumulative size in bits of all columns in this table for each row.
+   * This method counts bits instead of bytes to account for the null mask, which only has one
+   * bit per row. Each row in the returned column is the sum of the per-row bit size for each column
+   * in the table.
+   *
+   * In some cases, this is an inexact approximation. Specifically, columns of lists and strings
+   * require N+1 offsets to represent N rows. It is up to the caller to calculate the small
+   * additional overhead of the terminating offset for any group of rows being considered.
+   *
+   * This method returns the per-row bit sizes as the columns are currently formed. This can
+   * end up being larger than the number you would get by gathering the rows. Specifically,
+   * the push-down of struct column validity masks can nullify rows that contain data for
+   * string or list columns. In these cases, the size returned is conservative such that:
+   * rowBitCount(column(x)) >= rowBitCount(gather(column(x)))
+   *
+   * @return INT32 column holding the bit size of each row of this table
+   */
+  public ColumnVector rowBitCount() {
+    return new ColumnVector(rowBitCount(getNativeView()));
+  }
+
   /**
    * Gathers the rows of this table according to `gatherMap` such that row "i"
    * in the resulting table's columns will contain row "gatherMap[i]" from this table.
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 0e66cde3ee1..346ae8435cc 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -2366,4 +2366,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIE
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_rowBitCount(JNIEnv* env, jclass, jlong j_table) {
+  JNI_NULL_CHECK(env, j_table, "table is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const* input = reinterpret_cast<cudf::table_view const*>(j_table);
+    // Release ownership of the result column; it is returned to the caller as a raw handle.
+    return reinterpret_cast<jlong>(cudf::row_bit_count(*input).release());
+  }
+  CATCH_STD(env, 0);
+}
+
 } // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index ac71f96d3c3..9c67966c16c 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -4344,6 +4344,32 @@ void testGroupByNoAggs() {
     }
   }
 
+  @Test
+  void testRowBitCount() {
+    try (Table input = new Table.TestBuilder()
+        .column(0, 1, null, 3)                 // 4 data bytes + validity bit = 33 bits per row
+        .column(0.0, null, 2.0, 3.0)           // 8 data bytes + validity bit = 65 bits per row
+        .column("zero", null, "two", "three")  // 4 byte offset + validity bit = 33 bits, plus chars
+        .build();
+         ColumnVector expectedBits = ColumnVector.fromInts(163, 131, 155, 171);
+         ColumnVector rowBits = input.rowBitCount()) {
+      assertColumnsAreEqual(expectedBits, rowBits);
+    }
+  }
+
+  @Test
+  void testRowBitCountEmpty() {
+    try (Table emptyTable = new Table.TestBuilder()
+            .column(new Integer[0])
+            .column(new Double[0])
+            .column(new String[0])
+            .build();
+         ColumnVector rowBits = emptyTable.rowBitCount()) {
+      assertEquals(DType.INT32, rowBits.getType());
+      assertEquals(0, rowBits.getRowCount());
+    }
+  }
+
   @Test
   void testSimpleGather() {
     try (Table testTable = new Table.TestBuilder()

From 7d49f75df9681dbe1653029e7d508355884a6d86 Mon Sep 17 00:00:00 2001
From: Mike Wendt <1915404+mike-wendt@users.noreply.github.com>
Date: Tue, 30 Mar 2021 10:46:20 -0400
Subject: [PATCH 20/20] Update conda recipes pinning of repo dependencies
 (#7743)

Ensure that every conda package built from this repo pins its dependencies on other packages from this repo to the same version and build number. This prevents a conda solve from picking mismatched builds of `cudf` and `libcudf`, among others, which can break this repo and downstream projects.

Authors:
  - Mike Wendt (@mike-wendt)

Approvers:
  - Ray Douglass (@raydouglass)

URL: https://github.com/rapidsai/cudf/pull/7743
---
 conda/recipes/cudf/meta.yaml          | 2 +-
 conda/recipes/cudf_kafka/meta.yaml    | 8 ++++----
 conda/recipes/custreamz/meta.yaml     | 8 ++++----
 conda/recipes/dask-cudf/meta.yaml     | 6 +++---
 conda/recipes/libcudf_kafka/meta.yaml | 2 +-
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 5635f54ba20..a119040bbcf 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -28,7 +28,7 @@ requirements:
     - numba >=0.49.0
     - dlpack
     - pyarrow 1.0.1
-    - libcudf {{ version }}
+    - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - rmm {{ minor_version }}
     - cudatoolkit {{ cuda_version }}
   run:
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 0acd9ec4bb2..cc3f30091bf 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -29,12 +29,12 @@ requirements:
     - python
     - cython >=0.29,<0.30
     - setuptools
-    - cudf {{ version }}
-    - libcudf_kafka {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
+    - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
   run:
-    - libcudf_kafka {{ version }}
+    - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - python-confluent-kafka
-    - cudf {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
 
 test:
   requires:
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index ffda6d0c3c6..8edca7a51d0 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -23,15 +23,15 @@ requirements:
   host:
     - python
     - python-confluent-kafka
-    - cudf_kafka {{ version }}
+    - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
   run:
     - python
-    - streamz 
-    - cudf {{ version }}
+    - streamz
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - dask >=2.22.0
     - distributed >=2.22.0
     - python-confluent-kafka
-    - cudf_kafka {{ version }}
+    - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
 
 test:
   requires:
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 170075743bd..04992f8e481 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -22,15 +22,15 @@ build:
 requirements:
   host:
     - python
-    - cudf {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - dask >=2.22.0
     - distributed >=2.22.0
   run:
     - python
-    - cudf {{ version }}
+    - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - dask >=2.22.0
     - distributed >=2.22.0
-  
+
 test:
   requires:
     - cudatoolkit {{ cuda_version }}.*
diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml
index 5348ec471e9..81ff922b8d7 100644
--- a/conda/recipes/libcudf_kafka/meta.yaml
+++ b/conda/recipes/libcudf_kafka/meta.yaml
@@ -25,7 +25,7 @@ requirements:
   build:
     - cmake >=3.17.0
   host:
-    - libcudf {{ version }}
+    - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }}
     - librdkafka >=1.5.0,<1.5.3
   run:
     - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not