diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu
index bd013afc451..fa6afdd908c 100644
--- a/cpp/benchmarks/join/join_benchmark.cu
+++ b/cpp/benchmarks/join/join_benchmark.cu
@@ -105,12 +105,8 @@ static void BM_join(benchmark::State &state)
   for (auto _ : state) {
     cuda_event_timer raii(state, true, 0);
 
-    auto result = cudf::inner_join(probe_table,
-                                   build_table,
-                                   columns_to_join,
-                                   columns_to_join,
-                                   {{0, 0}},
-                                   cudf::null_equality::UNEQUAL);
+    auto result = cudf::inner_join(
+      probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL);
   }
 }
 
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index b2c1296ccef..fcc0bcd444e 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -20,6 +20,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <vector>
 
@@ -30,6 +31,44 @@ namespace cudf {
  * @file
  */
 
+/**
+ * @brief Returns a pair of row index vectors corresponding to an
+ * inner join between the specified tables.
+ *
+ * The first returned vector contains the row indices from the left
+ * table that have a match in the right table (in unspecified order).
+ * The corresponding values in the second returned vector are
+ * the matched row indices from the right table.
+ *
+ * @code{.pseudo}
+ *     Left: {{0, 1, 2}}
+ *     Right: {{1, 2, 3}}
+ *     Result: {{1, 2}, {0, 1}}
+ *
+ *     Left: {{0, 1, 2}, {3, 4, 5}}
+ *     Right: {{1, 2, 3}, {4, 6, 7}}
+ *     Result: {{1}, {0}}
+ * @endcode
+ * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys`
+ * mismatch.
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned vectors' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing an inner join between two tables with `left_keys` and `right_keys`
+ * as the join keys.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+inner_join(cudf::table_view const& left_keys,
+           cudf::table_view const& right_keys,
+           null_equality compare_nulls         = null_equality::EQUAL,
+           rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs an inner join on the specified columns of two
  * tables (`left`, `right`)
@@ -38,26 +77,13 @@ namespace cudf {
  * in the columns being joined on match.
  *
  * @code{.pseudo}
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, a: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{4, 9, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          columns_in_common: { {0, 1} }
- * Result: { a: {1, 2}, b: {1, 2} }
- *
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
- *          left_on: {0}
- *          right_on: {0}
- *          columns_in_common: { }
- * Result: { a: {1, 2}, b: {1, 2}, c: {1, 2} }
+ * Result: {{1, 2}, {4, 9}, {1, 2}}
  * @endcode
  *
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`.
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) such that the location of `L` within `left_on` is not equal to
- * location of R within `right_on`
  * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
  * mismatch.
  * @throw cudf::logic_error if number of columns in either `left` or `right`
@@ -73,59 +99,83 @@ namespace cudf {
  * @param[in] right_on The column indices from `right` to join on.
  * The column from `right` indicated by `right_on[i]` will be compared against the column
  * from `left` indicated by `left_on[i]`.
- * @param[in] columns_in_common is a vector of pairs of column indices into
- * `left` and `right`, respectively, that are "in common". For "common"
- * columns, only a single output column will be produced, which is gathered
- * from `left_on` columns. Else, for every column in `left_on` and `right_on`,
- * an output column will be produced.  For each of these pairs (L, R), L
- * should exist in `left_on` and R should exist in `right_on`.
  * @param[in] compare_nulls controls whether null join-key values
  * should match or not.
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return Result of joining `left` and `right` tables on the columns
- * specified by `left_on` and `right_on`. The resulting table will be joined columns of
- * `left(including common columns)+right(excluding common columns)`.
+ * specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> inner_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a pair of row index vectors corresponding to a
+ * left join between the specified tables.
+ *
+ * The first returned vector contains all the row indices from the left
+ * table (in unspecified order). The corresponding value in the
+ * second returned vector is either (1) the row index of the matched row
+ * from the right table, if there is a match, or (2) an unspecified
+ * out-of-bounds value.
+ *
+ * @code{.pseudo}
+ *     Left: {{0, 1, 2}}
+ *     Right: {{1, 2, 3}}
+ *     Result: {{0, 1, 2}, {None, 0, 1}}
+ *
+ *     Left: {{0, 1, 2}, {3, 4, 5}}
+ *     Right: {{1, 2, 3}, {4, 6, 7}}
+ *     Result: {{0, 1, 2}, {None, 0, None}}
+ * @endcode
+ * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys`
+ * mismatch.
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned vectors' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a left join between two tables with `left_keys` and `right_keys`
+ * as the join keys.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+left_join(cudf::table_view const& left_keys,
+          cudf::table_view const& right_keys,
+          null_equality compare_nulls         = null_equality::EQUAL,
+          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a left join (also known as left outer join) on the
  * specified columns of two tables (`left`, `right`)
  *
- * Left Join returns all the rows from the left table and those rows from the
+ * Left join returns all the rows from the left table and those rows from the
  * right table that match on the joined columns.
  * For rows from the right table that do not have a match, the corresponding
  * values in the left columns will be null.
  *
  * @code{.pseudo}
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, a: {1 ,2 ,5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          columns_in_common: { {0, 1} }
- * Result: { a: {0, 1, 2}, b: {NULL, 1, 2} }
+ * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} }
  *
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {0}
- *          columns_in_common: { }
- * Result: { a: {0, 1, 2}, b: {NULL, 1, 2}, c: {NULL, 1, 2} }
+ * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} }
  * @endcode
  *
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`.
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) such that the location of `L` within `left_on` is not equal to
- * location of R within `right_on`
  * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
  * mismatch.
  * @throw cudf::logic_error if number of columns in either `left` or `right`
@@ -141,29 +191,59 @@ std::unique_ptr<cudf::table> inner_join(
  * @param[in] right_on The column indices from `right` to join on.
  * The column from `right` indicated by `right_on[i]` will be compared against the column
  * from `left` indicated by `left_on[i]`.
- * @param[in] columns_in_common is a vector of pairs of column indices into
- * `left` and `right`, respectively, that are "in common". For "common"
- * columns, only a single output column will be produced, which is gathered
- * from `left_on` columns. Else, for every column in `left_on` and `right_on`,
- * an output column will be produced.  For each of these pairs (L, R), L
- * should exist in `left_on` and R should exist in `right_on`.
  * @param[in] compare_nulls controls whether null join-key values
  * should match or not.
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return Result of joining `left` and `right` tables on the columns
- * specified by `left_on` and `right_on`. The resulting table will be joined columns of
- * `left(including common columns)+right(excluding common columns)`.
+ * specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> left_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a pair of row index vectors corresponding to a
+ * full join between the specified tables.
+ *
+ * Taken pairwise, the values from the returned vectors are one of:
+ * (1) row indices corresponding to matching rows from the left and
+ * right tables, (2) a row index and an unspecified out-of-bounds value,
+ * representing a row from one table without a match in the other.
+ *
+ * @code{.pseudo}
+ *     Left: {{0, 1, 2}}
+ *     Right: {{1, 2, 3}}
+ *     Result: {{0, 1, 2, None}, {None, 0, 1, 2}}
+ *
+ *     Left: {{0, 1, 2}, {3, 4, 5}}
+ *     Right: {{1, 2, 3}, {4, 6, 7}}
+ *     Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
+ * @endcode
+ * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys`
+ * mismatch.
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned vectors' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a full join between two tables with `left_keys` and `right_keys`
+ * as the join keys.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+full_join(cudf::table_view const& left_keys,
+          cudf::table_view const& right_keys,
+          null_equality compare_nulls         = null_equality::EQUAL,
+          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a full join (also known as full outer join) on the
  * specified columns of two tables (`left`, `right`)
@@ -174,26 +254,19 @@ std::unique_ptr<cudf::table> left_join(
  * values in the left columns will be null.
  *
  * @code{.pseudo}
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          columns_in_common: { {0, 1} }
- * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} }
+ * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} }
  *
- *          Left a: {0, 1, 2}
- *          Right b: {1, 2, 3}, c: {1, 2, 5}
+ *          Left: {{0, 1, 2}}
+ *          Right: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {0}
- *          columns_in_common: { }
- * Result: { a: {0, 1, 2, NULL}, b: {NULL, 1, 2, 3}, c: {NULL, 1, 2, 5} }
+ * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} }
  * @endcode
  *
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) if L does not exist in `left_on` or R does not exist in `right_on`.
- * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
- * (L, R) such that the location of `L` within `left_on` is not equal to
- * location of R within `right_on`
  * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
  * mismatch.
  * @throw cudf::logic_error if number of columns in either `left` or `right`
@@ -209,28 +282,54 @@ std::unique_ptr<cudf::table> left_join(
  * @param[in] right_on The column indices from `right` to join on.
  * The column from `right` indicated by `right_on[i]` will be compared against the column
  * from `left` indicated by `left_on[i]`.
- * @param[in] columns_in_common is a vector of pairs of column indices into
- * `left` and `right`, respectively, that are "in common". For "common"
- * columns, only a single output column will be produced, which is gathered
- * from `left_on` columns. Else, for every column in `left_on` and `right_on`,
- * an output column will be produced.  For each of these pairs (L, R), L
- * should exist in `left_on` and R should exist in `right_on`.
  * @param[in] compare_nulls controls whether null join-key values
  * should match or not.
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return Result of joining `left` and `right` tables on the columns
- * specified by `left_on` and `right_on`. The resulting table will be joined columns of
- * `left(including common columns)+right(excluding common columns)`.
+ * specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> full_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a vector of row indices corresponding to a left semi join
+ * between the specified tables.
+ *
+ * The returned vector contains the row indices from the left table
+ * for which there is a matching row in the right table.
+ *
+ * @code{.pseudo}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3}}
+ * Result: {1, 2}
+ * @endcode
+ *
+ * @throw cudf::logic_error if number of columns in either
+ * `left_keys` or `right_keys` table is 0 or exceeds MAX_JOIN_SIZE
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned vector's
+ * device memory
+ *
+ * @return A vector `left_indices` that can be used to construct
+ * the result of performing a left semi join between two tables with
+ * `left_keys` and `right_keys` as the join keys.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
+  cudf::table_view const& left_keys,
+  cudf::table_view const& right_keys,
+  null_equality compare_nulls         = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Performs a left semi join on the specified columns of two
  * tables (`left`, `right`)
@@ -239,24 +338,20 @@ std::unique_ptr<cudf::table> full_join(
  * returns rows that exist in the right table.
  *
  * @code{.pseudo}
- *          TableA a: {0, 1, 2}
- *          TableB b: {1, 2, 3}, a: {1, 2, 5}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3}, {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          return_columns: { 0 }
- * Result: { a: {1, 2} }
+ * Result: { {1, 2} }
  *
- *          TableA a: {0, 1, 2}, c: {1, 2, 5}
- *          TableB b: {1, 2, 3}
+ *          TableA: {{0, 1, 2}, {1, 2, 5}}
+ *          TableB: {{1, 2, 3}}
  *          left_on: {0}
  *          right_on: {0}
- *          return_columns: { 1 }
- * Result: { c: {1, 2} }
+ * Result: { {1, 2}, {2, 5} }
  * @endcode
  *
- * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0
- * @throw cudf::logic_error if the number of returned columns is 0
- * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal
+ * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0
  *
  * @param[in] left             The left table
  * @param[in] right            The right table
@@ -268,22 +363,49 @@ std::unique_ptr<cudf::table> full_join(
  *                             The column from `right` indicated by `right_on[i]`
  *                             will be compared against the column from `left`
  *                             indicated by `left_on[i]`.
- * @param[in] return_columns   A vector of column indices from `left` to
- *                             include in the returned table.
  * @param[in] compare_nulls    Controls whether null join-key values should match or not.
  * @param[in] mr               Device memory resource used to allocate the returned table's
  *                             device memory
  *
  * @return                     Result of joining `left` and `right` tables on the columns
- *                             specified by `left_on` and `right_on`. The resulting table
- *                             will contain `return_columns` from `left` that match in right.
+ *                             specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> left_semi_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<cudf::size_type> const& return_columns,
+  null_equality compare_nulls         = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a vector of row indices corresponding to a left anti join
+ * between the specified tables.
+ *
+ * The returned vector contains the row indices from the left table
+ * for which there is no matching row in the right table.
+ *
+ * @code{.pseudo}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3}}
+ * Result: {0}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0
+ *
+ * @param[in] left_keys The left table
+ * @param[in] right_keys The right table
+ * @param[in] compare_nulls controls whether null join-key values
+ * should match or not.
+ * @param mr Device memory resource used to allocate the returned vector's device memory
+ *
+ * @return A vector `left_indices` that can be used to construct
+ * the result of performing a left anti join between two tables with
+ * `left_keys` and `right_keys` as the join keys.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
+  cudf::table_view const& left_keys,
+  cudf::table_view const& right_keys,
+  null_equality compare_nulls         = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -295,24 +417,23 @@ std::unique_ptr<cudf::table> left_semi_join(
  * returns rows that do not exist in the right table.
  *
  * @code{.pseudo}
- *          TableA a: {0, 1, 2}
- *          TableB b: {1, 2, 3}, a: {1, 2, 5}
+ *          TableA: {{0, 1, 2}}
+ *          TableB: {{1, 2, 3},  {1, 2, 5}}
  *          left_on: {0}
  *          right_on: {1}
- *          return_columns: { 0 }
- * Result: { a: {0} }
+ * Result: {{0}}
  *
- *          TableA a: {0, 1, 2}, c: {1, 2, 5}
- *          TableB b: {1, 2, 3}
+ *          TableA: {{0, 1, 2}, {1, 2, 5}}
+ *          TableB: {{1, 2, 3}}
  *          left_on: {0}
  *          right_on: {0}
- *          return_columns: { 1 }
- * Result: { c: {1} }
+ * Result: { {0}, {1} }
  * @endcode
  *
- * @throw cudf::logic_error if the number of columns in either `left` or `right` table is 0
- * @throw cudf::logic_error if the number of returned columns is 0
- * @throw cudf::logic_error if the number of elements in `left_on` and `right_on` are not equal
+ * @throw cudf::logic_error if number of elements in `left_on` or `right_on`
+ * mismatch.
+ * @throw cudf::logic_error if number of columns in either `left` or `right`
+ * table is 0 or exceeds MAX_JOIN_SIZE
  *
  * @param[in] left             The left table
  * @param[in] right            The right table
@@ -324,22 +445,18 @@ std::unique_ptr<cudf::table> left_semi_join(
  *                             The column from `right` indicated by `right_on[i]`
  *                             will be compared against the column from `left`
  *                             indicated by `left_on[i]`.
- * @param[in] return_columns   A vector of column indices from `left` to
- *                             include in the returned table.
  * @param[in] compare_nulls    Controls whether null join-key values should match or not.
  * @param[in] mr               Device memory resource used to allocate the returned table's
  *                             device memory
  *
  * @return                     Result of joining `left` and `right` tables on the columns
- *                             specified by `left_on` and `right_on`. The resulting table
- *                             will contain `return_columns` from `left` that match in right.
+ *                             specified by `left_on` and `right_on`.
  */
 std::unique_ptr<cudf::table> left_anti_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<cudf::size_type> const& return_columns,
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -393,128 +510,75 @@ class hash_join {
    * undefined.
    *
    * @param build The build table, from which the hash table is built.
-   * @param build_on The column indices from `build` to join on.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param stream CUDA stream used for device memory operations and kernel launches
    */
   hash_join(cudf::table_view const& build,
-            std::vector<size_type> const& build_on,
             null_equality compare_nulls,
             rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
   /**
-   * @brief Controls where common columns will be output for a inner join.
-   */
-  enum class common_columns_output_side {
-    PROBE,  ///< Common columns is output in the probe portion of the table pair returned by
-            ///< `inner_join`.
-    BUILD   ///< Common columns is output in the build portion of the table pair returned by
-            ///< `inner_join`.
-  };
-
-  /**
-   * @brief Performs an inner join by probing in the internal hash table.
-   *
-   * Given that it is sometimes desired to choose the small table to be the `build` side for an
-   * inner join，a (`probe`, `build`) table pair, which contains the probe and build portions of the
-   * logical joined table respectively, is returned so that caller can freely rearrange them to
-   * restore the logical `left` `right` order. This introduces some extra logic about where "common"
-   * columns should go, i.e. the legacy `cudf::inner_join()` API always outputs "common" columns in
-   * the `left` portion and the corresponding columns in the `right` portion are omitted. To better
-   * align with the legacy `cudf::inner_join()` API, a `common_columns_output_side` parameter is
-   * introduced to specify whether "common" columns should go in `probe` or `build` portion.
-   *
-   * More details please @see cudf::inner_join().
+   * @brief Returns the row indices that can be used to construct the result of performing
+   * an inner join between two tables. @see cudf::inner_join().
    *
    * @param probe The probe table, from which the tuples are probed.
-   * @param probe_on The column indices from `probe` to join on.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns or `build_on` columns if `probe_output_side` is LEFT or RIGHT.
-   * Else, for every column in `probe_on` and `build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `build_on`.
-   * @param common_columns_output_side @see `common_columns_output_side`.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Table pair of (`probe`, `build`) of joining both tables on the columns
-   * specified by `probe_on` and `build_on`. The resulting table pair will be joined columns of
-   * (`probe(including common columns)`, `build(excluding common columns)`) if
-   * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`,
-   * `build(including common columns)`) if `common_columns_output_side` is `BUILD`.
+   * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+   * the result of performing an inner join between two tables with `build` and `probe`
+   * as the join keys.
    */
-  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> inner_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE,
-    null_equality compare_nulls                           = null_equality::EQUAL,
-    rmm::cuda_stream_view stream                          = rmm::cuda_stream_default,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  inner_join(cudf::table_view const& probe,
+             null_equality compare_nulls         = null_equality::EQUAL,
+             rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+             rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
   /**
-   * @brief Performs a left join by probing in the internal hash table.
-   *
-   * More details please @see cudf::left_join().
+   * @brief Returns the row indices that can be used to construct the result of performing
+   * a left join between two tables. @see cudf::left_join().
    *
    * @param probe The probe table, from which the tuples are probed.
-   * @param probe_on The column indices from `probe` to join on.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `build_on`.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Result of joining `build` and `probe` tables on the columns
-   * specified by `build_on` and `probe_on`. The resulting table will be joined columns of
-   * `probe(including common columns)+build(excluding common columns)`.
+   * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+   * the result of performing a left join between two tables with `build` and `probe`
+   * as the join keys.
    */
-  std::unique_ptr<cudf::table> left_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls         = null_equality::EQUAL,
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  left_join(cudf::table_view const& probe,
+            null_equality compare_nulls         = null_equality::EQUAL,
+            rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+            rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
   /**
-   * @brief Performs a full join by probing in the internal hash table.
-   *
-   * More details please @see cudf::full_join().
+   * @brief Returns the row indices that can be used to construct the result of performing
+   * a full join between two tables. @see cudf::full_join().
    *
    * @param probe The probe table, from which the tuples are probed.
-   * @param probe_on The column indices from `probe` to join on.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns. Else, for every column in `probe_on` and `build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `build_on`.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
    * @param stream CUDA stream used for device memory operations and kernel launches
    *
-   * @return Result of joining `build` and `probe` tables on the columns
-   * specified by `build_on` and `probe_on`. The resulting table will be joined columns of
-   * `probe(including common columns)+build(excluding common columns)`.
+   * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+   * the result of performing a full join between two tables with `build` and `probe`
+   * as the join keys.
    */
-  std::unique_ptr<cudf::table> full_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls         = null_equality::EQUAL,
-    rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  full_join(cudf::table_view const& probe,
+            null_equality compare_nulls         = null_equality::EQUAL,
+            rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
+            rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
  private:
   struct hash_join_impl;
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index 5cdecab9115..a225e590f9a 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -126,6 +126,11 @@ class table_view_base {
    */
   size_type num_rows() const noexcept { return _num_rows; }
 
+  /**
+   * @brief Returns true if the table has no columns (`num_columns() == 0`), false otherwise
+   */
+  bool is_empty() const noexcept { return num_columns() == 0; }
+
   table_view_base() = default;
 
   ~table_view_base() = default;
diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
index dc153e9395d..181752d18e8 100644
--- a/cpp/src/copying/gather.cu
+++ b/cpp/src/copying/gather.cu
@@ -43,9 +43,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
 
   if (neg_indices == negative_index_policy::ALLOWED) {
     cudf::size_type n_rows = source_table.num_rows();
-    auto idx_converter     = [n_rows] __device__(size_type in) {
-      return ((in % n_rows) + n_rows) % n_rows;
-    };
+    auto idx_converter = [n_rows] __device__(size_type in) { return in < 0 ? in + n_rows : in; };
     return gather(source_table,
                   thrust::make_transform_iterator(map_begin, idx_converter),
                   thrust::make_transform_iterator(map_end, idx_converter),
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index d827d03a6c0..5a6ad8892de 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <thrust/uninitialized_fill.h>
 #include <join/hash_join.cuh>
 
 #include <cudf/detail/concatenate.cuh>
@@ -20,93 +21,44 @@
 #include <cudf/detail/gather.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <iostream>
 #include <numeric>
 
 namespace cudf {
 namespace detail {
 
-/**
- * @brief Returns a vector with non-common indices which is set difference
- * between `[0, num_columns)` and index values in common_column_indices
- *
- * @param num_columns The number of columns, which represents column indices
- * from `[0, num_columns)` in a table
- * @param common_column_indices A vector of common indices which needs to be
- * excluded from `[0, num_columns)`
- *
- * @return vector A vector containing only the indices which are not present in
- * `common_column_indices`
- */
-auto non_common_column_indices(size_type num_columns,
-                               std::vector<size_type> const &common_column_indices)
-{
-  CUDF_EXPECTS(common_column_indices.size() <= static_cast<uint64_t>(num_columns),
-               "Too many columns in common");
-  std::vector<size_type> all_column_indices(num_columns);
-  std::iota(std::begin(all_column_indices), std::end(all_column_indices), 0);
-  std::vector<size_type> sorted_common_column_indices{common_column_indices};
-  std::sort(std::begin(sorted_common_column_indices), std::end(sorted_common_column_indices));
-  std::vector<size_type> non_common_column_indices(num_columns - common_column_indices.size());
-  std::set_difference(std::cbegin(all_column_indices),
-                      std::cend(all_column_indices),
-                      std::cbegin(sorted_common_column_indices),
-                      std::cend(sorted_common_column_indices),
-                      std::begin(non_common_column_indices));
-  return non_common_column_indices;
-}
-
 std::pair<std::unique_ptr<table>, std::unique_ptr<table>> get_empty_joined_table(
-  table_view const &probe,
-  table_view const &build,
-  std::vector<std::pair<size_type, size_type>> const &columns_in_common,
-  cudf::hash_join::common_columns_output_side common_columns_output_side)
+  table_view const &probe, table_view const &build)
 {
-  std::vector<size_type> columns_to_exclude(columns_in_common.size());
-  std::transform(columns_in_common.begin(),
-                 columns_in_common.end(),
-                 columns_to_exclude.begin(),
-                 [common_columns_output_side](auto &col) {
-                   return common_columns_output_side == hash_join::common_columns_output_side::PROBE
-                            ? col.second
-                            : col.first;
-                 });
-  std::vector<size_type> non_common_indices = non_common_column_indices(
-    common_columns_output_side == hash_join::common_columns_output_side::PROBE
-      ? build.num_columns()
-      : probe.num_columns(),
-    columns_to_exclude);
   std::unique_ptr<table> empty_probe = empty_like(probe);
   std::unique_ptr<table> empty_build = empty_like(build);
-  if (common_columns_output_side == hash_join::common_columns_output_side::PROBE) {
-    table_view empty_build_view = empty_build->select(non_common_indices);
-    empty_build                 = std::make_unique<table>(empty_build_view);
-  } else {
-    table_view empty_probe_view = empty_probe->select(non_common_indices);
-    empty_probe                 = std::make_unique<table>(empty_probe_view);
-  }
   return std::make_pair(std::move(empty_probe), std::move(empty_build));
 }
 
-VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b)
+VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS((a.first.size() == a.second.size()),
+  CUDF_EXPECTS((a.first->size() == a.second->size()),
                "Mismatch between sizes of vectors in vector pair");
-  CUDF_EXPECTS((b.first.size() == b.second.size()),
+  CUDF_EXPECTS((b.first->size() == b.second->size()),
                "Mismatch between sizes of vectors in vector pair");
-  if (a.first.empty()) {
-    return b;
-  } else if (b.first.empty()) {
-    return a;
+  if (a.first->is_empty()) {
+    return std::move(b);
+  } else if (b.first->is_empty()) {
+    return std::move(a);
   }
-  auto original_size = a.first.size();
-  a.first.resize(a.first.size() + b.first.size());
-  a.second.resize(a.second.size() + b.second.size());
-  thrust::copy(b.first.begin(), b.first.end(), a.first.begin() + original_size);
-  thrust::copy(b.second.begin(), b.second.end(), a.second.begin() + original_size);
-  return a;
+  auto original_size = a.first->size();
+  a.first->resize(a.first->size() + b.first->size(), stream);
+  a.second->resize(a.second->size() + b.second->size(), stream);
+  thrust::copy(
+    rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size);
+  thrust::copy(rmm::exec_policy(stream),
+               b.second->begin(),
+               b.second->end(),
+               a.second->begin() + original_size);
+  return std::move(a);
 }
 
 template <typename T>
@@ -133,16 +85,20 @@ struct valid_range {
  *
  * @return Pair of vectors containing the left join indices complement
  */
-std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>
-get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
-                                 size_type left_table_row_count,
-                                 size_type right_table_row_count,
-                                 rmm::cuda_stream_view stream)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+get_left_join_indices_complement(
+  std::unique_ptr<rmm::device_uvector<size_type>> &right_indices,
+  size_type left_table_row_count,
+  size_type right_table_row_count,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
 {
   // Get array of indices that do not appear in right_indices
 
   // Vector allocated for unmatched result
-  rmm::device_vector<size_type> right_indices_complement(right_table_row_count);
+  auto right_indices_complement =
+    std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream, mr);
 
   // If left table is empty in a full join call then all rows of the right table
   // should be represented in the joined indices. This is an optimization since
@@ -151,12 +107,16 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
   // produce exactly the same result as the else path but will be faster.
   if (left_table_row_count == 0) {
     thrust::sequence(rmm::exec_policy(stream),
-                     right_indices_complement.begin(),
-                     right_indices_complement.end(),
+                     right_indices_complement->begin(),
+                     right_indices_complement->end(),
                      0);
   } else {
     // Assume all the indices in invalid_index_map are invalid
-    rmm::device_vector<size_type> invalid_index_map(right_table_row_count, 1);
+    auto invalid_index_map =
+      std::make_unique<rmm::device_uvector<size_type>>(right_table_row_count, stream);
+    thrust::uninitialized_fill(
+      rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1});
+
     // Functor to check for index validity since left joins can create invalid indices
     valid_range<size_type> valid(0, right_table_row_count);
 
@@ -164,11 +124,11 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
     // Thus specifying that those locations are valid
     thrust::scatter_if(rmm::exec_policy(stream),
                        thrust::make_constant_iterator(0),
-                       thrust::make_constant_iterator(0) + right_indices.size(),
-                       right_indices.begin(),      // Index locations
-                       right_indices.begin(),      // Stencil - Check if index location is valid
-                       invalid_index_map.begin(),  // Output indices
-                       valid);                     // Stencil Predicate
+                       thrust::make_constant_iterator(0) + right_indices->size(),
+                       right_indices->begin(),      // Index locations
+                       right_indices->begin(),      // Stencil - Check if index location is valid
+                       invalid_index_map->begin(),  // Output indices
+                       valid);                      // Stencil Predicate
     size_type begin_counter = static_cast<size_type>(0);
     size_type end_counter   = static_cast<size_type>(right_table_row_count);
 
@@ -176,15 +136,19 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
     size_type indices_count = thrust::copy_if(rmm::exec_policy(stream),
                                               thrust::make_counting_iterator(begin_counter),
                                               thrust::make_counting_iterator(end_counter),
-                                              invalid_index_map.begin(),
-                                              right_indices_complement.begin(),
+                                              invalid_index_map->begin(),
+                                              right_indices_complement->begin(),
                                               thrust::identity<size_type>()) -
-                              right_indices_complement.begin();
-    right_indices_complement.resize(indices_count);
+                              right_indices_complement->begin();
+    right_indices_complement->resize(indices_count, stream);
   }
 
-  rmm::device_vector<size_type> left_invalid_indices(right_indices_complement.size(),
-                                                     JoinNoneValue);
+  auto left_invalid_indices =
+    std::make_unique<rmm::device_uvector<size_type>>(right_indices_complement->size(), stream, mr);
+  thrust::fill(rmm::exec_policy(stream),
+               left_invalid_indices->begin(),
+               left_invalid_indices->end(),
+               JoinNoneValue);
 
   return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement));
 }
@@ -195,8 +159,6 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
  * @throw cudf::logic_error if the number of columns in `build` table is 0.
  * @throw cudf::logic_error if the number of rows in `build` table is 0.
  * @throw cudf::logic_error if insertion to the hash table fails.
- * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build`
- * table.
  *
  * @param build Table of columns used to build join hash.
  * @param compare_nulls Controls whether null join-key values should match or not.
@@ -256,19 +218,22 @@ std::unique_ptr<multimap_type, std::function<void(multimap_type *)>> build_join_
  * @return Join output indices vector pair.
  */
 template <join_kind JoinKind>
-std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_join_hash_table(
-  cudf::table_device_view build_table,
-  cudf::table_device_view probe_table,
-  multimap_type const &hash_table,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+probe_join_hash_table(cudf::table_device_view build_table,
+                      cudf::table_device_view probe_table,
+                      multimap_type const &hash_table,
+                      null_equality compare_nulls,
+                      rmm::cuda_stream_view stream,
+                      rmm::mr::device_memory_resource *mr)
 {
   size_type estimated_size = estimate_join_output_size<JoinKind, multimap_type>(
     build_table, probe_table, hash_table, compare_nulls, stream);
 
   // If the estimated output size is zero, return immediately
   if (estimated_size == 0) {
-    return std::make_pair(rmm::device_vector<size_type>{}, rmm::device_vector<size_type>{});
+    return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
+                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
   // Because we are approximating the number of joined elements, our approximation
@@ -278,12 +243,13 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
   rmm::device_scalar<size_type> write_index(0, stream);
   size_type join_size{0};
 
-  rmm::device_vector<size_type> left_indices;
-  rmm::device_vector<size_type> right_indices;
+  auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+  auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+
   auto current_estimated_size = estimated_size;
   do {
-    left_indices.resize(estimated_size);
-    right_indices.resize(estimated_size);
+    left_indices->resize(estimated_size, stream);
+    right_indices->resize(estimated_size, stream);
 
     constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE};
     detail::grid_1d config(probe_table.num_rows(), block_size);
@@ -298,8 +264,8 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
         probe_table,
         hash_probe,
         equality,
-        left_indices.data().get(),
-        right_indices.data().get(),
+        left_indices->data(),
+        right_indices->data(),
         write_index.data(),
         estimated_size);
 
@@ -310,179 +276,11 @@ std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>> probe_jo
     estimated_size *= 2;
   } while ((current_estimated_size < join_size));
 
-  left_indices.resize(join_size);
-  right_indices.resize(join_size);
+  left_indices->resize(join_size, stream);
+  right_indices->resize(join_size, stream);
   return std::make_pair(std::move(left_indices), std::move(right_indices));
 }
 
-/**
- * @brief  Combines the non common probe, common probe, non common build and common build
- * columns in the correct order according to `common_columns_output_side` to form the joined
- * (`probe`, `build`) table pair.
- *
- * @param probe_noncommon_cols Columns obtained by gathering non common probe columns.
- * @param probe_noncommon_col_indices Output locations of non common probe columns in the probe
- * portion.
- * @param probe_common_col_indices Output locations of common probe columns in the probe portion.
- * @param build_noncommon_cols Columns obtained by gathering non common build columns.
- * @param build_noncommon_col_indices Output locations of non common build columns in the build
- * portion.
- * @param build_common_col_indices Output locations of common build columns in the build portion.
- * @param common_cols Columns obtained by gathering common columns from `probe` and `build` tables
- * in the build portion.
- * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side.
- *
- * @return Table pair of (`probe`, `build`).
- */
-std::pair<std::unique_ptr<table>, std::unique_ptr<table>> combine_join_columns(
-  std::vector<std::unique_ptr<column>> &&probe_noncommon_cols,
-  std::vector<size_type> const &probe_noncommon_col_indices,
-  std::vector<size_type> const &probe_common_col_indices,
-  std::vector<std::unique_ptr<column>> &&build_noncommon_cols,
-  std::vector<size_type> const &build_noncommon_col_indices,
-  std::vector<size_type> const &build_common_col_indices,
-  std::vector<std::unique_ptr<column>> &&common_cols,
-  cudf::hash_join::common_columns_output_side common_columns_output_side)
-{
-  if (common_columns_output_side == cudf::hash_join::common_columns_output_side::PROBE) {
-    std::vector<std::unique_ptr<column>> probe_cols(probe_noncommon_cols.size() +
-                                                    common_cols.size());
-    for (size_t i = 0; i < probe_noncommon_cols.size(); ++i) {
-      probe_cols.at(probe_noncommon_col_indices.at(i)) = std::move(probe_noncommon_cols.at(i));
-    }
-    for (size_t i = 0; i < common_cols.size(); ++i) {
-      probe_cols.at(probe_common_col_indices.at(i)) = std::move(common_cols.at(i));
-    }
-    return std::make_pair(std::make_unique<cudf::table>(std::move(probe_cols)),
-                          std::make_unique<cudf::table>(std::move(build_noncommon_cols)));
-  } else {
-    std::vector<std::unique_ptr<column>> build_cols(build_noncommon_cols.size() +
-                                                    common_cols.size());
-    for (size_t i = 0; i < build_noncommon_cols.size(); ++i) {
-      build_cols.at(build_noncommon_col_indices.at(i)) = std::move(build_noncommon_cols.at(i));
-    }
-    for (size_t i = 0; i < common_cols.size(); ++i) {
-      build_cols.at(build_common_col_indices.at(i)) = std::move(common_cols.at(i));
-    }
-    return std::make_pair(std::make_unique<cudf::table>(std::move(probe_noncommon_cols)),
-                          std::make_unique<cudf::table>(std::move(build_cols)));
-  }
-}
-
-/**
- * @brief  Gathers rows from `probe` and `build` table and returns a (`probe`, `build`) table pair,
- * which contains the probe and build portions of the logical joined table respectively.
- *
- * @tparam JoinKind The type of join to be performed
- *
- * @param probe Probe side table
- * @param build build side table
- * @param joined_indices Pair of vectors containing row indices from which
- * `probe` and `build` tables are gathered. If any row index is out of bounds,
- * the contribution in the output `table` will be NULL.
- * @param columns_in_common is a vector of pairs of column indices
- * from tables `probe` and `build` respectively, that are "in common".
- * For "common" columns, only a single output column will be produced.
- * For an inner or left join, the result will be gathered from the column in
- * `probe`. For a full join, the result will be gathered from both common
- * columns in `probe` and `build` and concatenated to form a single column.
- * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side.
- *
- * @return Table pair of (`probe`, `build`) containing the rows from `probe` and
- * `build` specified by `joined_indices`.
- * Columns in `columns_in_common` will be included in either `probe` or `build` portion as
- * `common_columns_output_side` indicates. Final form would look like
- * (`probe(including common columns)`, `build(excluding common columns)`) if
- * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`,
- * `build(including common columns)`) if `common_columns_output_side` is `BUILD`.
- */
-template <join_kind JoinKind>
-std::pair<std::unique_ptr<table>, std::unique_ptr<table>> construct_join_output_df(
-  table_view const &probe,
-  table_view const &build,
-  VectorPair &joined_indices,
-  std::vector<std::pair<size_type, size_type>> const &columns_in_common,
-  cudf::hash_join::common_columns_output_side common_columns_output_side,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr)
-{
-  std::vector<size_type> probe_common_col;
-  probe_common_col.reserve(columns_in_common.size());
-  std::vector<size_type> build_common_col;
-  build_common_col.reserve(columns_in_common.size());
-  for (const auto &c : columns_in_common) {
-    probe_common_col.push_back(c.first);
-    build_common_col.push_back(c.second);
-  }
-  std::vector<size_type> probe_noncommon_col =
-    non_common_column_indices(probe.num_columns(), probe_common_col);
-  std::vector<size_type> build_noncommon_col =
-    non_common_column_indices(build.num_columns(), build_common_col);
-
-  out_of_bounds_policy const bounds_policy = JoinKind != join_kind::INNER_JOIN
-                                               ? out_of_bounds_policy::NULLIFY
-                                               : out_of_bounds_policy::DONT_CHECK;
-
-  std::unique_ptr<table> common_table = std::make_unique<table>();
-  // Construct the joined columns
-  if (join_kind::FULL_JOIN == JoinKind) {
-    auto complement_indices = get_left_join_indices_complement(
-      joined_indices.second, probe.num_rows(), build.num_rows(), stream);
-    if (not columns_in_common.empty()) {
-      auto common_from_build = detail::gather(build.select(build_common_col),
-                                              complement_indices.second.begin(),
-                                              complement_indices.second.end(),
-                                              bounds_policy,
-                                              stream,
-                                              rmm::mr::get_current_device_resource());
-      auto common_from_probe = detail::gather(probe.select(probe_common_col),
-                                              joined_indices.first.begin(),
-                                              joined_indices.first.end(),
-                                              bounds_policy,
-                                              stream,
-                                              rmm::mr::get_current_device_resource());
-      common_table           = cudf::detail::concatenate(
-        std::vector<table_view>({common_from_build->view(), common_from_probe->view()}),
-        stream,
-        mr);
-    }
-    joined_indices = concatenate_vector_pairs(complement_indices, joined_indices);
-  } else {
-    if (not columns_in_common.empty()) {
-      common_table = detail::gather(probe.select(probe_common_col),
-                                    joined_indices.first.begin(),
-                                    joined_indices.first.end(),
-                                    bounds_policy,
-                                    stream,
-                                    mr);
-    }
-  }
-
-  // Construct the probe non common columns
-  std::unique_ptr<table> probe_table = detail::gather(probe.select(probe_noncommon_col),
-                                                      joined_indices.first.begin(),
-                                                      joined_indices.first.end(),
-                                                      bounds_policy,
-                                                      stream,
-                                                      mr);
-
-  std::unique_ptr<table> build_table = detail::gather(build.select(build_noncommon_col),
-                                                      joined_indices.second.begin(),
-                                                      joined_indices.second.end(),
-                                                      bounds_policy,
-                                                      stream,
-                                                      mr);
-
-  return combine_join_columns(probe_table->release(),
-                              probe_noncommon_col,
-                              probe_common_col,
-                              build_table->release(),
-                              build_noncommon_col,
-                              build_common_col,
-                              common_table->release(),
-                              common_columns_output_side);
-}
-
 std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table> &&left,
                                                 std::unique_ptr<cudf::table> &&right)
 {
@@ -499,147 +297,112 @@ std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table> &&l
 hash_join::hash_join_impl::~hash_join_impl() = default;
 
 hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build,
-                                          std::vector<size_type> const &build_on,
                                           null_equality compare_nulls,
                                           rmm::cuda_stream_view stream)
-  : _build(build),
-    _build_selected(build.select(build_on)),
-    _build_on(build_on),
-    _hash_table(nullptr)
+  : _build(build), _hash_table(nullptr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(0 != _build.num_columns(), "Hash join build table is empty");
   CUDF_EXPECTS(_build.num_rows() < cudf::detail::MAX_JOIN_SIZE,
                "Build column size is too big for hash join");
 
-  if (_build_on.empty() || 0 == build.num_rows()) { return; }
+  if (0 == build.num_rows()) { return; }
 
-  _hash_table = build_join_hash_table(_build_selected, compare_nulls, stream);
+  _hash_table = build_join_hash_table(_build, compare_nulls, stream);
 }
 
-std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
-hash_join::hash_join_impl::inner_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  common_columns_output_side common_columns_output_side,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::inner_join(cudf::table_view const &probe,
+                                      null_equality compare_nulls,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource *mr) const
 {
   CUDF_FUNC_RANGE();
-  return compute_hash_join<cudf::detail::join_kind::INNER_JOIN>(
-    probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr);
+  return compute_hash_join<cudf::detail::join_kind::INNER_JOIN>(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::hash_join_impl::left_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::left_join(cudf::table_view const &probe,
+                                     null_equality compare_nulls,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource *mr) const
 {
   CUDF_FUNC_RANGE();
-  auto probe_build_pair =
-    compute_hash_join<cudf::detail::join_kind::LEFT_JOIN>(probe,
-                                                          probe_on,
-                                                          columns_in_common,
-                                                          common_columns_output_side::PROBE,
-                                                          compare_nulls,
-                                                          stream,
-                                                          mr);
-  return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                          std::move(probe_build_pair.second));
+  return compute_hash_join<cudf::detail::join_kind::LEFT_JOIN>(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::hash_join_impl::full_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::full_join(cudf::table_view const &probe,
+                                     null_equality compare_nulls,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource *mr) const
 {
   CUDF_FUNC_RANGE();
-  auto probe_build_pair =
-    compute_hash_join<cudf::detail::join_kind::FULL_JOIN>(probe,
-                                                          probe_on,
-                                                          columns_in_common,
-                                                          common_columns_output_side::PROBE,
-                                                          compare_nulls,
-                                                          stream,
-                                                          mr);
-  return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                          std::move(probe_build_pair.second));
+  return compute_hash_join<cudf::detail::join_kind::FULL_JOIN>(probe, compare_nulls, stream, mr);
 }
 
 template <cudf::detail::join_kind JoinKind>
-std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
-hash_join::hash_join_impl::compute_hash_join(
-  cudf::table_view const &probe,
-  std::vector<size_type> const &probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const &columns_in_common,
-  common_columns_output_side common_columns_output_side,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource *mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe,
+                                             null_equality compare_nulls,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource *mr) const
 {
   CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty");
   CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE,
                "Probe column size is too big for hash join");
-  CUDF_EXPECTS(_build_on.size() == probe_on.size(),
+  CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),  // all columns are join keys
                "Mismatch in number of columns to be joined on");
 
-  CUDF_EXPECTS(std::all_of(columns_in_common.begin(),
-                           columns_in_common.end(),
-                           [this, &probe_on](auto pair) {
-                             size_t p = std::find(probe_on.begin(), probe_on.end(), pair.first) -
-                                        probe_on.begin();
-                             size_t b = std::find(_build_on.begin(), _build_on.end(), pair.second) -
-                                        _build_on.begin();
-                             return (p != probe_on.size()) && (b != _build_on.size()) && (p == b);
-                           }),
-               "Invalid values passed to columns_in_common");
-
-  if (is_trivial_join(probe, _build, probe_on, _build_on, JoinKind)) {
-    return get_empty_joined_table(probe, _build, columns_in_common, common_columns_output_side);
+  if (is_trivial_join(probe, _build, JoinKind)) {  // trivial join: return two empty vectors
+    return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
+                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  auto probe_selected = probe.select(probe_on);
-  CUDF_EXPECTS(std::equal(std::cbegin(_build_selected),
-                          std::cend(_build_selected),
-                          std::cbegin(probe_selected),
-                          std::cend(probe_selected),
+  CUDF_EXPECTS(std::equal(std::cbegin(_build),  // pairwise: build column i vs probe column i
+                          std::cend(_build),
+                          std::cbegin(probe),
+                          std::cend(probe),
                           [](const auto &b, const auto &p) { return b.type() == p.type(); }),
                "Mismatch in joining column data types");
 
-  constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN)
-                                                      ? cudf::detail::join_kind::LEFT_JOIN
-                                                      : JoinKind;
-  auto joined_indices = probe_join_indices<ProbeJoinKind>(probe_selected, compare_nulls, stream);
-  return cudf::detail::construct_join_output_df<JoinKind>(
-    probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr);
+  return probe_join_indices<JoinKind>(probe, compare_nulls, stream, mr);
 }
 
 template <cudf::detail::join_kind JoinKind>
-std::enable_if_t<JoinKind != cudf::detail::join_kind::FULL_JOIN,
-                 std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>>
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
 hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe,
                                               null_equality compare_nulls,
-                                              rmm::cuda_stream_view stream) const
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource *mr) const
 {
   // Trivial left join case - exit early
-  if (!_hash_table && JoinKind == cudf::detail::join_kind::LEFT_JOIN) {
-    return get_trivial_left_join_indices(probe, stream);
+  if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) {  // incl. FULL_JOIN
+    return get_trivial_left_join_indices(probe, stream, mr);
   }
 
   CUDF_EXPECTS(_hash_table, "Hash table of hash join is null.");
 
-  auto build_table = cudf::table_device_view::create(_build_selected, stream);
+  auto build_table = cudf::table_device_view::create(_build, stream);
   auto probe_table = cudf::table_device_view::create(probe, stream);
-  return cudf::detail::probe_join_hash_table<JoinKind>(
-    *build_table, *probe_table, *_hash_table, compare_nulls, stream);
+
+  constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN)
+                                                      ? cudf::detail::join_kind::LEFT_JOIN
+                                                      : JoinKind;  // FULL probes as LEFT
+  auto join_indices = cudf::detail::probe_join_hash_table<ProbeJoinKind>(
+    *build_table, *probe_table, *_hash_table, compare_nulls, stream, mr);
+
+  if (JoinKind == cudf::detail::join_kind::FULL_JOIN) {  // then append the complement indices
+    auto complement_indices = detail::get_left_join_indices_complement(
+      join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr);
+    join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream);
+  }
+  return join_indices;
 }
 
 }  // namespace cudf
diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh
index b37f228f6d3..aaa25e8f941 100644
--- a/cpp/src/join/hash_join.cuh
+++ b/cpp/src/join/hash_join.cuh
@@ -15,6 +15,9 @@
  */
 #pragma once
 
+#include <cudf/detail/concatenate.cuh>
+#include <cudf/detail/gather.cuh>
+#include <cudf/detail/gather.hpp>
 #include <join/join_common_utils.hpp>
 #include <join/join_kernels.cuh>
 
@@ -25,7 +28,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/sequence.h>
@@ -178,19 +181,29 @@ size_type estimate_join_output_size(table_device_view build_table,
  *
  * @param left Table of left columns to join
  * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the result
  *
  * @return Join output indices vector pair
  */
-inline std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>
-get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream)
+inline std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                 std::unique_ptr<rmm::device_uvector<size_type>>>
+get_trivial_left_join_indices(
+  table_view const& left,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  rmm::device_vector<size_type> left_indices(left.num_rows());
-  thrust::sequence(rmm::exec_policy(stream), left_indices.begin(), left_indices.end(), 0);
-  rmm::device_vector<size_type> right_indices(left.num_rows());
-  thrust::fill(rmm::exec_policy(stream), right_indices.begin(), right_indices.end(), JoinNoneValue);
+  auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+  thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0);
+  auto right_indices =  // same length as left_indices
+    std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+  thrust::fill(  // every right index is JoinNoneValue
+    rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue);
   return std::make_pair(std::move(left_indices), std::move(right_indices));
 }
 
+std::pair<std::unique_ptr<table>, std::unique_ptr<table>> get_empty_joined_table(
+  table_view const& probe, table_view const& build);
+
 std::unique_ptr<cudf::table> combine_table_pair(std::unique_ptr<cudf::table>&& left,
                                                 std::unique_ptr<cudf::table>&& right);
 
@@ -207,106 +220,52 @@ struct hash_join::hash_join_impl {
 
  private:
   cudf::table_view _build;
-  cudf::table_view _build_selected;
-  std::vector<size_type> _build_on;
   std::unique_ptr<cudf::detail::multimap_type, std::function<void(cudf::detail::multimap_type*)>>
     _hash_table;
 
  public:
   /**
-   * @brief Constructor that internally builds the hash table based on the given `build` table and
-   * column indices specified by `build_on` for subsequent probe calls.
+   * @brief Constructor that internally builds the hash table based on the given `build` table
    *
    * @throw cudf::logic_error if the number of columns in `build` table is 0.
    * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE.
-   * @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build`
-   * table.
    *
    * @param build The build table, from which the hash table is built.
-   * @param build_on The column indices from `build` to join on.
    * @param compare_nulls Controls whether null join-key values should match or not.
    */
   hash_join_impl(cudf::table_view const& build,
-                 std::vector<size_type> const& build_on,
                  null_equality compare_nulls,
                  rmm::cuda_stream_view stream = rmm::cuda_stream_default);
 
-  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> inner_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    common_columns_output_side common_columns_output_side,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
-
-  std::unique_ptr<cudf::table> left_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
-
-  std::unique_ptr<cudf::table> full_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  inner_join(cudf::table_view const& probe,
+             null_equality compare_nulls,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr) const;
+
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  left_join(cudf::table_view const& probe,
+            null_equality compare_nulls,
+            rmm::cuda_stream_view stream,
+            rmm::mr::device_memory_resource* mr) const;
+
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  full_join(cudf::table_view const& probe,
+            null_equality compare_nulls,
+            rmm::cuda_stream_view stream,
+            rmm::mr::device_memory_resource* mr) const;
 
  private:
-  /**
-   * @brief Performs hash join by probing the columns provided in `probe` as per
-   * the joining indices given in `probe_on` and returns a (`probe`, `_build`) table pair, which
-   * contains the probe and build portions of the logical joined table respectively.
-   *
-   * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
-   * (`P`, `B`) where `P` does not exist in `probe_on` or `B` does not exist in
-   * `_build_on`.
-   * @throw cudf::logic_error if `columns_in_common` contains a pair of indices
-   * (`P`, `B`) such that the location of `P` within `probe_on` is not equal to
-   * the location of `B` within `_build_on`.
-   * @throw cudf::logic_error if the number of elements in `probe_on` and
-   * `_build_on` are not equal.
-   * @throw cudf::logic_error if the number of columns in `probe` is 0.
-   * @throw cudf::logic_error if the number of rows in `probe` table exceeds MAX_JOIN_SIZE.
-   * @throw std::out_of_range if elements of `probe_on` exceed the number of columns in the `probe`
-   * table.
-   * @throw cudf::logic_error if types do not match between joining columns.
-   *
-   * @tparam JoinKind The type of join to be performed.
-   *
-   * @param probe The probe table.
-   * @param probe_on The column's indices from `probe` to join on.
-   * Column `i` from `probe_on` will be compared against column `i` of `_build_on`.
-   * @param columns_in_common is a vector of pairs of column indices into
-   * `probe` and `_build`, respectively, that are "in common". For "common"
-   * columns, only a single output column will be produced, which is gathered
-   * from `probe_on` columns. Else, for every column in `probe_on` and `_build_on`,
-   * an output column will be produced. For each of these pairs (P, B), P
-   * should exist in `probe_on` and B should exist in `_build_on`.
-   * @param common_columns_output_side @see cudf::hash_join::common_columns_output_side.
-   * @param compare_nulls Controls whether null join-key values should match or not.
-   * @param mr Device memory resource used to allocate the returned table's device memory.
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   *
-   * @return Table pair of (`probe`, `_build`) of joining both tables on the columns
-   * specified by `probe_on` and `_build_on`. The resulting table pair will be joined columns of
-   * (`probe(including common columns)`, `_build(excluding common columns)`) if
-   * `common_columns_output_side` is `PROBE`, or (`probe(excluding common columns)`,
-   * `_build(including common columns)`) if `common_columns_output_side` is `BUILD`.
-   */
   template <cudf::detail::join_kind JoinKind>
-  std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> compute_hash_join(
-    cudf::table_view const& probe,
-    std::vector<size_type> const& probe_on,
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-    common_columns_output_side common_columns_output_side,
-    null_equality compare_nulls,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const;
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
+  compute_hash_join(cudf::table_view const& probe,
+                    null_equality compare_nulls,
+                    rmm::cuda_stream_view stream,
+                    rmm::mr::device_memory_resource* mr) const;
 
   /**
    * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`,
@@ -320,15 +279,17 @@ struct hash_join::hash_join_impl {
    * @param probe_table Table of probe side columns to join.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource used to allocate the returned vectors.
    *
    * @return Join output indices vector pair.
    */
   template <cudf::detail::join_kind JoinKind>
-  std::enable_if_t<JoinKind != cudf::detail::join_kind::FULL_JOIN,
-                   std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>>
+  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+            std::unique_ptr<rmm::device_uvector<size_type>>>
   probe_join_indices(cudf::table_view const& probe,
                      null_equality compare_nulls,
-                     rmm::cuda_stream_view stream) const;
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr) const;
 };
 
 }  // namespace cudf
diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu
index ce27cfcd616..f2e4bab02c6 100644
--- a/cpp/src/join/join.cu
+++ b/cpp/src/join/join.cu
@@ -26,68 +26,102 @@
 namespace cudf {
 namespace detail {
 
-std::unique_ptr<table> inner_join(
-  table_view const& left_input,
-  table_view const& right_input,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+inner_join(table_view const& left_input,
+           table_view const& right_input,
+           null_equality compare_nulls,
+           rmm::cuda_stream_view stream,
+           rmm::mr::device_memory_resource* mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
   auto matched = cudf::dictionary::detail::match_dictionaries(
-    {left_input.select(left_on), right_input.select(right_on)},
+    {left_input, right_input},  // all columns are join keys
     stream,
     rmm::mr::get_current_device_resource());  // temporary objects returned
 
   // now rebuild the table views with the updated ones
-  auto const left  = scatter_columns(matched.second.front(), left_on, left_input);
-  auto const right = scatter_columns(matched.second.back(), right_on, right_input);
+  auto const left  = matched.second.front();
+  auto const right = matched.second.back();
 
   // For `inner_join`, we can freely choose either the `left` or `right` table to use for
   // building/probing the hash map. Because building is typically more expensive than probing, we
   // build the hash map from the smaller table.
   if (right.num_rows() > left.num_rows()) {
-    cudf::hash_join hj_obj(left, left_on, compare_nulls, stream);
-    auto actual_columns_in_common = columns_in_common;
-    std::for_each(actual_columns_in_common.begin(), actual_columns_in_common.end(), [](auto& pair) {
-      std::swap(pair.first, pair.second);
-    });
-    auto probe_build_pair = hj_obj.inner_join(right,
-                                              right_on,
-                                              actual_columns_in_common,
-                                              cudf::hash_join::common_columns_output_side::BUILD,
-                                              compare_nulls,
-                                              stream,
-                                              mr);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.second),
-                                            std::move(probe_build_pair.first));
+    cudf::hash_join hj_obj(left, compare_nulls, stream);  // build on smaller `left`
+    auto result = hj_obj.inner_join(right, compare_nulls, stream, mr);
+    return std::make_pair(std::move(result.second), std::move(result.first));  // (left, right)
   } else {
-    cudf::hash_join hj_obj(right, right_on, compare_nulls, stream);
-    auto probe_build_pair = hj_obj.inner_join(left,
-                                              left_on,
-                                              columns_in_common,
-                                              cudf::hash_join::common_columns_output_side::PROBE,
-                                              compare_nulls,
-                                              stream,
-                                              mr);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                            std::move(probe_build_pair.second));
+    cudf::hash_join hj_obj(right, compare_nulls, stream);  // build on `right`
+    return hj_obj.inner_join(left, compare_nulls, stream, mr);
   }
 }
 
-std::unique_ptr<table> left_join(
-  table_view const& left_input,
-  table_view const& right_input,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<table> inner_join(table_view const& left_input,
+                                  table_view const& right_input,
+                                  std::vector<size_type> const& left_on,
+                                  std::vector<size_type> const& right_on,
+                                  null_equality compare_nulls,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr)
+{
+  // Match the key sets of any dictionary join columns; `matched` holds the new dictionary
+  // columns (temporaries) plus updated table_views, from which `left`/`right` are rebuilt.
+  auto matched = cudf::dictionary::detail::match_dictionaries(
+    {left_input.select(left_on), right_input.select(right_on)},
+    stream,
+    rmm::mr::get_current_device_resource());  // temporary objects returned
+  auto const left  = scatter_columns(matched.second.front(), left_on, left_input);
+  auto const right = scatter_columns(matched.second.back(), right_on, right_input);
+
+  // Compute the row indices on `stream` so the gathers below are ordered after them.
+  auto join_indices = detail::inner_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
+  std::unique_ptr<table> left_result  = detail::gather(left,
+                                                      join_indices.first->begin(),
+                                                      join_indices.first->end(),
+                                                      out_of_bounds_policy::DONT_CHECK,
+                                                      stream,
+                                                      mr);
+  std::unique_ptr<table> right_result = detail::gather(right,
+                                                       join_indices.second->begin(),
+                                                       join_indices.second->end(),
+                                                       out_of_bounds_policy::DONT_CHECK,
+                                                       stream,
+                                                       mr);
+  return combine_table_pair(std::move(left_result), std::move(right_result));
+}
+
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+left_join(table_view const& left_input,
+          table_view const& right_input,
+          null_equality compare_nulls,
+          rmm::cuda_stream_view stream,
+          rmm::mr::device_memory_resource* mr)
+{
+  // Left join on all columns of both inputs; returns the pair of row-index vectors.
+  // Dictionary columns first get matched key sets (new columns plus updated table_views).
+  auto matched = cudf::dictionary::detail::match_dictionaries(
+    {left_input, right_input},  // all columns are join keys
+    stream,
+    rmm::mr::get_current_device_resource());  // temporary objects returned
+  // join on the updated views (presumably backed by the columns held in `matched`)
+  table_view const left  = matched.second.front();
+  table_view const right = matched.second.back();
+
+  cudf::hash_join hj_obj(right, compare_nulls, stream);  // `right` is always the build side
+  return hj_obj.left_join(left, compare_nulls, stream, mr);
+}
+
+std::unique_ptr<table> left_join(table_view const& left_input,
+                                 table_view const& right_input,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -99,19 +133,58 @@ std::unique_ptr<table> left_join(
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  cudf::hash_join hj_obj(right, right_on, compare_nulls, stream);
-  return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr);
+  auto join_indices = detail::left_join(  // on `stream`, so the gathers below are ordered
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
+  if ((left_on.empty() || right_on.empty()) ||  // empty result, no gather needed
+      is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) {
+    auto probe_build_pair = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
+                                            std::move(probe_build_pair.second));
+  }
+  std::unique_ptr<table> left_result  = detail::gather(left,
+                                                      join_indices.first->begin(),
+                                                      join_indices.first->end(),
+                                                      out_of_bounds_policy::NULLIFY,
+                                                      stream,
+                                                      mr);
+  std::unique_ptr<table> right_result = detail::gather(right,
+                                                       join_indices.second->begin(),
+                                                       join_indices.second->end(),
+                                                       out_of_bounds_policy::NULLIFY,
+                                                       stream,
+                                                       mr);
+  return combine_table_pair(std::move(left_result), std::move(right_result));
 }
 
-std::unique_ptr<table> full_join(
-  table_view const& left_input,
-  table_view const& right_input,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+full_join(table_view const& left_input,
+          table_view const& right_input,
+          null_equality compare_nulls,
+          rmm::cuda_stream_view stream,
+          rmm::mr::device_memory_resource* mr)
+{
+  // Full join on all columns of both inputs; returns the pair of row-index vectors.
+  // Dictionary columns first get matched key sets (new columns plus updated table_views).
+  auto matched = cudf::dictionary::detail::match_dictionaries(
+    {left_input, right_input},  // all columns are join keys
+    stream,
+    rmm::mr::get_current_device_resource());  // temporary objects returned
+  // join on the updated views (presumably backed by the columns held in `matched`)
+  table_view const left  = matched.second.front();
+  table_view const right = matched.second.back();
+
+  cudf::hash_join hj_obj(right, compare_nulls, stream);  // `right` is always the build side
+  return hj_obj.full_join(left, compare_nulls, stream, mr);
+}
+
+std::unique_ptr<table> full_join(table_view const& left_input,
+                                 table_view const& right_input,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -123,8 +196,27 @@ std::unique_ptr<table> full_join(
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  cudf::hash_join hj_obj(right, right_on, compare_nulls, stream);
-  return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, stream, mr);
+  auto join_indices = detail::full_join(  // on `stream`, so the gathers below are ordered
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
+  if ((left_on.empty() || right_on.empty()) ||  // empty result, no gather needed
+      is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) {
+    auto probe_build_pair = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
+                                            std::move(probe_build_pair.second));
+  }
+  std::unique_ptr<table> left_result  = detail::gather(left,
+                                                      join_indices.first->begin(),
+                                                      join_indices.first->end(),
+                                                      out_of_bounds_policy::NULLIFY,
+                                                      stream,
+                                                      mr);
+  std::unique_ptr<table> right_result = detail::gather(right,
+                                                       join_indices.second->begin(),
+                                                       join_indices.second->end(),
+                                                       out_of_bounds_policy::NULLIFY,
+                                                       stream,
+                                                       mr);
+  return combine_table_pair(std::move(left_result), std::move(right_result));
 }
 
 }  // namespace detail
@@ -132,90 +224,111 @@ std::unique_ptr<table> full_join(
 hash_join::~hash_join() = default;
 
 hash_join::hash_join(cudf::table_view const& build,
-                     std::vector<size_type> const& build_on,
                      null_equality compare_nulls,
                      rmm::cuda_stream_view stream)
-  : impl{std::make_unique<const hash_join::hash_join_impl>(build, build_on, compare_nulls, stream)}
+  : impl{std::make_unique<const hash_join::hash_join_impl>(build, compare_nulls, stream)}
 {
 }
 
-std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> hash_join::inner_join(
-  cudf::table_view const& probe,
-  std::vector<size_type> const& probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-  common_columns_output_side common_columns_output_side,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::inner_join(cudf::table_view const& probe,
+                      null_equality compare_nulls,
+                      rmm::cuda_stream_view stream,
+                      rmm::mr::device_memory_resource* mr) const
 {
-  return impl->inner_join(
-    probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr);
+  return impl->inner_join(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::left_join(
-  cudf::table_view const& probe,
-  std::vector<size_type> const& probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::left_join(cudf::table_view const& probe,
+                     null_equality compare_nulls,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr) const
 {
-  return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr);
+  return impl->left_join(probe, compare_nulls, stream, mr);
 }
 
-std::unique_ptr<cudf::table> hash_join::full_join(
-  cudf::table_view const& probe,
-  std::vector<size_type> const& probe_on,
-  std::vector<std::pair<cudf::size_type, cudf::size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+hash_join::full_join(cudf::table_view const& probe,
+                     null_equality compare_nulls,
+                     rmm::cuda_stream_view stream,
+                     rmm::mr::device_memory_resource* mr) const
 {
-  return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr);
+  return impl->full_join(probe, compare_nulls, stream, mr);
 }
 
 // external APIs
 
-std::unique_ptr<table> inner_join(
-  table_view const& left,
-  table_view const& right,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+inner_join(table_view const& left,
+           table_view const& right,
+           null_equality compare_nulls,
+           rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::inner_join(left, right, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<table> inner_join(table_view const& left,
+                                  table_view const& right,
+                                  std::vector<size_type> const& left_on,
+                                  std::vector<size_type> const& right_on,
+                                  null_equality compare_nulls,
+                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::inner_join(
-    left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<table> left_join(
-  table_view const& left,
-  table_view const& right,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+left_join(table_view const& left,
+          table_view const& right,
+          null_equality compare_nulls,
+          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::left_join(left, right, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<table> left_join(table_view const& left,
+                                 table_view const& right,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_join(
-    left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+full_join(table_view const& left,
+          table_view const& right,
+          null_equality compare_nulls,
+          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::full_join(left, right, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<table> full_join(
-  table_view const& left,
-  table_view const& right,
-  std::vector<size_type> const& left_on,
-  std::vector<size_type> const& right_on,
-  std::vector<std::pair<size_type, size_type>> const& columns_in_common,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<table> full_join(table_view const& left,
+                                 table_view const& right,
+                                 std::vector<size_type> const& left_on,
+                                 std::vector<size_type> const& right_on,
+                                 null_equality compare_nulls,
+                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::full_join(
-    left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index f0c158c1ef6..9312704f065 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -19,6 +19,8 @@
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
+#include <rmm/device_uvector.hpp>
+
 #include <hash/concurrent_unordered_multimap.cuh>
 
 #include <limits>
@@ -29,9 +31,10 @@ constexpr size_type MAX_JOIN_SIZE{std::numeric_limits<size_type>::max()};
 
 constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128;
 constexpr int DEFAULT_JOIN_CACHE_SIZE = 128;
-constexpr size_type JoinNoneValue     = -1;
+constexpr size_type JoinNoneValue     = std::numeric_limits<size_type>::min();
 
-using VectorPair = std::pair<rmm::device_vector<size_type>, rmm::device_vector<size_type>>;
+using VectorPair = std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                             std::unique_ptr<rmm::device_uvector<size_type>>>;
 
 using multimap_type =
   concurrent_unordered_multimap<hash_value_type,
@@ -49,14 +52,10 @@ using row_equality = cudf::row_equality_comparator<true>;
 
 enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };
 
-inline bool is_trivial_join(table_view const& left,
-                            table_view const& right,
-                            std::vector<size_type> const& left_on,
-                            std::vector<size_type> const& right_on,
-                            join_kind join_type)
+inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type)
 {
   // If there is nothing to join, then send empty table with all columns
-  if (left_on.empty() || right_on.empty()) { return true; }
+  if (left.is_empty() || right.is_empty()) { return true; }
 
   // If left join and the left table is empty, return immediately
   if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; }
diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu
index 9d046f9983c..80a1ef9e204 100644
--- a/cpp/src/join/semi_join.cu
+++ b/cpp/src/join/semi_join.cu
@@ -17,20 +17,130 @@
 #include <hash/concurrent_unordered_map.cuh>
 #include <join/join_common_utils.hpp>
 
+#include <thrust/distance.h>
+
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sequence.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/join.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/device_vector.hpp>
 #include <rmm/exec_policy.hpp>
 
 namespace cudf {
 namespace detail {
+
+/**
+ * @brief Computes the gather map of a left semi or left anti join over two key tables.
+ *
+ * Every row index of `right_keys` is inserted into a hash map; each row index of `left_keys`
+ * is then looked up in that map and kept when it is found (LEFT_SEMI_JOIN) or when it is not
+ * found (LEFT_ANTI_JOIN).
+ *
+ * An empty vector is returned when `is_trivial_join` reports a trivial join. A LEFT_ANTI_JOIN
+ * against a right table with no rows returns every row index of `left_keys`.
+ *
+ * Fails a `CUDF_EXPECTS` check if either key table has no columns.
+ *
+ * @tparam    JoinKind      Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN
+ *
+ * @param[in] left_keys     Key columns of the left table
+ * @param[in] right_keys    Key columns of the right table
+ * @param[in] compare_nulls Controls whether null join-key values should match or not.
+ * @param[in] stream        CUDA stream used for device memory operations and kernel launches
+ * @param[in] mr            Device memory resource used to allocate the returned vector
+ *
+ * @returns                 Row indices of `left_keys` selected by the join
+ */
+template <join_kind JoinKind>
+std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_anti_join(
+  cudf::table_view const& left_keys,
+  cudf::table_view const& right_keys,
+  null_equality compare_nulls,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty");
+  CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty");
+
+  if (is_trivial_join(left_keys, right_keys, JoinKind)) {
+    return std::make_unique<rmm::device_uvector<cudf::size_type>>(0, stream, mr);
+  }
+  if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) {
+    // No right row can match, so the anti join keeps every left row: indices 0, 1, 2, ...
+    auto result =
+      std::make_unique<rmm::device_uvector<cudf::size_type>>(left_keys.num_rows(), stream, mr);
+    thrust::sequence(rmm::exec_policy(stream), result->begin(), result->end());
+    return result;
+  }
+
+  auto const left_num_rows  = left_keys.num_rows();
+  auto const right_num_rows = right_keys.num_rows();
+
+  // Only care about existence, so we'll use an unordered map (other joins need a multimap)
+  using hash_table_type = concurrent_unordered_map<cudf::size_type, bool, row_hash, row_equality>;
+
+  // Create hash table containing all keys found in right table
+  auto right_rows_d            = table_device_view::create(right_keys, stream);
+  size_t const hash_table_size = compute_hash_table_size(right_num_rows);
+  row_hash hash_build{*right_rows_d};
+  row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
+
+  // Going to join it with left table
+  auto left_rows_d = table_device_view::create(left_keys, stream);
+  row_hash hash_probe{*left_rows_d};
+  row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
+
+  auto hash_table_ptr = hash_table_type::create(hash_table_size,
+                                                stream,
+                                                std::numeric_limits<bool>::max(),
+                                                std::numeric_limits<cudf::size_type>::max(),
+                                                hash_build,
+                                                equality_build);
+  auto hash_table     = *hash_table_ptr;
+
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator<size_type>(0),
+                     right_num_rows,
+                     [hash_table] __device__(size_type idx) mutable {
+                       hash_table.insert(thrust::make_pair(idx, true));
+                     });
+
+  //
+  // Now we have a hash table, we need to iterate over the rows of the left table
+  // and check to see if they are contained in the hash table
+  //
+
+  // For semi join we want contains to be true, for anti join we want contains to be false
+  bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN);
+
+  auto gather_map =
+    std::make_unique<rmm::device_uvector<cudf::size_type>>(left_num_rows, stream, mr);
+
+  // gather_map_end will be the end of valid data in gather_map
+  auto gather_map_end = thrust::copy_if(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(left_num_rows),
+    gather_map->begin(),
+    [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) {
+      auto pos = hash_table.find(idx, hash_probe, equality_probe);
+      return (pos != hash_table.end()) == join_type_boolean;
+    });
+
+  // Drop the unused tail: only the indices written by copy_if are part of the result
+  auto join_size = thrust::distance(gather_map->begin(), gather_map_end);
+  gather_map->resize(join_size, stream);
+  return gather_map;
+}
+
 /**
  * @brief  Performs a left semi or anti join on the specified columns of two
  * tables (left, right)
@@ -57,8 +143,6 @@ namespace detail {
  *                             The column from `right` indicated by `right_on[i]`
  *                             will be compared against the column from `left`
  *                             indicated by `left_on[i]`.
- * @param[in] return_columns   A vector of column indices from `left` to
- *                             include in the returned table.
  * @param[in] compare_nulls    Controls whether null join-key values should match or not.
  * @param[in] mr               Device memory resource to used to allocate the returned table's
  *                             device memory
@@ -66,8 +150,7 @@ namespace detail {
  * @tparam    join_kind        Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN
  *
  * @returns                    Result of joining `left` and `right` tables on the columns
- *                             specified by `left_on` and `right_on`. The resulting table
- *                             will contain `return_columns` from `left` that match in right.
+ *                             specified by `left_on` and `right_on`.
  */
 template <join_kind JoinKind>
 std::unique_ptr<cudf::table> left_semi_anti_join(
@@ -75,27 +158,19 @@ std::unique_ptr<cudf::table> left_semi_anti_join(
   cudf::table_view const& right,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
-  std::vector<cudf::size_type> const& return_columns,
   null_equality compare_nulls,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty");
-  CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty");
   CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on");
 
-  if (0 == return_columns.size()) { return empty_like(left.select(return_columns)); }
-
-  if (is_trivial_join(left, right, left_on, right_on, JoinKind)) {
-    return empty_like(left.select(return_columns));
+  if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) {
+    return empty_like(left);
   }
 
-  auto const left_num_rows  = left.num_rows();
-  auto const right_num_rows = right.num_rows();
-
-  if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_num_rows)) {
+  if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) {
     // Everything matches, just copy the proper columns from the left table
-    return std::make_unique<table>(left.select(return_columns), stream, mr);
+    return std::make_unique<table>(left, stream, mr);
   }
 
   // Make sure any dictionary columns have matched key sets.
@@ -108,91 +183,64 @@ std::unique_ptr<cudf::table> left_semi_anti_join(
   auto const left_selected  = matched.second.front();
   auto const right_selected = matched.second.back();
 
-  // Only care about existence, so we'll use an unordered map (other joins need a multimap)
-  using hash_table_type = concurrent_unordered_map<cudf::size_type, bool, row_hash, row_equality>;
-
-  // Create hash table containing all keys found in right table
-  auto right_rows_d            = table_device_view::create(right_selected, stream);
-  size_t const hash_table_size = compute_hash_table_size(right_num_rows);
-  row_hash hash_build{*right_rows_d};
-  row_equality equality_build{*right_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
-
-  // Going to join it with left table
-  auto left_rows_d = table_device_view::create(left_selected, stream);
-  row_hash hash_probe{*left_rows_d};
-  row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL};
-
-  auto hash_table_ptr = hash_table_type::create(hash_table_size,
-                                                stream,
-                                                std::numeric_limits<bool>::max(),
-                                                std::numeric_limits<cudf::size_type>::max(),
-                                                hash_build,
-                                                equality_build);
-  auto hash_table     = *hash_table_ptr;
-
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     right_num_rows,
-                     [hash_table] __device__(size_type idx) mutable {
-                       hash_table.insert(thrust::make_pair(idx, true));
-                     });
-
-  //
-  // Now we have a hash table, we need to iterate over the rows of the left table
-  // and check to see if they are contained in the hash table
-  //
+  auto gather_map =
+    left_semi_anti_join<JoinKind>(left_selected, right_selected, compare_nulls, stream);
 
-  // For semi join we want contains to be true, for anti join we want contains to be false
-  bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN);
-
-  rmm::device_vector<size_type> gather_map(left_num_rows);
-
-  // gather_map_end will be the end of valid data in gather_map
-  auto gather_map_end = thrust::copy_if(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(left_num_rows),
-    gather_map.begin(),
-    [hash_table, join_type_boolean, hash_probe, equality_probe] __device__(size_type idx) {
-      auto pos = hash_table.find(idx, hash_probe, equality_probe);
-      return (pos != hash_table.end()) == join_type_boolean;
-    });
-
-  // rebuild left table for call to gather
   auto const left_updated = scatter_columns(left_selected, left_on, left);
-  return cudf::detail::gather(left_updated.select(return_columns),
-                              gather_map.begin(),
-                              gather_map_end,
+  return cudf::detail::gather(left_updated,
+                              gather_map->begin(),
+                              gather_map->end(),
                               out_of_bounds_policy::DONT_CHECK,
                               stream,
                               mr);
 }
+
 }  // namespace detail
 
 std::unique_ptr<cudf::table> left_semi_join(cudf::table_view const& left,
                                             cudf::table_view const& right,
                                             std::vector<cudf::size_type> const& left_on,
                                             std::vector<cudf::size_type> const& right_on,
-                                            std::vector<cudf::size_type> const& return_columns,
                                             null_equality compare_nulls,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_semi_anti_join<detail::join_kind::LEFT_SEMI_JOIN>(
-    left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_join(
+  cudf::table_view const& left,
+  cudf::table_view const& right,
+  null_equality compare_nulls,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::left_semi_anti_join<detail::join_kind::LEFT_SEMI_JOIN>(
+    left, right, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<cudf::table> left_anti_join(cudf::table_view const& left,
                                             cudf::table_view const& right,
                                             std::vector<cudf::size_type> const& left_on,
                                             std::vector<cudf::size_type> const& right_on,
-                                            std::vector<cudf::size_type> const& return_columns,
                                             null_equality compare_nulls,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_semi_anti_join<detail::join_kind::LEFT_ANTI_JOIN>(
-    left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr);
+    left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_anti_join(
+  cudf::table_view const& left,
+  cudf::table_view const& right,
+  null_equality compare_nulls,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::left_semi_anti_join<detail::join_kind::LEFT_ANTI_JOIN>(
+    left, right, compare_nulls, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index efc5330ea7d..32192234c56 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -33,11 +33,15 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <limits>
+
 template <typename T>
 using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;
 using strcol_wrapper = cudf::test::strings_column_wrapper;
 using CVector        = std::vector<std::unique_ptr<cudf::column>>;
 using Table          = cudf::table;
+constexpr cudf::size_type NoneValue =
+  std::numeric_limits<cudf::size_type>::min();  // TODO: duplicates non-public JoinNoneValue
 
 struct JoinTest : public cudf::test::BaseFixture {
 };
@@ -58,58 +62,11 @@ TEST_F(JoinTest, EmptySentinelRepro)
   cudf::table_view left({left_first_col, left_second_col, left_third_col});
   cudf::table_view right({right_first_col, right_second_col, right_third_col});
 
-  auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}, {{0, 0}, {1, 1}, {2, 2}});
+  auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2});
 
   EXPECT_EQ(result->num_rows(), 1);
 }
 
-TEST_F(JoinTest, InvalidCommonColumnIndices)
-{
-  column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 3}};
-  column_wrapper<int32_t> col0_1{{0, 1, 2, 4, 1}};
-
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}};
-
-  CVector cols0, cols1;
-  cols0.push_back(col0_0.release());
-  cols0.push_back(col0_1.release());
-  cols1.push_back(col1_0.release());
-  cols1.push_back(col1_1.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  EXPECT_THROW(cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 1}, {1, 0}}), cudf::logic_error);
-}
-
-TEST_F(JoinTest, FullJoinNoCommon)
-{
-  column_wrapper<int32_t> col0_0{{0, 1}};
-  column_wrapper<int32_t> col1_0{{0, 2}};
-  CVector cols0, cols1;
-  cols0.push_back(col0_0.release());
-  cols1.push_back(col1_0.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  column_wrapper<int32_t> exp_col0_0{{0, 1, -1}, {1, 1, 0}};
-  column_wrapper<int32_t> exp_col0_1{{0, -1, 2}, {1, 0, 1}};
-  CVector exp_cols;
-  exp_cols.push_back(exp_col0_0.release());
-  exp_cols.push_back(exp_col0_1.release());
-  Table gold(std::move(exp_cols));
-
-  auto result            = cudf::full_join(t0, t1, {0}, {0}, {});
-  auto result_sort_order = cudf::sorted_order(result->view());
-  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-  auto gold_sort_order = cudf::sorted_order(gold.view());
-  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-}
-
 TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 3}};
@@ -131,7 +88,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0}, {0}, {});
+  auto result            = cudf::left_join(t0, t1, {0}, {0});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
@@ -153,7 +110,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, FullJoinNoNulls)
@@ -177,24 +134,32 @@ TEST_F(JoinTest, FullJoinNoNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{2, 2, 0, 4, 3, 3, 1, 2, 0}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"});
-  column_wrapper<int32_t> col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""},
+                            {1, 1, 1, 1, 1, 0, 0, 0, 0});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+  strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"},
+                            {0, 0, 0, 0, 1, 1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, FullJoinWithNulls)
@@ -218,24 +183,32 @@ TEST_F(JoinTest, FullJoinWithNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{2, 2, 0, -1, 3, 3, 1, 2, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"});
-  column_wrapper<int32_t> col_gold_2{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""},
+                            {1, 1, 1, 1, 1, 0, 0, 0, 0});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 0}};
+  strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"},
+                            {0, 0, 0, 0, 1, 1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, FullJoinOnNulls)
@@ -262,7 +235,7 @@ TEST_F(JoinTest, FullJoinOnNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
@@ -273,20 +246,26 @@ TEST_F(JoinTest, FullJoinOnNulls)
   cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t");
   cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t");
 #endif
- 
-  column_wrapper<int32_t> col_gold_0{{   2,    5,    3,    -1},
-                                     {   1,    1,    1,     0}};
-  strcol_wrapper          col_gold_1({ "s1", "s0", "s0",  "s1"});
-  column_wrapper<int32_t> col_gold_2{{  -1,   -1,    0,     1}, 
-                                     {   0,    0,    1,     1}};
-  column_wrapper<int32_t> col_gold_3{{   1,    4,    2,     8}, 
-                                     {   1,    1,    1,     1}};
+
+  column_wrapper<int32_t> col_gold_0{{   3,   -1,   -1,    -1},
+                                     {   1,    0,    0,     0}};
+  strcol_wrapper          col_gold_1{{ "s0", "s1",  "",    ""},
+                                     {   1,    1,    0,     0}};
+  column_wrapper<int32_t> col_gold_2{{   0,    1,   -1,    -1},
+                                     {   1,    1,    0,     0}};
+  column_wrapper<int32_t> col_gold_3{{   3,   -1,    2,     5},
+                                     {   1,    0,    1,     1}};
+  strcol_wrapper          col_gold_4{{ "s0", "s1", "s1",  "s0"}};
+  column_wrapper<int32_t> col_gold_5{{   2,    8,    1,     4}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+  
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
@@ -300,22 +279,27 @@ TEST_F(JoinTest, FullJoinOnNulls)
   cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t");
 #endif
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 
   // Repeat test with compare_nulls_equal=false,
   // as per SQL standard.
 
-  result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL);
+  result            = cudf::full_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL);
   result_sort_order = cudf::sorted_order(result->view());
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  col_gold_0 =               {{   2,    5,    3,    -1,   -1},
-                              {   1,    1,    1,     0,    0}};
-  col_gold_1 = strcol_wrapper({ "s1", "s0", "s0",  "s1", "s1"});
-  col_gold_2 =               {{  -1,   -1,    0,    -1,    1}, 
-                              {   0,    0,    1,     0,    1}};
-  col_gold_3 =               {{   1,    4,    2,     8,   -1}, 
-                              {   1,    1,    1,     1,    0}};
+  col_gold_0 =               {{   3,   -1,   -1,    -1,   -1},
+                              {   1,    0,    0,     0,    0}};
+  col_gold_1 = strcol_wrapper{{ "s0", "s1",   "",    "",   ""},
+                              {   1,    1,    0,     0,    0}};
+  col_gold_2 =               {{   0,    1,   -1,    -1,   -1},
+                              {   1,    1,    0,     0,    0}};
+  col_gold_3 =               {{   3,   -1,    2,     5,   -1},
+                              {   1,    0,    1,     1,    0}};
+  col_gold_4 = strcol_wrapper{{ "s0",  "",  "s1",  "s0",  "s1"},
+                              {   1,    0,    1,     1,    1}};
+  col_gold_5 =               {{   2,   -1,    1,     4,    8},
+                              {   1,    0,    1,     1,    1}};
 
   // clang-format on
 
@@ -324,23 +308,26 @@ TEST_F(JoinTest, FullJoinOnNulls)
   cols_gold_nulls_unequal.push_back(col_gold_1.release());
   cols_gold_nulls_unequal.push_back(col_gold_2.release());
   cols_gold_nulls_unequal.push_back(col_gold_3.release());
+  cols_gold_nulls_unequal.push_back(col_gold_4.release());
+  cols_gold_nulls_unequal.push_back(col_gold_5.release());
+
   Table gold_nulls_unequal{std::move(cols_gold_nulls_unequal)};
 
   gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view());
   sorted_gold     = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, LeftJoinNoNulls)
 {
-  column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 3}};
+  column_wrapper<int32_t> col0_0({3, 1, 2, 0, 3});
   strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"});
-  column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
+  column_wrapper<int32_t> col0_2({0, 1, 2, 4, 1});
 
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
+  column_wrapper<int32_t> col1_0({2, 2, 0, 4, 3});
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}};
+  column_wrapper<int32_t> col1_2({1, 0, 1, 2, 1});
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -353,30 +340,34 @@ TEST_F(JoinTest, LeftJoinNoNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0({3, 1, 2, 0, 3});
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"});
+  column_wrapper<int32_t> col_gold_2({0, 1, 2, 4, 1});
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3}, {0, 0, 0, 0, 1}};
+  strcol_wrapper col_gold_4{{"", "", "", "", "s1"}, {0, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1}, {0, 0, 0, 0, 1}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, LeftJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
@@ -394,24 +385,29 @@ TEST_F(JoinTest, LeftJoinWithNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 2, 1, 2, 0}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s1", "", "s4"}, {1, 1, 1, 0, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 1, 2, 4}, {1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}};
+  strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1}, {1, 1, 1, 1, 1}};
+  column_wrapper<int32_t> col_gold_3{{3, -1, -1, -1, 2}, {1, 0, 0, 0, 1}};
+  strcol_wrapper col_gold_4{{"s1", "", "", "", "s0"}, {1, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_5{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, LeftJoinOnNulls)
@@ -438,7 +434,7 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
@@ -449,21 +445,27 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   cudf::test::print(sorted_result->get_column(2).view(), std::cout, ",\t\t");
   cudf::test::print(sorted_result->get_column(3).view(), std::cout, ",\t\t");
 #endif
- 
+
   column_wrapper<int32_t> col_gold_0{{   3,    -1,    2},
                                      {   1,     0,    1}};
   strcol_wrapper          col_gold_1({ "s0",  "s1", "s2"},
                                      {   1,     1,    1});
-  column_wrapper<int32_t> col_gold_2{{   0,     1,    2}, 
+  column_wrapper<int32_t> col_gold_2{{   0,     1,    2},
                                      {   1,     1,    1}};
-  column_wrapper<int32_t> col_gold_3{{   2,     8,   -1}, 
+  column_wrapper<int32_t> col_gold_3{{   3,    -1,   -1},
+                                     {   1,     0,    0}};
+  strcol_wrapper          col_gold_4({ "s0",  "s1",  ""},
+                                     {   1,     1,    0});
+  column_wrapper<int32_t> col_gold_5{{   2,     8,   -1},
                                      {   1,     1,    0}};
-
+  
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
@@ -477,23 +479,28 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   cudf::test::print(sorted_gold->get_column(3).view(), std::cout, ",\t\t");
 #endif
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 
   // Repeat test with compare_nulls_equal=false,
   // as per SQL standard.
 
-  result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL);
+  result            = cudf::left_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL);
   result_sort_order = cudf::sorted_order(result->view());
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  col_gold_0 =               {{   3,    -1,    2},
-                              {   1,     0,    1}};
-  col_gold_1 = strcol_wrapper({ "s0",  "s1", "s2"},
-                              {   1,     1,    1});
-  col_gold_2 =               {{   0,     1,    2}, 
-                              {   1,     1,    1}};
-  col_gold_3 =               {{   2,    -1,   -1}, 
-                              {   1,     0,    0}};
+  
+  col_gold_0 = {{   3,    -1,    2},
+                {   1,     0,    1}};
+  col_gold_1 = {{ "s0",  "s1", "s2"},
+                {   1,     1,    1}};
+  col_gold_2 = {{   0,     1,    2},
+                {   1,     1,    1}};
+  col_gold_3 = {{   3,    -1,   -1},
+                {   1,     0,    0}};
+  col_gold_4 = {{ "s0",   "",   ""},
+                {   1,     0,    0}};
+  col_gold_5 = {{   2,    -1,   -1},
+                {   1,     0,    0}};
 
   // clang-format on
   CVector cols_gold_nulls_unequal;
@@ -506,7 +513,7 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   gold_sort_order = cudf::sorted_order(gold_nulls_unequal.view());
   sorted_gold     = cudf::gather(gold_nulls_unequal.view(), *gold_sort_order);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, InnerJoinSizeOverflow)
@@ -529,7 +536,7 @@ TEST_F(JoinTest, InnerJoinSizeOverflow)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}}), cudf::logic_error);
+  EXPECT_THROW(cudf::inner_join(t0, t1, {0}, {0}), cudf::logic_error);
 }
 
 TEST_F(JoinTest, InnerJoinNoNulls)
@@ -553,86 +560,28 @@ TEST_F(JoinTest, InnerJoinNoNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{3, 2, 2}};
   strcol_wrapper col_gold_1({"s1", "s0", "s0"});
   column_wrapper<int32_t> col_gold_2{{0, 2, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 0}};
+  column_wrapper<int32_t> col_gold_3{{3, 2, 2}};
+  strcol_wrapper col_gold_4({"s1", "s0", "s0"});
+  column_wrapper<int32_t> col_gold_5{{1, 0, 0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-}
-
-TEST_F(JoinTest, InnerJoinNonAlignedCommon)
-{
-  CVector cols0, cols1;
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-  cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
-  cols1.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3}}.release());
-  cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1"}).release());
-  cols1.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 1}}.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  auto result            = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}});
-  auto result_sort_order = cudf::sorted_order(result->view());
-  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-  CVector cols_gold;
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-  Table gold(std::move(cols_gold));
-
-  auto gold_sort_order = cudf::sorted_order(gold.view());
-  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-}
-
-TEST_F(JoinTest, InnerJoinNonAlignedCommonSwap)
-{
-  CVector cols0, cols1;
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-  cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-  cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
-  cols1.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3, 5}}.release());
-  cols1.emplace_back(strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0"}).release());
-  cols1.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 1, 0}}.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  auto result            = cudf::inner_join(t0, t1, {1, 2}, {0, 1}, {{1, 0}, {2, 1}});
-  auto result_sort_order = cudf::sorted_order(result->view());
-  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-  CVector cols_gold;
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-  cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-  cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-  Table gold(std::move(cols_gold));
-
-  auto gold_sort_order = cudf::sorted_order(gold.view());
-  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, InnerJoinWithNulls)
@@ -656,37 +605,41 @@ TEST_F(JoinTest, InnerJoinWithNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
   strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
   column_wrapper<int32_t> col_gold_2{{0, 1}};
-  column_wrapper<int32_t> col_gold_3{{1, -1}, {1, 0}};
+  column_wrapper<int32_t> col_gold_3{{3, 2}};
+  strcol_wrapper col_gold_4({"s1", "s0"}, {1, 1});
+  column_wrapper<int32_t> col_gold_5{{1, -1}, {1, 0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
-// Test to check join behaviour when join keys are null.
+// Test to check join behaviour when join keys are null.
 TEST_F(JoinTest, InnerJoinOnNulls)
 {
   // clang-format off
   column_wrapper<int32_t> col0_0{{  3,    1,    2,    0,    2}};
-  strcol_wrapper          col0_1({"s1", "s1", "s8", "s4", "s0"}, 
+  strcol_wrapper          col0_1({"s1", "s1", "s8", "s4", "s0"},
                                  {  1,    1,    0,    1,    1});
   column_wrapper<int32_t> col0_2{{  0,    1,    2,    4,    1}};
 
   column_wrapper<int32_t> col1_0{{  2,    2,    0,    4,    3}};
-  strcol_wrapper          col1_1({"s1", "s0", "s1", "s2", "s1"}, 
+  strcol_wrapper          col1_1({"s1", "s0", "s1", "s2", "s1"},
                                  {  1,    0,    1,    1,    1});
   column_wrapper<int32_t> col1_2{{  1,    0,    1,    2,    1}};
 
@@ -701,38 +654,47 @@ TEST_F(JoinTest, InnerJoinOnNulls)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0 {{  3,    2}};
-  strcol_wrapper          col_gold_1 ({"s1", "s0"}, 
+  strcol_wrapper          col_gold_1 ({"s1", "s0"},
                                       {  1,    0});
   column_wrapper<int32_t> col_gold_2{{   0,    2}};
-  column_wrapper<int32_t> col_gold_3{{   1,    0}};
+  column_wrapper<int32_t> col_gold_3 {{  3,    2}};
+  strcol_wrapper          col_gold_4 ({"s1", "s0"},
+                                      {  1,    0});
+  column_wrapper<int32_t> col_gold_5{{   1,    0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
   cols_gold.push_back(col_gold_2.release());
   cols_gold.push_back(col_gold_3.release());
+  cols_gold.push_back(col_gold_4.release());
+  cols_gold.push_back(col_gold_5.release());
+  
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-  
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
+
   // Repeat test with compare_nulls_equal=false,
   // as per SQL standard.
 
-  result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}}, cudf::null_equality::UNEQUAL);
+  result            = cudf::inner_join(t0, t1, {0, 1}, {0, 1},  cudf::null_equality::UNEQUAL);
   result_sort_order = cudf::sorted_order(result->view());
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   col_gold_0 =               {{  3}};
-  col_gold_1 = strcol_wrapper({"s1"}, 
+  col_gold_1 = strcol_wrapper({"s1"},
                               {  1});
   col_gold_2 =               {{  0}};
-  col_gold_3 =               {{  1}};
+  col_gold_3 =               {{  3}};
+  col_gold_4 = strcol_wrapper({"s1"},
+                              {  1});
+  col_gold_5 =               {{  1}};
 
   // clang-format on
 
@@ -741,11 +703,13 @@ TEST_F(JoinTest, InnerJoinOnNulls)
   cols_gold_sql.push_back(col_gold_1.release());
   cols_gold_sql.push_back(col_gold_2.release());
   cols_gold_sql.push_back(col_gold_3.release());
+  cols_gold_sql.push_back(col_gold_4.release());
+  cols_gold_sql.push_back(col_gold_5.release());
   Table gold_sql(std::move(cols_gold_sql));
 
   gold_sort_order = cudf::sorted_order(gold_sql.view());
   sorted_gold     = cudf::gather(gold_sql.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 // Empty Left Table
@@ -766,8 +730,8 @@ TEST_F(JoinTest, EmptyLeftTableInnerJoin)
   Table empty0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result);
+  auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result);
 }
 
 TEST_F(JoinTest, EmptyLeftTableLeftJoin)
@@ -787,36 +751,8 @@ TEST_F(JoinTest, EmptyLeftTableLeftJoin)
   Table empty0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty0, *result);
-}
-
-TEST_F(JoinTest, EmptyLeftTableLeftJoinNonAlignedCommon)
-{
-  column_wrapper<int32_t> col0_0;
-
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
-
-  CVector cols0, cols1;
-  cols0.emplace_back(col0_0.release());
-  cols1.emplace_back(col1_0.release());
-  cols1.emplace_back(col1_1.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  column_wrapper<int32_t> col_gold_0;
-  column_wrapper<int32_t> col_gold_1;
-
-  CVector cols_gold;
-  cols_gold.emplace_back(col_gold_0.release());
-  cols_gold.emplace_back(col_gold_1.release());
-
-  Table gold(std::move(cols_gold));
-
-  auto result = cudf::left_join(t0, t1, {0}, {1}, {{0, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result);
 }
 
 TEST_F(JoinTest, EmptyLeftTableFullJoin)
@@ -833,11 +769,29 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin)
   cols1.push_back(col1_0.release());
   cols1.push_back(col1_1.release());
 
-  Table empty0(std::move(cols0));
-  Table t1(std::move(cols1));
+  Table lhs(std::move(cols0));
+  Table rhs(std::move(cols1));
+
+  auto result            = cudf::full_join(lhs, rhs, {0, 1}, {0, 1});
+  auto result_sort_order = cudf::sorted_order(result->view());
+  auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+
+  column_wrapper<int32_t> col_gold_0{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_1{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_2{{2, 2, 0, 4, 3}};
+  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
 
-  auto result = cudf::full_join(empty0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(t1, *result);
+  CVector cols_gold;
+  cols_gold.push_back(col_gold_0.release());
+  cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
+  Table gold(std::move(cols_gold));
+
+  auto gold_sort_order = cudf::sorted_order(gold.view());
+  auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 // Empty Right Table
@@ -858,36 +812,8 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
-}
-
-TEST_F(JoinTest, EmptyRightTableInnerJoinNonAlignedCommon)
-{
-  column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
-
-  column_wrapper<int32_t> col1_0;
-
-  CVector cols0, cols1;
-  cols0.emplace_back(col0_0.release());
-  cols0.emplace_back(col0_1.release());
-  cols1.emplace_back(col1_0.release());
-
-  Table t0(std::move(cols0));
-  Table t1(std::move(cols1));
-
-  column_wrapper<int32_t> col_gold_0;
-  column_wrapper<int32_t> col_gold_1;
-
-  CVector cols_gold;
-  cols_gold.emplace_back(col_gold_0.release());
-  cols_gold.emplace_back(col_gold_1.release());
-
-  Table gold(std::move(cols_gold));
-
-  auto result = cudf::inner_join(t0, t1, {1}, {0}, {{1, 0}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
 TEST_F(JoinTest, EmptyRightTableLeftJoin)
@@ -907,8 +833,8 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result);
+  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
 }
 
 TEST_F(JoinTest, EmptyRightTableFullJoin)
@@ -928,8 +854,8 @@ TEST_F(JoinTest, EmptyRightTableFullJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(t0, *result);
+  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result);
 }
 
 // Both tables empty
@@ -950,8 +876,8 @@ TEST_F(JoinTest, BothEmptyInnerJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
+  auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
 TEST_F(JoinTest, BothEmptyLeftJoin)
@@ -971,8 +897,8 @@ TEST_F(JoinTest, BothEmptyLeftJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
+  auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
 TEST_F(JoinTest, BothEmptyFullJoin)
@@ -992,11 +918,11 @@ TEST_F(JoinTest, BothEmptyFullJoin)
   Table t0(std::move(cols0));
   Table empty1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(empty1, *result);
+  auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result);
 }
 
-// EqualValues X Inner,Left,Full
+// EqualValues X Inner,Left,Full
 
 TEST_F(JoinTest, EqualValuesInnerJoin)
 {
@@ -1015,16 +941,22 @@ TEST_F(JoinTest, EqualValuesInnerJoin)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
 
   column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}};
   strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"});
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
+
   Table gold(std::move(cols_gold));
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result);
 }
 
 TEST_F(JoinTest, EqualValuesLeftJoin)
@@ -1044,16 +976,21 @@ TEST_F(JoinTest, EqualValuesLeftJoin)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1});
 
   column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}};
   strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}, {1, 1, 1, 1}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1});
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
   Table gold(std::move(cols_gold));
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result);
 }
 
 TEST_F(JoinTest, EqualValuesFullJoin)
@@ -1073,16 +1010,21 @@ TEST_F(JoinTest, EqualValuesFullJoin)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1});
 
   column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}};
   strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"});
+
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
+  cols_gold.push_back(col_gold_2.release());
+  cols_gold.push_back(col_gold_3.release());
   Table gold(std::move(cols_gold));
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(gold, *result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(gold, *result);
 }
 
 TEST_F(JoinTest, InnerJoinCornerCase)
@@ -1097,18 +1039,20 @@ TEST_F(JoinTest, InnerJoinCornerCase)
   Table t0(std::move(cols0));
   Table t1(std::move(cols1));
 
-  auto result            = cudf::inner_join(t0, t1, {0}, {0}, {{0, 0}});
+  auto result            = cudf::inner_join(t0, t1, {0}, {0});
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int64_t> col_gold_0{{2, 2, 2, 2}};
+  column_wrapper<int64_t> col_gold_1{{2, 2, 2, 2}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
+  cols_gold.push_back(col_gold_1.release());
   Table gold(std::move(cols_gold));
 
   auto gold_sort_order = cudf::sorted_order(gold.view());
   auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
 }
 
 TEST_F(JoinTest, HashJoinSequentialProbes)
@@ -1116,129 +1060,106 @@ TEST_F(JoinTest, HashJoinSequentialProbes)
   CVector cols1;
   cols1.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3}}.release());
   cols1.emplace_back(strcol_wrapper{{"s1", "s0", "s1", "s2", "s1"}}.release());
-  cols1.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 1}}.release());
 
   Table t1(std::move(cols1));
 
-  cudf::hash_join hash_join(t1, {0, 1}, cudf::null_equality::EQUAL);
+  cudf::hash_join hash_join(t1, cudf::null_equality::EQUAL);
 
   {
     CVector cols0;
     cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 3}}.release());
     cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
 
     Table t0(std::move(cols0));
 
-    auto result            = hash_join.full_join(t0, {0, 1}, {{0, 0}, {1, 1}});
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+    auto result = hash_join.full_join(t0);
+
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
+
+    column_wrapper<int32_t> col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}};
+    column_wrapper<int32_t> col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}};
 
     CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{2, 2, 0, 4, 3, 3, 1, 2, 0}}.release());
-    cols_gold.emplace_back(
-      strcol_wrapper({"s1", "s0", "s1", "s2", "s1", "s0", "s1", "s2", "s4"}).release());
-    cols_gold.emplace_back(
-      column_wrapper<int32_t>{{-1, -1, -1, -1, 1, 0, 1, 2, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}}
-        .release());
-    cols_gold.emplace_back(
-      column_wrapper<int32_t>{{1, 0, 1, 2, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}}
-        .release());
-    Table gold(std::move(cols_gold));
+    cols_gold.push_back(col_gold_0.release());
+    cols_gold.push_back(col_gold_1.release());
 
+    Table gold(std::move(cols_gold));
     auto gold_sort_order = cudf::sorted_order(gold.view());
     auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 
   {
     CVector cols0;
     cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 3}}.release());
     cols0.emplace_back(strcol_wrapper({"s0", "s1", "s2", "s4", "s1"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
 
     Table t0(std::move(cols0));
 
-    auto result            = hash_join.left_join(t0, {0, 1}, {{0, 0}, {1, 1}});
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
-
-    CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 3, 1, 2, 0}, {1, 1, 1, 1, 1}}.release());
-    cols_gold.emplace_back(
-      strcol_wrapper({"s1", "s0", "s1", "s2", "s4"}, {1, 1, 1, 1, 1, 1}).release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 1, 2, 4}, {1, 1, 1, 1, 1}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}}.release());
-    Table gold(std::move(cols_gold));
-
-    auto gold_sort_order = cudf::sorted_order(gold.view());
-    auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
-  }
-
-  {
-    CVector cols0;
-    cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-    cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-    cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
-
-    Table t0(std::move(cols0));
+    auto result = hash_join.left_join(t0);
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
 
-    auto probe_build_pair = hash_join.inner_join(t0, {1, 2}, {{1, 0}, {2, 1}});
-    auto joined_cols      = probe_build_pair.first->release();
-    auto build_cols       = probe_build_pair.second->release();
-    joined_cols.insert(joined_cols.end(),
-                       std::make_move_iterator(build_cols.begin()),
-                       std::make_move_iterator(build_cols.end()));
-    auto result            = std::make_unique<cudf::table>(std::move(joined_cols));
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+    column_wrapper<int32_t> col_gold_0{{0, 1, 2, 3, 4}};
+    column_wrapper<int32_t> col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}};
 
     CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-    Table gold(std::move(cols_gold));
+    cols_gold.push_back(col_gold_0.release());
+    cols_gold.push_back(col_gold_1.release());
 
+    Table gold(std::move(cols_gold));
     auto gold_sort_order = cudf::sorted_order(gold.view());
     auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 
   {
     CVector cols0;
     cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
-    cols0.emplace_back(column_wrapper<int32_t>{{3, 1, 2, 0, 2}}.release());
     cols0.emplace_back(strcol_wrapper({"s1", "s1", "s0", "s4", "s0"}).release());
-    cols0.emplace_back(column_wrapper<int32_t>{{0, 1, 2, 4, 1}}.release());
 
     Table t0(std::move(cols0));
 
-    auto probe_build_pair = hash_join.inner_join(
-      t0, {1, 2}, {{1, 0}, {2, 1}}, cudf::hash_join::common_columns_output_side::BUILD);
-    auto joined_cols = probe_build_pair.second->release();
-    auto probe_cols  = probe_build_pair.first->release();
-    joined_cols.insert(joined_cols.end(),
-                       std::make_move_iterator(probe_cols.begin()),
-                       std::make_move_iterator(probe_cols.end()));
-    auto result            = std::make_unique<cudf::table>(std::move(joined_cols));
-    auto result_sort_order = cudf::sorted_order(result->view());
-    auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
+    auto result = hash_join.inner_join(t0);
+    auto result_table =
+      cudf::table_view({cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.first->size()),
+                                          result.first->data()},
+                        cudf::column_view{cudf::data_type{cudf::type_id::INT32},
+                                          static_cast<cudf::size_type>(result.second->size()),
+                                          result.second->data()}});
+    auto result_sort_order = cudf::sorted_order(result_table);
+    auto sorted_result     = cudf::gather(result_table, *result_sort_order);
+
+    column_wrapper<int32_t> col_gold_0{{2, 4, 0}};
+    column_wrapper<int32_t> col_gold_1{{1, 1, 4}};
 
     CVector cols_gold;
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(strcol_wrapper({"s1", "s0", "s0"}).release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{1, 0, 0}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{3, 2, 2}}.release());
-    cols_gold.emplace_back(column_wrapper<int32_t>{{0, 2, 1}}.release());
-    Table gold(std::move(cols_gold));
+    cols_gold.push_back(col_gold_0.release());
+    cols_gold.push_back(col_gold_1.release());
 
+    Table gold(std::move(cols_gold));
     auto gold_sort_order = cudf::sorted_order(gold.view());
     auto sorted_gold     = cudf::gather(gold.view(), *gold_sort_order);
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_gold, *sorted_result);
+
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
   }
 }
 
@@ -1262,7 +1183,7 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls)
   auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2});
   auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2});
   {
-    auto result      = cudf::left_join(t0, t1, {0}, {0}, {});
+    auto result      = cudf::left_join(t0, t1, {0}, {0});
     auto result_view = result->view();
     auto decoded1    = cudf::dictionary::decode(result_view.column(1));
     auto decoded4    = cudf::dictionary::decode(result_view.column(4));
@@ -1273,18 +1194,8 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls)
                                                    decoded4->view(),
                                                    result_view.column(5)});
 
-    auto gold = cudf::left_join(g0, g1, {0}, {0}, {});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
-  }
-  {
-    auto result      = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-    auto result_view = result->view();
-    auto decoded1    = cudf::dictionary::decode(result_view.column(1));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)});
-
-    auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+    auto gold = cudf::left_join(g0, g1, {0}, {0});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
   }
 }
 
@@ -1303,17 +1214,21 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls)
   auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()});
   auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()});
 
-  auto result      = cudf::left_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::left_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded2    = cudf::dictionary::decode(result_view.column(2));
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()});
+  auto decoded5    = cudf::dictionary::decode(result_view.column(5));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 result_view.column(1),
+                                                 decoded2->view(),
+                                                 result_view.column(3),
+                                                 result_view.column(4),
+                                                 decoded5->view()});
 
   auto g0   = cudf::table_view({col0_0, col0_1, col0_2_w});
   auto g1   = cudf::table_view({col1_0, col1_1, col1_2_w});
-  auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+  auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
 TEST_F(JoinDictionaryTest, InnerJoinNoNulls)
@@ -1331,15 +1246,20 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls)
   auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2});
   auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2});
 
-  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded1    = cudf::dictionary::decode(result_view.column(1));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)});
+  auto decoded4    = cudf::dictionary::decode(result_view.column(4));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 decoded1->view(),
+                                                 result_view.column(2),
+                                                 result_view.column(3),
+                                                 decoded4->view(),
+                                                 result_view.column(5)});
 
   auto g0   = cudf::table_view({col0_0, col0_1_w, col0_2});
   auto g1   = cudf::table_view({col1_0, col1_1_w, col1_2});
-  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1});
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
@@ -1358,16 +1278,20 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls)
   auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()});
   auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()});
 
-  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::inner_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded2    = cudf::dictionary::decode(result_view.column(2));
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), decoded2->view(), decoded3->view()});
+  auto decoded5    = cudf::dictionary::decode(result_view.column(5));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 result_view.column(1),
+                                                 decoded2->view(),
+                                                 result_view.column(3),
+                                                 result_view.column(4),
+                                                 decoded5->view()});
 
   auto g0   = cudf::table_view({col0_0, col0_1, col0_2_w});
   auto g1   = cudf::table_view({col1_0, col1_1, col1_2_w});
-  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1});
   CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
@@ -1386,16 +1310,21 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls)
   auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2});
   auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2});
 
-  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded1    = cudf::dictionary::decode(result_view.column(1));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), decoded1->view(), result_view.column(2), result_view.column(3)});
+  auto decoded4    = cudf::dictionary::decode(result_view.column(4));
+  std::vector<cudf::column_view> result_decoded({result_view.column(0),
+                                                 decoded1->view(),
+                                                 result_view.column(2),
+                                                 result_view.column(3),
+                                                 decoded4->view(),
+                                                 result_view.column(5)});
 
   auto g0   = cudf::table_view({col0_0, col0_1_w, col0_2});
   auto g1   = cudf::table_view({col1_0, col1_1_w, col1_2});
-  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
 TEST_F(JoinDictionaryTest, FullJoinWithNulls)
@@ -1413,16 +1342,21 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls)
   auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2});
   auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2});
 
-  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
+  auto result      = cudf::full_join(t0, t1, {0, 1}, {0, 1});
   auto result_view = result->view();
   auto decoded0    = cudf::dictionary::decode(result_view.column(0));
-  std::vector<cudf::column_view> result_decoded(
-    {decoded0->view(), result_view.column(1), result_view.column(2), result_view.column(3)});
+  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
+  std::vector<cudf::column_view> result_decoded({decoded0->view(),
+                                                 result_view.column(1),
+                                                 result_view.column(2),
+                                                 decoded3->view(),
+                                                 result_view.column(4),
+                                                 result_view.column(5)});
 
   auto g0   = cudf::table_view({col0_0_w, col0_1, col0_2});
   auto g1   = cudf::table_view({col1_0_w, col1_1, col1_2});
-  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}, {{0, 0}, {1, 1}});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*gold, cudf::table_view(result_decoded));
+  auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1});
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/join/semi_join_tests.cpp b/cpp/tests/join/semi_join_tests.cpp
index 13c74616484..8de9610b07d 100644
--- a/cpp/tests/join/semi_join_tests.cpp
+++ b/cpp/tests/join/semi_join_tests.cpp
@@ -20,6 +20,7 @@
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
@@ -33,809 +34,3 @@ using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;
 
 struct JoinTest : public cudf::test::BaseFixture {
 };
-
-TEST_F(JoinTest, LeftSemiJoin)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"quick", "composéd", "result", ""};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20, 20, 20};
-  column_wrapper<float> expect_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{90, 61, 62, 63};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiJoin_with_a_string_key)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"quick", "result"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20};
-  column_wrapper<float> expect_1{5.0, .7};
-  column_wrapper<int8_t> expect_2{90, 62};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiJoin_with_null)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{"quick", "result"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20};
-  column_wrapper<float> expect_1{5.0, .7};
-  column_wrapper<int8_t> expect_2{90, 62};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"accénted", "turtlé", "words"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{20, 20, 50};
-  column_wrapper<float> expect_1{.5, .5, .7};
-  column_wrapper<int8_t> expect_2{77, 78, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_with_a_string_key)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result"};
-  std::vector<const char*> e_strings{"accénted", "turtlé", "composéd", "", "words"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{20, 20, 20, 20, 50};
-  column_wrapper<float> expect_1{.5, .5, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{77, 78, 61, 63, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_with_null)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{"accénted", "turtlé", "composéd", "", "words"};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{20, 20, 20, 20, 50};
-  column_wrapper<float> expect_1{.5, .5, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{77, 78, 61, 63, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiAntiJoin_exceptions)
-{
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  //
-  //  table_a has no columns, table_b has columns
-  //  Let's check different permutations of passing table
-  //  with no columns to verify that exceptions are thrown
-  //
-  EXPECT_THROW(cudf::left_semi_join(table_a, table_b, {}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_a, table_b, {}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_semi_join(table_b, table_a, {}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_b, table_a, {}, {}, {}), cudf::logic_error);
-
-  //
-  //  table_b has columns, so we'll pass the column checks, but
-  //  these should fail the exception check that the number of
-  //  join columns must be the same for each table
-  //
-  EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {0}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {0}, {}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_semi_join(table_b, table_b, {}, {0}, {}), cudf::logic_error);
-
-  EXPECT_THROW(cudf::left_anti_join(table_b, table_b, {}, {0}, {}), cudf::logic_error);
-}
-
-TEST_F(JoinTest, LeftSemiJoin_empty_result)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{};
-  column_wrapper<float> expect_1{};
-  column_wrapper<int8_t> expect_2{};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {});
-
-  EXPECT_EQ(join_table->num_columns(), 0);
-  EXPECT_EQ(join_table->num_rows(), 0);
-
-  auto join_table2 = cudf::left_semi_join(table_a, table_b, {}, {}, {0, 1, 3});
-
-  EXPECT_EQ(join_table2->num_columns(), 3);
-  EXPECT_EQ(join_table2->num_rows(), 0);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_empty_result)
-{
-  std::vector<const char*> a_strings{
-    "quick", "accénted", "turtlé", "composéd", "result", "", "words"};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{};
-  column_wrapper<float> expect_1{};
-  column_wrapper<int8_t> expect_2{};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {});
-
-  EXPECT_EQ(join_table->num_columns(), 0);
-  EXPECT_EQ(join_table->num_rows(), 0);
-
-  auto join_table2 = cudf::left_anti_join(table_a, table_b, {}, {}, {0, 1, 3});
-
-  EXPECT_EQ(join_table2->num_columns(), 3);
-  EXPECT_EQ(join_table2->num_rows(), 0);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(2), expect_3);
-}
-
-TEST_F(JoinTest, LeftSemiAntiJoin_empty_table)
-{
-  std::vector<const char*> a_strings{};
-  std::vector<const char*> b_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> e_strings{};
-
-  column_wrapper<int32_t> a_0{};
-  column_wrapper<float> a_1{};
-  column_wrapper<int8_t> a_2{};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{};
-  column_wrapper<float> expect_1{};
-  column_wrapper<int8_t> expect_2{};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table->get_column(3), expect_3);
-
-  auto join_table2 = cudf::left_semi_join(table_b, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table2->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table2->get_column(3), expect_3);
-
-  auto join_table3 = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table3->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table3->get_column(3), expect_3);
-
-  auto join_table4 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table4->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table4->get_column(3), expect_3);
-
-  auto join_table5 = cudf::left_anti_join(table_a, table_a, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table5->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(join_table5->get_column(3), expect_3);
-}
-
-TEST_F(JoinTest, LeftAntiJoin_empty_right_table)
-{
-  std::vector<const char*> a_strings{"quick", "words", "result", nullptr};
-  std::vector<const char*> b_strings{};
-  std::vector<const char*> e_strings{"quick", "words", "result", nullptr};
-
-  column_wrapper<int32_t> a_0{10, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper a_3(
-    a_strings.begin(),
-    a_strings.end(),
-    thrust::make_transform_iterator(a_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> b_0{};
-  column_wrapper<float> b_1{};
-  column_wrapper<int8_t> b_2{};
-
-  cudf::test::strings_column_wrapper b_3(
-    b_strings.begin(),
-    b_strings.end(),
-    thrust::make_transform_iterator(b_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  column_wrapper<int32_t> expect_0{10, 20, 20, 50};
-  column_wrapper<float> expect_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> expect_2{90, 75, 62, 41};
-
-  cudf::test::strings_column_wrapper expect_3(
-    e_strings.begin(),
-    e_strings.end(),
-    thrust::make_transform_iterator(e_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::vector<std::unique_ptr<cudf::column>> column_a;
-  column_a.push_back(a_0.release());
-  column_a.push_back(a_1.release());
-  column_a.push_back(a_2.release());
-  column_a.push_back(a_3.release());
-
-  std::vector<std::unique_ptr<cudf::column>> column_b;
-  column_b.push_back(b_0.release());
-  column_b.push_back(b_1.release());
-  column_b.push_back(b_2.release());
-  column_b.push_back(b_3.release());
-
-  cudf::table table_a(std::move(column_a));
-  cudf::table table_b(std::move(column_b));
-
-  auto join_table = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(0), expect_0);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(1), expect_1);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(2), expect_2);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(join_table->get_column(3), expect_3);
-}
-
-struct JoinDictionaryTest : public cudf::test::BaseFixture {
-};
-
-TEST_F(JoinDictionaryTest, LeftSemiJoin)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a  = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b  = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  {
-    auto result      = cudf::left_semi_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected);
-  }
-  {
-    auto result      = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-  }
-}
-
-TEST_F(JoinDictionaryTest, LeftSemiJoinWithNulls)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-
-  auto result      = cudf::left_semi_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  auto result_view = result->view();
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  auto expected = cudf::left_semi_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-}
-
-TEST_F(JoinDictionaryTest, LeftAntiJoin)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20};
-  column_wrapper<float> b_1{5.0, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result"});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a  = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b  = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  {
-    auto result      = cudf::left_anti_join(table_a, table_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1}, {0, 1}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view(result_decoded), *expected);
-  }
-  {
-    auto result      = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    auto result_view = result->view();
-    auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-    std::vector<cudf::column_view> result_decoded(
-      {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-    auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-  }
-}
-
-TEST_F(JoinDictionaryTest, LeftAntiJoinWithNulls)
-{
-  column_wrapper<int32_t> a_0{10, 20, 20, 20, 20, 20, 50};
-  column_wrapper<float> a_1{5.0, .5, .5, .7, .7, .7, .7};
-  column_wrapper<int8_t> a_2{90, 77, 78, 61, 62, 63, 41};
-  cudf::test::strings_column_wrapper a_3_w(
-    {"quick", "accénted", "turtlé", "composéd", "result", "", "words"});
-  auto a_3 = cudf::dictionary::encode(a_3_w);
-
-  column_wrapper<int32_t> b_0{10, 20, 20, 50};
-  column_wrapper<float> b_1{5.0, .7, .7, .7};
-  column_wrapper<int8_t> b_2{90, 75, 62, 41};
-  cudf::test::strings_column_wrapper b_3_w({"quick", "words", "result", ""}, {1, 1, 1, 0});
-  auto b_3 = cudf::dictionary::encode(b_3_w);
-
-  auto table_a = cudf::table_view({a_0, a_1, a_2, a_3->view()});
-  auto table_b = cudf::table_view({b_0, b_1, b_2, b_3->view()});
-
-  auto result      = cudf::left_anti_join(table_a, table_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  auto result_view = result->view();
-  auto decoded3    = cudf::dictionary::decode(result_view.column(3));
-  std::vector<cudf::column_view> result_decoded(
-    {result_view.column(0), result_view.column(1), result_view.column(2), decoded3->view()});
-
-  auto expect_a = cudf::table_view({a_0, a_1, a_2, a_3_w});
-  auto expect_b = cudf::table_view({b_0, b_1, b_2, b_3_w});
-  auto expected = cudf::left_anti_join(expect_a, expect_b, {0, 1, 3}, {0, 1, 3}, {0, 1, 2, 3});
-  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(cudf::table_view(result_decoded), *expected);
-}
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index e5501428624..4c72ba2e055 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -134,11 +134,16 @@ def copy_range(Column input_column,
                            input_begin, input_end, target_begin)
 
 
-def gather(Table source_table, Column gather_map, bool keep_index=True):
+def gather(
+    Table source_table,
+    Column gather_map,
+    bool keep_index=True,
+    bool nullify=False
+):
     if not pd.api.types.is_integer_dtype(gather_map.dtype):
         raise ValueError("Gather map is not integer dtype.")
 
-    if len(gather_map) > 0:
+    if len(gather_map) > 0 and not nullify:
         gm_min, gm_max = minmax(gather_map)
         if gm_min < -len(source_table) or gm_max >= len(source_table):
             raise IndexError(f"Gather map index with min {gm_min},"
@@ -154,7 +159,8 @@ def gather(Table source_table, Column gather_map, bool keep_index=True):
         source_table_view = source_table.data_view()
     cdef column_view gather_map_view = gather_map.view()
     cdef cpp_copying.out_of_bounds_policy policy = (
-        cpp_copying.out_of_bounds_policy.DONT_CHECK
+        cpp_copying.out_of_bounds_policy.NULLIFY if nullify
+        else cpp_copying.out_of_bounds_policy.DONT_CHECK
     )
 
     with nogil:
diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/cpp/join.pxd
index 10edf370f5d..c221fea926d 100644
--- a/python/cudf/cudf/_lib/cpp/join.pxd
+++ b/python/cudf/cudf/_lib/cpp/join.pxd
@@ -4,44 +4,40 @@ from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
 from libcpp cimport bool
+from libcpp.pair cimport pair
+from libcpp.memory cimport unique_ptr
 
+from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.cpp.types cimport size_type
+from rmm._lib.device_uvector cimport device_uvector
 
 
+ctypedef unique_ptr[device_uvector[size_type]] gather_map_type
+
 cdef extern from "cudf/join.hpp" namespace "cudf" nogil:
-    cdef unique_ptr[table] inner_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[pair[int, int]] columns_in_common
+    cdef pair[gather_map_type, gather_map_type] inner_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] left_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[pair[int, int]] columns_in_common
+
+    cdef pair[gather_map_type, gather_map_type] left_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] full_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[pair[int, int]] columns_in_common
+
+    cdef pair[gather_map_type, gather_map_type] full_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] left_semi_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[int] return_columns
+
+    cdef gather_map_type left_semi_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
-    cdef unique_ptr[table] left_anti_join(
-        const table_view left,
-        const table_view right,
-        const vector[int] left_on,
-        const vector[int] right_on,
-        const vector[int] return_columns
+
+    cdef gather_map_type left_anti_join(
+        const table_view left_keys,
+        const table_view right_keys,
     ) except +
diff --git a/python/cudf/cudf/_lib/cpp/table/table_view.pxd b/python/cudf/cudf/_lib/cpp/table/table_view.pxd
index 2f386d337cd..7bbfa69836c 100644
--- a/python/cudf/cudf/_lib/cpp/table/table_view.pxd
+++ b/python/cudf/cudf/_lib/cpp/table/table_view.pxd
@@ -15,6 +15,7 @@ cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil:
         column_view column(size_type column_index) except +
         size_type num_columns() except +
         size_type num_rows() except +
+        table_view select(vector[size_type] column_indices) except +
 
     cdef cppclass mutable_table_view:
         mutable_table_view() except +
diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx
index 38f13b9f994..69b8004cede 100644
--- a/python/cudf/cudf/_lib/join.pyx
+++ b/python/cudf/cudf/_lib/join.pyx
@@ -1,222 +1,88 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
+import cudf
+
 from collections import OrderedDict
 from itertools import chain
 
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport unique_ptr, make_unique
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 from libcpp.pair cimport pair
 from libcpp cimport bool
 
+from cudf._lib.column cimport Column
 from cudf._lib.table cimport Table, columns_from_ptr
 
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.types cimport size_type, data_type, type_id
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 cimport cudf._lib.cpp.join as cpp_join
 
-cpdef join(Table lhs,
-           Table rhs,
-           object how,
-           object method,
-           object left_on=None,
-           object right_on=None,
-           bool left_index=False,
-           bool right_index=False
-           ):
-    """
-    Call libcudf++ join for full outer, inner and left joins.
-    """
-
-    cdef Table c_lhs = lhs
-    cdef Table c_rhs = rhs
-
-    # Views might or might not include index
-    cdef table_view lhs_view
-    cdef table_view rhs_view
-
-    # Will hold the join column indices into L and R tables
-    cdef vector[int] left_on_ind
-    cdef vector[int] right_on_ind
-
-    # If left/right index, will pass a full view
-    # must offset the data column indices by # of index columns
-    num_inds_left = len(left_on) + (lhs._num_indices * left_index)
-    num_inds_right = len(right_on) + (rhs._num_indices * right_index)
-    left_on_ind.reserve(num_inds_left)
-    right_on_ind.reserve(num_inds_right)
-
-    # Only used for semi or anti joins
-    # The result columns are only the left hand columns
-    cdef vector[int] all_left_inds = range(
-        lhs._num_columns + (lhs._num_indices * left_index)
-    )
-    cdef vector[int] all_right_inds = range(
-        rhs._num_columns + (rhs._num_indices * right_index)
-    )
 
-    result_col_names = compute_result_col_names(lhs, rhs, how)
-
-    columns_in_common = OrderedDict()
-    cdef vector[pair[int, int]] c_columns_in_common
-
-    # keep track of where the desired index column will end up
-    result_index_pos = None
-    if left_index or right_index:
-        # If either true, we need to process both indices as columns
-        lhs_view = c_lhs.view()
-        rhs_view = c_rhs.view()
-
-        left_join_cols = list(lhs._index_names) + list(lhs._data.keys())
-        right_join_cols = list(rhs._index_names) + list(rhs._data.keys())
-        if left_index and right_index:
-            # Index columns will be common, on the left, dropped from right
-            # Index name is from the left
-            # Both views, must take index column indices
-            left_on_indices = right_on_indices = range(lhs._num_indices)
-            result_idx_positions = range(lhs._num_indices)
-            result_index_names = lhs._index_names
-
-        elif left_index:
-            # Joins left index columns with right 'on' columns
-            left_on_indices = range(lhs._num_indices)
-            right_on_indices = [
-                right_join_cols.index(on_col) for on_col in right_on
-            ]
-
-            # The left index columns 'become' the new RHS columns
-            # and the right index 'survives'
-            result_idx_positions = range(
-                len(left_join_cols), len(left_join_cols) + lhs._num_indices
-            )
-            result_index_names = rhs._index_names
-
-            # but since the common columns are gathered from the left
-            # the rhs 'on' cols are returned on the left of the result
-            # rearrange the names so account for this
-            common = [None] * rhs._num_indices
-            for i in range(rhs._num_indices):
-                common[i] = result_col_names.pop(
-                    result_col_names.index(right_on[i])
-                )
-            result_col_names = common + result_col_names
-        elif right_index:
-            # Joins right index columns with left 'on' columns
-            right_on_indices = range(rhs._num_indices)
-            left_on_indices = [
-                left_join_cols.index(on_col) for on_col in left_on
-            ]
-
-            # The right index columns 'become' the new LHS columns
-            # and the left index survives
-            # since they are already gathered from the left,
-            # no rearranging has to be done
-            result_idx_positions = range(lhs._num_indices)
-            result_index_names = lhs._index_names
-        for i_l, i_r in zip(left_on_indices, right_on_indices):
-            left_on_ind.push_back(i_l)
-            right_on_ind.push_back(i_r)
-            columns_in_common[(i_l, i_r)] = None
-    else:
-        # cuDF's Python layer will create a new RangeIndex for this case
-        lhs_view = c_lhs.data_view()
-        rhs_view = c_rhs.data_view()
-
-        left_join_cols = list(lhs._data.keys())
-        right_join_cols = list(rhs._data.keys())
-
-    # If both left/right_index, joining on indices plus additional cols
-    # If neither, joining on just cols, not indices
-    # In both cases, must match up additional column indices in lhs/rhs
-    if left_index == right_index:
-        for name in left_on:
-            left_on_ind.push_back(left_join_cols.index(name))
-            if name in right_on:
-                if (left_on.index(name) == right_on.index(name)):
-                    columns_in_common[(
-                        left_join_cols.index(name),
-                        right_join_cols.index(name)
-                    )] = None
-        for name in right_on:
-            right_on_ind.push_back(right_join_cols.index(name))
-    c_columns_in_common = list(columns_in_common.keys())
-    cdef unique_ptr[table] c_result
-    if how == 'inner':
-        with nogil:
-            c_result = move(cpp_join.inner_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                c_columns_in_common
-            ))
-    elif how == 'left':
-        with nogil:
-            c_result = move(cpp_join.left_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                c_columns_in_common
-            ))
-    elif how == 'outer':
-        with nogil:
-            c_result = move(cpp_join.full_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                c_columns_in_common
-            ))
-    elif how == 'leftsemi':
-        with nogil:
-            c_result = move(cpp_join.left_semi_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                all_left_inds
-            ))
-    elif how == 'leftanti':
-        with nogil:
-            c_result = move(cpp_join.left_anti_join(
-                lhs_view,
-                rhs_view,
-                left_on_ind,
-                right_on_ind,
-                all_left_inds
-            ))
-
-    all_cols_py = columns_from_ptr(move(c_result))
-    if left_index or right_index:
-        ind_cols = OrderedDict()
-        for name, pos in zip(
-            result_index_names[::-1], result_idx_positions[::-1]
-        ):
-            ind_cols[name] = all_cols_py.pop(pos)
-        index = OrderedDict()
-        for k, v in reversed(ind_cols.items()):
-            index[k] = v
-        index = Table(index)
+# The functions below return the *gathermaps* that represent
+# the join result when joining on the keys `lhs` and `rhs`.
+
+cpdef join(Table lhs, Table rhs, how=None):
+    cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
+    cdef table_view c_lhs = lhs.view()
+    cdef table_view c_rhs = rhs.view()
+
+    if how == "inner":
+        c_result = move(cpp_join.inner_join(
+            c_lhs,
+            c_rhs
+        ))
+    elif how == "left":
+        c_result = move(cpp_join.left_join(
+            c_lhs,
+            c_rhs
+        ))
+    elif how == "outer":
+        c_result = move(cpp_join.full_join(
+            c_lhs,
+            c_rhs
+        ))
     else:
-        index = None
-    data_ordered_dict = OrderedDict(zip(result_col_names, all_cols_py))
-    return Table(data=data_ordered_dict, index=index)
-
-
-def compute_result_col_names(lhs, rhs, how):
-    """
-    Determine the names of the data columns in the result of
-    a libcudf join, based on the original left and right frames
-    as well as the type of join that was performed.
-    """
-    if how in {"left", "inner", "outer", "leftsemi", "leftanti"}:
-        a = lhs._data.keys()
-        if how not in {"leftsemi", "leftanti"}:
-            return list(chain(a, (k for k in rhs._data.keys()
-                        if k not in lhs._data.keys())))
-        return list(a)
+        raise ValueError(f"Invalid join type {how}")
+
+    cdef Column left_rows = _gather_map_as_column(move(c_result.first))
+    cdef Column right_rows = _gather_map_as_column(move(c_result.second))
+    return left_rows, right_rows
+
+
+cpdef semi_join(Table lhs, Table rhs, how=None):
+    # left-semi and left-anti joins
+    cdef cpp_join.gather_map_type c_result
+    cdef table_view c_lhs = lhs.view()
+    cdef table_view c_rhs = rhs.view()
+
+    if how == "leftsemi":
+        c_result = move(cpp_join.left_semi_join(
+            c_lhs,
+            c_rhs
+        ))
+    elif how == "leftanti":
+        c_result = move(cpp_join.left_anti_join(
+            c_lhs,
+            c_rhs
+        ))
     else:
-        raise NotImplementedError(
-            f"{how} merge not supported yet"
-        )
+        raise ValueError(f"Invalid join type {how}")
+
+    cdef Column left_rows = _gather_map_as_column(move(c_result))
+    return (
+        left_rows,
+        None
+    )
+
+
+cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):
+    # helper to convert a gather map to a Column
+    cdef size_type size = gather_map.get()[0].size()
+    cdef unique_ptr[column] c_col = make_unique[column](
+        data_type(type_id.INT32),
+        size,
+        gather_map.get()[0].release())
+    return Column.from_unique_ptr(move(c_col))
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 39c278d2abf..bb1bf3c5d5c 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -750,6 +750,9 @@ def _set_categories(
             ordered=ordered,
         )
 
+    def _decategorize(self) -> ColumnBase:
+        return self._column._get_decategorized_column()
+
 
 class CategoricalColumn(column.ColumnBase):
     """Implements operations for Columns of Categorical type
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index dd06d97d105..e59b395ec0f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -827,7 +827,12 @@ def quantile(
     def median(self, skipna: bool = None) -> ScalarLike:
         raise TypeError(f"cannot perform median with type {self.dtype}")
 
-    def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T:
+    def take(
+        self: T,
+        indices: ColumnBase,
+        keep_index: bool = True,
+        nullify: bool = False,
+    ) -> T:
         """Return Column by taking values from the corresponding *indices*.
         """
         # Handle zero size
@@ -836,7 +841,7 @@ def take(self: T, indices: ColumnBase, keep_index: bool = True) -> T:
         try:
             return (
                 self.as_frame()
-                ._gather(indices, keep_index=keep_index)
+                ._gather(indices, keep_index=keep_index, nullify=nullify)
                 ._as_column()
             )
         except RuntimeError as e:
@@ -1004,7 +1009,9 @@ def sort_by_values(
         ascending: bool = True,
         na_position: builtins.str = "last",
     ) -> Tuple[ColumnBase, "cudf.core.column.NumericalColumn"]:
-        col_inds = self.as_frame()._get_sorted_inds(ascending, na_position)
+        col_inds = self.as_frame()._get_sorted_inds(
+            ascending=ascending, na_position=na_position
+        )
         col_keys = self.take(col_inds)
         return col_keys, col_inds
 
@@ -1016,6 +1023,9 @@ def distinct_count(
             raise NotImplementedError(msg)
         return cpp_distinct_count(self, ignore_nulls=dropna)
 
+    def can_cast_safely(self, to_dtype: Dtype) -> bool:
+        raise NotImplementedError()
+
     def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
         if is_numerical_dtype(dtype):
             return self.as_numerical_column(dtype)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 7ad6eed65a8..da77517c75d 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -362,7 +362,9 @@ def _numeric_quantile(
     ) -> NumericalColumn:
         quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q
         # get sorted indices and exclude nulls
-        sorted_indices = self.as_frame()._get_sorted_inds(True, "first")
+        sorted_indices = self.as_frame()._get_sorted_inds(
+            ascending=True, na_position="first"
+        )
         sorted_indices = sorted_indices[self.null_count :]
 
         return cpp_quantile(self, quant, interpolation, sorted_indices, exact)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b5f57356698..01b96151485 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4497,12 +4497,9 @@ def merge(
         else:
             lsuffix, rsuffix = suffixes
 
-        lhs = self.copy(deep=False)
-        rhs = right.copy(deep=False)
-
         # Compute merge
-        gdf_result = super(DataFrame, lhs)._merge(
-            rhs,
+        gdf_result = super()._merge(
+            right,
             on=on,
             left_on=left_on,
             right_on=right_on,
@@ -4510,8 +4507,6 @@ def merge(
             right_index=right_index,
             how=how,
             sort=sort,
-            lsuffix=lsuffix,
-            rsuffix=rsuffix,
             method=method,
             indicator=indicator,
             suffixes=suffixes,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ecff3dee573..fb746d6c794 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -20,6 +20,7 @@
 from cudf import _lib as libcudf
 from cudf._typing import ColumnLike, DataFrameOrSeries
 from cudf.core.column import as_column, build_categorical_column, column_empty
+from cudf.core.join import merge
 from cudf.utils.dtypes import (
     is_categorical_dtype,
     is_column_like,
@@ -595,7 +596,7 @@ def _explode(self, explode_column: Any, ignore_index: bool):
             res.index.names = self._index.names
         return res
 
-    def _get_columns_by_label(self, labels, downcast):
+    def _get_columns_by_label(self, labels, downcast=False):
         """
         Returns columns of the Frame specified by `labels`
 
@@ -612,15 +613,18 @@ def _get_columns_by_index(self, indices):
             data, columns=data.to_pandas_index(), index=self.index
         )
 
-    def _gather(self, gather_map, keep_index=True):
+    def _gather(self, gather_map, keep_index=True, nullify=False):
         if not pd.api.types.is_integer_dtype(gather_map.dtype):
             gather_map = gather_map.astype("int32")
         result = self.__class__._from_table(
             libcudf.copying.gather(
-                self, as_column(gather_map), keep_index=keep_index
+                self,
+                as_column(gather_map),
+                keep_index=keep_index,
+                nullify=nullify,
             )
         )
-        result._copy_type_metadata(self)
+        result._copy_type_metadata(self, include_index=keep_index)
         if keep_index and self._index is not None:
             result._index.names = self._index.names
         return result
@@ -2754,12 +2758,15 @@ def searchsorted(
         else:
             return result
 
-    def _get_sorted_inds(self, ascending=True, na_position="last"):
+    def _get_sorted_inds(self, by=None, ascending=True, na_position="last"):
         """
         Sort by the values.
 
         Parameters
         ----------
+        by: list, optional
+            Labels specifying columns to sort by. By default,
+            sort by all columns of `self`
         ascending : bool or list of bool, default True
             If True, sort values in ascending order, otherwise descending.
         na_position : {‘first’ or ‘last’}, default ‘last’
@@ -2794,11 +2801,17 @@ def _get_sorted_inds(self, ascending=True, na_position="last"):
             )
             na_position = 0
 
+        to_sort = (
+            self
+            if by is None
+            else self._get_columns_by_label(by, downcast=False)
+        )
+
         # If given a scalar need to construct a sequence of length # of columns
         if np.isscalar(ascending):
-            ascending = [ascending] * self._num_columns
+            ascending = [ascending] * to_sort._num_columns
 
-        return libcudf.sort.order_by(self, ascending, na_position)
+        return libcudf.sort.order_by(to_sort, ascending, na_position)
 
     def sin(self):
         """
@@ -3329,77 +3342,6 @@ def sqrt(self):
         """
         return self._unaryop("sqrt")
 
-    @staticmethod
-    def _validate_merge_cfg(
-        lhs,
-        rhs,
-        left_on,
-        right_on,
-        on,
-        how,
-        left_index=False,
-        right_index=False,
-        lsuffix=None,
-        rsuffix=None,
-    ):
-        """
-        Error for various combinations of merge input parameters
-        """
-        len_left_on = len(left_on) if left_on is not None else 0
-        len_right_on = len(right_on) if right_on is not None else 0
-
-        # must actually support the requested merge type
-        if how not in ["left", "inner", "outer", "leftanti", "leftsemi"]:
-            raise NotImplementedError(f"{how} merge not supported yet")
-
-        # Passing 'on' with 'left_on' or 'right_on' is potentially ambiguous
-        if on:
-            if left_on or right_on:
-                raise ValueError(
-                    'Can only pass argument "on" OR "left_on" '
-                    'and "right_on", not a combination of both.'
-                )
-
-        # Require same total number of columns to join on in both operands
-        if not (len_left_on + left_index * len(lhs.index.names)) == (
-            len_right_on + right_index * len(rhs.index.names)
-        ):
-            raise ValueError(
-                "Merge operands must have same number of join key columns"
-            )
-
-        # If nothing specified, must have common cols to use implicitly
-        same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys())
-        if not (left_index or right_index):
-            if not (left_on or right_on):
-                if len(same_named_columns) == 0:
-                    raise ValueError("No common columns to perform merge on")
-
-        for name in same_named_columns:
-            if not (
-                name in left_on
-                and name in right_on
-                and (left_on.index(name) == right_on.index(name))
-            ):
-                if not (lsuffix or rsuffix):
-                    raise ValueError(
-                        "there are overlapping columns but "
-                        "lsuffix and rsuffix are not defined"
-                    )
-
-        if on:
-            on_keys = [on] if not isinstance(on, list) else on
-            for key in on_keys:
-                if not (key in lhs._data.keys() and key in rhs._data.keys()):
-                    raise KeyError(f"Key {on} not in both operands")
-        else:
-            for key in left_on:
-                if key not in lhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in left operand')
-            for key in right_on:
-                if key not in rhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in right operand')
-
     def _merge(
         self,
         right,
@@ -3410,84 +3352,33 @@ def _merge(
         right_index=False,
         how="inner",
         sort=False,
-        lsuffix=None,
-        rsuffix=None,
         method="hash",
         indicator=False,
         suffixes=("_x", "_y"),
     ):
-        # Merge doesn't support right, so just swap
+        lhs, rhs = self, right
         if how == "right":
-            return right._merge(
-                self,
-                on=on,
-                left_on=right_on,
-                right_on=left_on,
-                left_index=right_index,
-                right_index=left_index,
-                how="left",
-                sort=sort,
-                lsuffix=rsuffix,
-                rsuffix=lsuffix,
-                method=method,
-                indicator=indicator,
-                suffixes=suffixes,
-            )
-
-        lhs = self
-        rhs = right
-
-        from cudf.core.join import Merge
-
-        mergeop = Merge(
+            # Merge doesn't support right, so just swap
+            how = "left"
+            lhs, rhs = right, self
+            left_on, right_on = right_on, left_on
+            left_index, right_index = right_index, left_index
+            suffixes = (suffixes[1], suffixes[0])
+
+        return merge(
             lhs,
             rhs,
-            on,
-            left_on,
-            right_on,
-            left_index,
-            right_index,
-            how,
-            sort,
-            lsuffix,
-            rsuffix,
-            method,
-            indicator,
-            suffixes,
+            on=on,
+            left_on=left_on,
+            right_on=right_on,
+            left_index=left_index,
+            right_index=right_index,
+            how=how,
+            sort=sort,
+            method=method,
+            indicator=indicator,
+            suffixes=suffixes,
         )
-        to_return = mergeop.perform_merge()
-
-        # If sort=True, Pandas would sort on the key columns in the
-        # same order as given in 'on'. If the indices are used as
-        # keys, the index will be sorted. If one index is specified,
-        # the key column on the other side will be used to sort.
-        # If no index is specified, return a new RangeIndex
-        if sort:
-            to_sort = cudf.DataFrame()
-            if left_index and right_index:
-                by = list(to_return._index._data.columns)
-                if left_on and right_on:
-                    by.extend(to_return[mergeop.left_on]._data.columns)
-            elif left_index:
-                by = list(to_return[mergeop.right_on]._data.columns)
-            elif right_index:
-                by = list(to_return[mergeop.left_on]._data.columns)
-            else:
-                # left_on == right_on, or different names but same columns
-                # in both cases we can sort by either
-                by = [to_return._data[name] for name in mergeop.left_on]
-            for i, col in enumerate(by):
-                to_sort[i] = col
-            inds = to_sort.argsort()
-            if isinstance(to_return, cudf.Index):
-                to_return = to_return.take(inds)
-            else:
-                to_return = to_return.take(
-                    inds, keep_index=(left_index or right_index)
-                )
-            return to_return
-        else:
-            return to_return
 
     def _is_sorted(self, ascending=None, null_position=None):
         """
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 2a5d2647e95..5104629eee0 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -13,6 +13,7 @@
 from pandas._config import get_option
 
 import cudf
+from cudf._typing import DtypeObj
 from cudf.core.abc import Serializable
 from cudf.core.column import (
     CategoricalColumn,
@@ -66,6 +67,9 @@ def _to_frame(this_index, index=True, name=None):
 
 
 class Index(Frame, Serializable):
+
+    dtype: DtypeObj
+
     def __new__(
         cls,
         data=None,
@@ -1544,6 +1548,10 @@ def _from_table(cls, table):
         else:
             return as_index(table)
 
+    @classmethod
+    def _from_data(cls, data, index=None):
+        return cls._from_table(Frame(data=data))
+
     _accessors = set()  # type: Set[Any]
 
     @property
diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py
index 6d126c8af4d..0463b8f9df1 100644
--- a/python/cudf/cudf/core/join/__init__.py
+++ b/python/cudf/cudf/core/join/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2020, NVIDIA CORPORATION.
 
-from cudf.core.join.join import Merge
+from cudf.core.join.join import merge
diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py
new file mode 100644
index 00000000000..3807f408369
--- /dev/null
+++ b/python/cudf/cudf/core/join/_join_helpers.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+from __future__ import annotations
+
+import collections
+import warnings
+from typing import TYPE_CHECKING, Any, Iterable, Tuple
+
+import numpy as np
+import pandas as pd
+
+import cudf
+from cudf.core.dtypes import CategoricalDtype
+
+if TYPE_CHECKING:
+    from cudf.core.column import CategoricalColumn, ColumnBase
+    from cudf.core.frame import Frame
+
+
+class _Indexer:
+    # Indexer into a column (either a data column or index level).
+    #
+    # >>> df
+    #    a
+    # b
+    # 4  1
+    # 5  2
+    # 6  3
+    # >>> _Indexer("a", column=True).get(df)  # returns column "a" of df
+    # >>> _Indexer("b", index=True).get(df)  # returns index level "b" of df
+
+    def __init__(self, name: Any, column=False, index=False):
+        if column and index:
+            raise ValueError("Cannot specify both column and index")
+        self.name = name
+        self.column, self.index = column, index
+
+    def get(self, obj: Frame) -> ColumnBase:
+        # get column `self.name` from the data or from the index of `obj`
+        if self.column:
+            return obj._data[self.name]
+        else:
+            if obj._index is not None:
+                return obj._index._data[self.name]
+        raise KeyError()
+
+    def set(self, obj: Frame, value: ColumnBase, validate=False):
+        # set column `self.name` in the data or in the index of `obj`
+        if self.column:
+            obj._data.set_by_label(self.name, value, validate=validate)
+        else:
+            if obj._index is not None:
+                obj._index._data.set_by_label(
+                    self.name, value, validate=validate
+                )
+            else:
+                raise KeyError()
+
+
+def _frame_select_by_indexers(
+    frame: Frame, indexers: Iterable[_Indexer]
+) -> Frame:
+    # Build a new `Frame` from the columns of `frame` that `indexers`
+    # refer to. Indexers flagged as `index` populate the index of the
+    # result; all others populate its data columns.
+    container_cls = frame._data.__class__
+    selected_index, selected_data = container_cls(), container_cls()
+
+    for indexer in indexers:
+        target = selected_index if indexer.index else selected_data
+        target.set_by_label(indexer.name, indexer.get(frame), validate=False)
+
+    new_index = None
+    if selected_index:
+        new_index = cudf.Index._from_data(selected_index)
+    return cudf.core.frame.Frame(data=selected_data, index=new_index)
+
+
+def _match_join_keys(
+    lcol: ColumnBase, rcol: ColumnBase, how: str
+) -> Tuple[ColumnBase, ColumnBase]:
+    # Cast the join keys `lcol` and `rcol` to a common dtype and return
+    # them as a `(left, right)` pair of columns.
+    # If the dtypes already match, the columns are returned unchanged.
+
+    common_type = None
+
+    # dtypes of the incoming key columns
+    ltype = lcol.dtype
+    rtype = rcol.dtype
+
+    # if either side is categorical, different logic
+    if isinstance(ltype, CategoricalDtype) or isinstance(
+        rtype, CategoricalDtype
+    ):
+        return _match_categorical_dtypes(lcol, rcol, how)
+
+    if pd.api.types.is_dtype_equal(ltype, rtype):
+        return lcol, rcol
+
+    if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
+        common_type = (
+            max(ltype, rtype)
+            if ltype.kind == rtype.kind
+            else np.find_common_type([], (ltype, rtype))
+        )
+
+    elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
+        rtype, np.datetime64
+    ):
+        common_type = max(ltype, rtype)
+
+    if how == "left":
+        if rcol.fillna(0).can_cast_safely(ltype):
+            return lcol, rcol.astype(ltype)
+        else:
+            warnings.warn(
+                f"Can't safely cast column from {rtype} to {ltype}, "
+                f"upcasting to {common_type}."
+            )
+
+    return lcol.astype(common_type), rcol.astype(common_type)
+
+
+def _match_categorical_dtypes(
+    lcol: ColumnBase, rcol: ColumnBase, how: str
+) -> Tuple[ColumnBase, ColumnBase]:
+    # Cast the join keys lcol and rcol to a common dtype when at least
+    # one of them is categorical, and return them as a (left, right) pair.
+    ltype, rtype = lcol.dtype, rcol.dtype
+
+    if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance(
+        rcol, cudf.core.column.CategoricalColumn
+    ):
+        # both sides are categorical: defer to the dedicated helper
+        return _match_categorical_dtypes_both(lcol, rcol, how)
+
+    if isinstance(ltype, CategoricalDtype):
+        if how in {"left", "leftsemi", "leftanti"}:
+            return lcol, rcol.astype(ltype)
+        common_type = ltype.categories.dtype
+    elif isinstance(rtype, CategoricalDtype):
+        common_type = rtype.categories.dtype
+    return lcol.astype(common_type), rcol.astype(common_type)
+
+
+def _match_categorical_dtypes_both(
+    lcol: CategoricalColumn, rcol: CategoricalColumn, how: str
+) -> Tuple[ColumnBase, ColumnBase]:
+    # The common type depends on both `how` and the specifics of the
+    # categorical variables to be merged.
+
+    ltype, rtype = lcol.dtype, rcol.dtype
+
+    # when both sides already have exactly the same dtype,
+    # no casting required:
+    if ltype == rtype:
+        return lcol, rcol
+
+    # Merging categorical variables when only one side is ordered is
+    # ambiguous and not allowed.
+    if ltype.ordered != rtype.ordered:
+        raise TypeError(
+            "Merging on categorical variables with mismatched"
+            " ordering is ambiguous"
+        )
+
+    if ltype.ordered and rtype.ordered:
+        # if we get to here, categories must be what causes the
+        # dtype equality check to fail. And we can never merge
+        # two ordered categoricals with different categories
+        raise TypeError(
+            f"{how} merge between categoricals with "
+            "different categories is only valid when "
+            "neither side is ordered"
+        )
+
+    # the following should now always hold
+    assert not ltype.ordered and not rtype.ordered
+
+    if how == "inner":
+        # match on the decategorized keys -- we must cast them back later
+        return _match_join_keys(
+            lcol.cat()._decategorize(), rcol.cat()._decategorize(), how,
+        )
+    elif how in {"left", "leftanti", "leftsemi"}:
+        # always cast to left type
+        return lcol, rcol.astype(ltype)
+    else:
+        # any other join type: cast both sides to the merged categories
+        merged_categories = cudf.concat(
+            [ltype.categories, rtype.categories]
+        ).unique()
+        common_type = cudf.CategoricalDtype(
+            categories=merged_categories, ordered=False
+        )
+        return lcol.astype(common_type), rcol.astype(common_type)
+
+
+def _coerce_to_tuple(obj):
+    # strings count as scalars here, even though they are iterable
+    if isinstance(obj, str) or not isinstance(obj, collections.abc.Iterable):
+        return (obj,)
+    return tuple(obj)
diff --git a/python/cudf/cudf/core/join/casting_logic.py b/python/cudf/cudf/core/join/casting_logic.py
deleted file mode 100644
index eb85cecd14d..00000000000
--- a/python/cudf/cudf/core/join/casting_logic.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
-
-import warnings
-
-import numpy as np
-import pandas as pd
-
-import cudf
-from cudf.core.dtypes import CategoricalDtype
-
-
-def _input_to_libcudf_castrules_both_cat(lcol, rcol, how):
-    """
-    Based off the left and right operands, determine the libcudf
-    merge dtype or error for corner cases where the merge cannot
-    proceed. This function handles categorical variables.
-    Categorical variable typecasting logic depends on both `how`
-    and the specifics of the categorical variables to be merged.
-    Merging categorical variables when only one side is ordered
-    is ambiguous and not allowed. Merging when both categoricals
-    are ordered is allowed, but only when the categories are
-    exactly equal and have equal ordering, and will result in the
-    common dtype.
-    When both sides are unordered, the result categorical depends
-    on the kind of join:
-    - For inner joins, the result will be the intersection of the
-    categories
-    - For left or right joins, the result will be the the left or
-    right dtype respectively. This extends to semi and anti joins.
-    - For outer joins, the result will be the union of categories
-    from both sides.
-
-    """
-    ltype = lcol.dtype
-    rtype = rcol.dtype
-
-    # this function is only to be used to resolve the result when both
-    # sides are categorical
-    if not isinstance(ltype, CategoricalDtype) and isinstance(
-        rtype, CategoricalDtype
-    ):
-        raise TypeError("Both operands must be CategoricalDtype")
-
-    # true for every configuration
-    if ltype == rtype:
-        return ltype
-
-    # raise for any join where ordering doesn't match
-    if ltype.ordered != rtype.ordered:
-        raise TypeError(
-            "Merging on categorical variables with mismatched"
-            " ordering is ambiguous"
-        )
-    elif ltype.ordered and rtype.ordered:
-        # if we get to here, categories must be what causes the
-        # dtype equality check to fail. And we can never merge
-        # two ordered categoricals with different categories
-        raise TypeError(
-            f"{how} merge between categoricals with "
-            "different categories is only valid when "
-            "neither side is ordered"
-        )
-
-    elif how == "inner":
-        # neither ordered, so categories must be different
-        # demote to underlying types
-        return _input_to_libcudf_castrules_any(
-            ltype.categories, rtype.categories, how
-        )
-
-    elif how == "left":
-        return ltype
-    elif how == "right":
-        return rtype
-
-    elif how == "outer":
-        new_cats = cudf.concat([ltype.categories, rtype.categories]).unique()
-        return cudf.CategoricalDtype(categories=new_cats, ordered=False)
-
-
-def _input_to_libcudf_castrules_any_cat(lcol, rcol, how):
-
-    l_is_cat = isinstance(lcol.dtype, CategoricalDtype)
-    r_is_cat = isinstance(rcol.dtype, CategoricalDtype)
-
-    if l_is_cat and r_is_cat:
-        return _input_to_libcudf_castrules_both_cat(lcol, rcol, how)
-    elif l_is_cat or r_is_cat:
-        if l_is_cat and how == "left":
-            return lcol.dtype
-        if r_is_cat and how == "right":
-            return rcol.dtype
-        return (
-            lcol.dtype.categories.dtype
-            if l_is_cat
-            else rcol.dtype.categories.dtype
-        )
-    else:
-        raise ValueError("Neither operand is categorical")
-
-
-def _input_to_libcudf_castrules_any(lcol, rcol, how):
-    """
-    Determine what dtype the left and right hand
-    input columns must be cast to for a libcudf
-    join to proceed.
-    """
-
-    cast_warn = (
-        "can't safely cast column from {} with type"
-        " {} to {}, upcasting to {}"
-    )
-
-    ltype = lcol.dtype
-    rtype = rcol.dtype
-
-    # if either side is categorical, different logic
-    if isinstance(ltype, CategoricalDtype) or isinstance(
-        rtype, CategoricalDtype
-    ):
-        return _input_to_libcudf_castrules_any_cat(lcol, rcol, how)
-
-    libcudf_join_type = None
-    if pd.api.types.is_dtype_equal(ltype, rtype):
-        libcudf_join_type = ltype
-    elif how == "left":
-        check_col = rcol.fillna(0)
-        if not check_col.can_cast_safely(ltype):
-            libcudf_join_type = _input_to_libcudf_castrules_any(
-                lcol, rcol, "inner"
-            )
-            warnings.warn(
-                cast_warn.format("right", rtype, ltype, libcudf_join_type)
-            )
-        else:
-            libcudf_join_type = ltype
-    elif how == "right":
-        check_col = lcol.fillna(0)
-        if not check_col.can_cast_safely(rtype):
-            libcudf_join_type = _input_to_libcudf_castrules_any(
-                lcol, rcol, "inner"
-            )
-            warnings.warn(
-                cast_warn.format("left", ltype, rtype, libcudf_join_type)
-            )
-        else:
-            libcudf_join_type = rtype
-    elif how in {"inner", "outer"}:
-        if (np.issubdtype(ltype, np.number)) and (
-            np.issubdtype(rtype, np.number)
-        ):
-            if ltype.kind == rtype.kind:
-                # both ints or both floats
-                libcudf_join_type = max(ltype, rtype)
-            else:
-                libcudf_join_type = np.find_common_type([], [ltype, rtype])
-        elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
-            rtype, np.datetime64
-        ):
-            libcudf_join_type = max(ltype, rtype)
-    return libcudf_join_type
-
-
-def _libcudf_to_output_castrules(lcol, rcol, how):
-    """
-    Determine what dtype an output merge key column should be
-    cast to after it has been processed by libcudf. Determine
-    if a column should be promoted to a categorical datatype.
-    For inner merges between unordered categoricals, we get a
-    new categorical variable containing the intersection of
-    the two source variables. For left or right joins, we get
-    the original categorical variable from whichever was the
-    major operand of the join, e.g. left for a left join or
-    right for a right join. In the case of an outer join, the
-    result will be a new categorical variable with both sets
-    of categories.
-    """
-    merge_return_type = None
-
-    ltype = lcol.dtype
-    rtype = rcol.dtype
-
-    if pd.api.types.is_dtype_equal(ltype, rtype):
-        return ltype
-
-    l_is_cat = isinstance(ltype, CategoricalDtype)
-    r_is_cat = isinstance(rtype, CategoricalDtype)
-
-    # we  currently only need to do this for categorical variables
-    if how == "inner":
-        if l_is_cat and r_is_cat:
-            merge_return_type = "category"
-    elif how == "left":
-        if l_is_cat:
-            merge_return_type = ltype
-    elif how == "right":
-        if r_is_cat:
-            merge_return_type = rtype
-    elif how == "outer":
-        if l_is_cat and r_is_cat:
-            new_cats = cudf.concat(
-                [ltype.categories, rtype.categories]
-            ).unique()
-            merge_return_type = cudf.CategoricalDtype(
-                categories=new_cats, ordered=ltype.ordered
-            )
-    return merge_return_type
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index c6da3ee8dc4..1a4826d0570 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -1,22 +1,85 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
-import itertools
+from __future__ import annotations
 
-import pandas as pd
+import functools
+from collections import namedtuple
+from typing import TYPE_CHECKING, Callable, Tuple
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.join import compute_result_col_names
-from cudf.core.join.casting_logic import (
-    _input_to_libcudf_castrules_any,
-    _libcudf_to_output_castrules,
+from cudf.core.join._join_helpers import (
+    _coerce_to_tuple,
+    _frame_select_by_indexers,
+    _Indexer,
+    _match_join_keys,
 )
 
+if TYPE_CHECKING:
+    from cudf.core.frame import Frame
+
+
+def merge(
+    lhs,
+    rhs,
+    *,
+    on,
+    left_on,
+    right_on,
+    left_index,
+    right_index,
+    how,
+    sort,
+    method,
+    indicator,
+    suffixes,
+):
+    # Dispatch on the join type: "leftsemi" and "leftanti" joins are
+    # performed by `MergeSemi`, every other join type by `Merge`.
+    # The chosen object is built and its merge is performed right away.
+    semi_or_anti = how in {"leftsemi", "leftanti"}
+    merge_cls = MergeSemi if semi_or_anti else Merge
+    return merge_cls(
+        lhs,
+        rhs,
+        on=on,
+        left_on=left_on,
+        right_on=right_on,
+        left_index=left_index,
+        right_index=right_index,
+        how=how,
+        sort=sort,
+        method=method,
+        indicator=indicator,
+        suffixes=suffixes,
+    ).perform_merge()
+
+
+_JoinKeys = namedtuple("JoinKeys", ["left", "right"])
+
 
 class Merge(object):
+    # A namedtuple of indexers representing the left and right keys
+    _keys: _JoinKeys
+
+    # The joiner function must have the following signature:
+    #
+    #     def joiner(
+    #         lhs: Frame,
+    #         rhs: Frame
+    #     ) -> Tuple[Optional[Column], Optional[Column]]:
+    #          ...
+    #
+    # where `lhs` and `rhs` are Frames composed of the left and right
+    # join key. The `joiner` returns a tuple of two Columns
+    # representing the rows to gather from the left- and right- side
+    # tables respectively.
+    _joiner: Callable
+
     def __init__(
         self,
         lhs,
         rhs,
+        *,
         on,
         left_on,
         right_on,
@@ -24,8 +87,6 @@ def __init__(
         right_index,
         how,
         sort,
-        lsuffix,
-        rsuffix,
         method,
         indicator,
         suffixes,
@@ -60,140 +121,252 @@ def __init__(
         sort : bool
             Boolean flag indicating if the output Frame is to be
             sorted on the output's join keys, in left to right order.
-        lsuffix : string
-            The suffix to be appended to left hand column names that
-            are found to exist in the right frame, but are not specified
-            as join keys themselves.
-        rsuffix : string
-            The suffix to be appended to right hand column names that
-            are found to exist in the left frame, but are not specified
-            as join keys themselves.
         suffixes : list like
             Left and right suffixes specified together, unpacked into lsuffix
             and rsuffix.
         """
+        self._validate_merge_params(
+            lhs,
+            rhs,
+            on=on,
+            left_on=left_on,
+            right_on=right_on,
+            left_index=left_index,
+            right_index=right_index,
+            how=how,
+            suffixes=suffixes,
+        )
+        self._joiner = functools.partial(libcudf.join.join, how=how)
+
         self.lhs = lhs
         self.rhs = rhs
+        self.on = on
+        self.left_on = left_on
+        self.right_on = right_on
         self.left_index = left_index
         self.right_index = right_index
-        self.method = method
-        self.sort = sort
-
-        # check that the merge is valid
-
-        self.validate_merge_cfg(
-            lhs,
-            rhs,
-            on,
-            left_on,
-            right_on,
-            left_index,
-            right_index,
-            how,
-            lsuffix,
-            rsuffix,
-            suffixes,
-        )
         self.how = how
-        self.preprocess_merge_params(
-            on, left_on, right_on, lsuffix, rsuffix, suffixes
-        )
-
-    def perform_merge(self):
-        """
-        Call libcudf to perform a merge between the operands. If
-        necessary, cast the input key columns to compatible types.
-        Potentially also cast the output back to categorical.
-        """
-        output_dtypes = self.compute_output_dtypes()
-        self.typecast_input_to_libcudf()
-        libcudf_result = libcudf.join.join(
-            self.lhs,
-            self.rhs,
-            self.how,
-            self.method,
-            left_on=self.left_on,
-            right_on=self.right_on,
-            left_index=self.left_index,
-            right_index=self.right_index,
-        )
-        result = self.out_class._from_table(libcudf_result)
-        result = self.typecast_libcudf_to_output(result, output_dtypes)
-        if isinstance(result, cudf.Index):
-            return result
-        else:
-            return result[
-                compute_result_col_names(self.lhs, self.rhs, self.how)
-            ]
+        self.sort = sort
+        if suffixes:
+            self.lsuffix, self.rsuffix = suffixes
+        self._compute_join_keys()
 
-    def preprocess_merge_params(
-        self, on, left_on, right_on, lsuffix, rsuffix, suffixes
-    ):
-        """
-        Translate a valid configuration of user input parameters into
-        the subset of input configurations handled by the cython layer.
-        Apply suffixes to columns.
-        """
+    @property
+    def _out_class(self):
+        # type of the result
+        out_class = cudf.DataFrame
 
-        self.out_class = cudf.DataFrame
         if isinstance(self.lhs, cudf.MultiIndex) or isinstance(
             self.rhs, cudf.MultiIndex
         ):
-            self.out_class = cudf.MultiIndex
+            out_class = cudf.MultiIndex
         elif isinstance(self.lhs, cudf.Index):
-            self.out_class = self.lhs.__class__
+            out_class = self.lhs.__class__
+        return out_class
 
-        if on:
-            on = [on] if isinstance(on, str) else list(on)
-            left_on = right_on = on
-        else:
-            if left_on:
-                left_on = (
-                    [left_on] if isinstance(left_on, str) else list(left_on)
-                )
-            if right_on:
-                right_on = (
-                    [right_on] if isinstance(right_on, str) else list(right_on)
-                )
+    def perform_merge(self) -> Frame:
+        lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs)
 
-        same_named_columns = set(self.lhs._data.keys()) & set(
-            self.rhs._data.keys()
+        left_table = _frame_select_by_indexers(lhs, self._keys.left)
+        right_table = _frame_select_by_indexers(rhs, self._keys.right)
+
+        left_rows, right_rows = self._joiner(
+            left_table, right_table, how=self.how,
         )
-        if not (left_on or right_on) and not (
-            self.left_index and self.right_index
-        ):
-            left_on = right_on = list(same_named_columns)
-
-        no_suffix_cols = []
-        if left_on and right_on:
-            no_suffix_cols = [
-                left_name
-                for left_name, right_name in zip(left_on, right_on)
-                if left_name == right_name and left_name in same_named_columns
-            ]
+        lhs, rhs = self._restore_categorical_keys(lhs, rhs)
 
-        if suffixes:
-            lsuffix, rsuffix = suffixes
-        for name in same_named_columns:
-            if name not in no_suffix_cols:
-                self.lhs.rename(
-                    {name: f"{name}{lsuffix}"}, inplace=True, axis=1
+        left_result = cudf.core.frame.Frame()
+        right_result = cudf.core.frame.Frame()
+
+        gather_index = self.left_index or self.right_index
+        if left_rows is not None:
+            left_result = lhs._gather(
+                left_rows, nullify=True, keep_index=gather_index
+            )
+        if right_rows is not None:
+            right_result = rhs._gather(
+                right_rows, nullify=True, keep_index=gather_index
+            )
+
+        result = self._merge_results(left_result, right_result)
+
+        if self.sort:
+            result = self._sort_result(result)
+        return result
+
+    def _compute_join_keys(self):
+        # Computes self._keys
+        if (
+            self.left_index
+            or self.right_index
+            or self.left_on
+            or self.right_on
+        ):
+            left_keys = []
+            right_keys = []
+            if self.left_index:
+                left_keys.extend(
+                    [
+                        _Indexer(name=on, index=True)
+                        for on in self.lhs.index.names
+                    ]
                 )
-                self.rhs.rename(
-                    {name: f"{name}{rsuffix}"}, inplace=True, axis=1
+            if self.left_on:
+                # TODO: require left_on or left_index to be specified
+                left_keys.extend(
+                    [
+                        _Indexer(name=on, column=True)
+                        for on in _coerce_to_tuple(self.left_on)
+                    ]
                 )
-                if left_on and name in left_on:
-                    left_on[left_on.index(name)] = f"{name}{lsuffix}"
-                if right_on and name in right_on:
-                    right_on[right_on.index(name)] = f"{name}{rsuffix}"
+            if self.right_index:
+                right_keys.extend(
+                    [
+                        _Indexer(name=on, index=True)
+                        for on in self.rhs.index.names
+                    ]
+                )
+            if self.right_on:
+                # TODO: require right_on or right_index to be specified
+                right_keys.extend(
+                    [
+                        _Indexer(name=on, column=True)
+                        for on in _coerce_to_tuple(self.right_on)
+                    ]
+                )
+        else:
+            # Use `on` if provided. Otherwise,
+            # implicitly use identically named columns as the key columns:
+            on_names = (
+                _coerce_to_tuple(self.on)
+                if self.on is not None
+                else set(self.lhs._data) & set(self.rhs._data)
+            )
+            left_keys = [_Indexer(name=on, column=True) for on in on_names]
+            right_keys = [_Indexer(name=on, column=True) for on in on_names]
+
+        if len(left_keys) != len(right_keys):
+            raise ValueError(
+                "Merge operands must have same number of join key columns"
+            )
+
+        self._keys = _JoinKeys(left=left_keys, right=right_keys)
+
+    def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame:
+        # Merge the Frames `left_result` and `right_result` into a single
+        # `Frame`, suffixing column names if necessary.
+
+        # If two key columns have the same name, a single output column appears
+        # in the result. For outer joins, the two key columns are combined by
+        # filling nulls in the left key column with corresponding values from
+        # the right key column. For all other join types, the key column from
+        # the rhs is simply dropped (see the renaming step below):
+        if self.how == "outer":
+            for lkey, rkey in zip(*self._keys):
+                if lkey.name == rkey.name:
+                    # fill nulls in lhs from values in the rhs
+                    lkey.set(
+                        left_result,
+                        lkey.get(left_result).fillna(rkey.get(right_result)),
+                        validate=False,
+                    )
+
+        # Compute the result column names:
+        # left_names and right_names will be mappings of input column names
+        # to the corresponding names in the final result.
+        left_names = dict(zip(left_result._data, left_result._data))
+        right_names = dict(zip(right_result._data, right_result._data))
+
+        # For columns of left_result and right_result that share a name:
+        # - key columns: keep only the left column
+        # - non-key columns: use suffixes to differentiate them in the result
+        # `on` may be a single label, so it is coerced to a tuple first; a bare
+        # string would turn the membership test below into a substring match.
+        common_names = set(left_names) & set(right_names)
+
+        if self.on:
+            key_columns_with_same_name = _coerce_to_tuple(self.on)
+        else:
+            key_columns_with_same_name = [
+                lkey.name
+                for lkey, rkey in zip(*self._keys)
+                if (
+                    (lkey.index, rkey.index) == (False, False)
+                    and lkey.name == rkey.name
+                )
+            ]
+        for name in common_names:
+            if name not in key_columns_with_same_name:
+                left_names[name] = f"{name}{self.lsuffix}"
+                right_names[name] = f"{name}{self.rsuffix}"
+            else:
+                del right_names[name]
+
+        # Assemble the data columns of the result:
+        data = left_result._data.__class__()
+
+        for lcol in left_names:
+            data.set_by_label(
+                left_names[lcol], left_result._data[lcol], validate=False
+            )
+        for rcol in right_names:
+            data.set_by_label(
+                right_names[rcol], right_result._data[rcol], validate=False
+            )
+
+        # Index of the result:
+        if self.left_index and self.right_index:
+            index = left_result._index
+        elif self.left_index:
+            # left_index and right_on
+            index = right_result._index
+        elif self.right_index:
+            # right_index and left_on
+            index = left_result._index
+        else:
+            index = None
 
-        self.left_on = left_on if left_on is not None else []
-        self.right_on = right_on if right_on is not None else []
-        self.lsuffix = lsuffix
-        self.rsuffix = rsuffix
+        # Construct result from data and index:
+        result = self._out_class._from_data(data=data, index=index)
+
+        return result
+
+    def _sort_result(self, result: Frame) -> Frame:
+        # Pandas sorts on the key columns in the
+        # same order as given in 'on'. If the indices are used as
+        # keys, the index will be sorted. If one index is specified,
+        # the key columns on the other side will be used to sort.
+        if self.on:
+            if isinstance(result, cudf.Index):
+                sort_order = result._get_sorted_inds()
+            else:
+                # need a list instead of a tuple here because
+                # _get_sorted_inds calls down to ColumnAccessor.get_by_label
+                # which handles lists and tuples differently
+                sort_order = result._get_sorted_inds(
+                    list(_coerce_to_tuple(self.on))
+                )
+            return result._gather(sort_order, keep_index=False)
+        by = []
+        if self.left_index and self.right_index:
+            if result._index is not None:
+                by.extend(result._index._data.columns)
+        if self.left_on:
+            by.extend(
+                [result._data[col] for col in _coerce_to_tuple(self.left_on)]
+            )
+        if self.right_on:
+            by.extend(
+                [result._data[col] for col in _coerce_to_tuple(self.right_on)]
+            )
+        if by:
+            to_sort = cudf.DataFrame._from_columns(by)
+            sort_order = to_sort.argsort()
+            result = result._gather(sort_order)
+        return result
 
     @staticmethod
-    def validate_merge_cfg(
+    def _validate_merge_params(
         lhs,
         rhs,
         on,
@@ -202,14 +375,11 @@ def validate_merge_cfg(
         left_index,
         right_index,
         how,
-        lsuffix,
-        rsuffix,
         suffixes,
     ):
         """
         Error for various invalid combinations of merge input parameters
         """
-
         # must actually support the requested merge type
         if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}:
             raise NotImplementedError(f"{how} merge not supported yet")
@@ -227,52 +397,8 @@ def validate_merge_cfg(
         ):
             raise ValueError("Can not merge on unnamed Series")
 
-        # Keys need to be in their corresponding operands
-        if on:
-            if isinstance(on, str):
-                on_keys = [on]
-            elif isinstance(on, tuple):
-                on_keys = list(on)
-            else:
-                on_keys = on
-            for key in on_keys:
-                if not (key in lhs._data.keys() and key in rhs._data.keys()):
-                    raise KeyError(f"on key {on} not in both operands")
-        elif left_on and right_on:
-            left_on_keys = (
-                [left_on] if not isinstance(left_on, list) else left_on
-            )
-            right_on_keys = (
-                [right_on] if not isinstance(right_on, list) else right_on
-            )
-
-            for key in left_on_keys:
-                if key not in lhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in left operand')
-            for key in right_on_keys:
-                if key not in rhs._data.keys():
-                    raise KeyError(f'Key "{key}" not in right operand')
-
-        # Require same total number of columns to join on in both operands
-        len_left_on = 0
-        len_right_on = 0
-        if left_on:
-            len_left_on += (
-                len(left_on) if pd.api.types.is_list_like(left_on) else 1
-            )
-        if right_on:
-            len_right_on += (
-                len(right_on) if pd.api.types.is_list_like(right_on) else 1
-            )
-        if not (len_left_on + left_index * lhs._num_indices) == (
-            len_right_on + right_index * rhs._num_indices
-        ):
-            raise ValueError(
-                "Merge operands must have same number of join key columns"
-            )
-
         # If nothing specified, must have common cols to use implicitly
-        same_named_columns = set(lhs._data.keys()) & set(rhs._data.keys())
+        same_named_columns = set(lhs._data) & set(rhs._data)
         if (
             not (left_index or right_index)
             and not (left_on or right_on)
@@ -280,8 +406,7 @@ def validate_merge_cfg(
         ):
             raise ValueError("No common columns to perform merge on")
 
-        if suffixes:
-            lsuffix, rsuffix = suffixes
+        lsuffix, rsuffix = suffixes
         for name in same_named_columns:
             if name == left_on == right_on:
                 continue
@@ -297,134 +422,59 @@ def validate_merge_cfg(
                         "lsuffix and rsuffix are not defined"
                     )
 
-    def typecast_input_to_libcudf(self):
-        """
-        Check each pair of join keys in the left and right hand
-        operands and apply casting rules to match their types
-        before passing the result to libcudf.
-        """
-        lhs_keys, rhs_keys, lhs_cols, rhs_cols = [], [], [], []
-        if self.left_index:
-            lhs_keys.append(self.lhs.index._data.keys())
-            lhs_cols.append(self.lhs.index)
-        if self.right_index:
-            rhs_keys.append(self.rhs.index._data.keys())
-            rhs_cols.append(self.rhs.index)
-        if self.left_on:
-            lhs_keys.append(self.left_on)
-            lhs_cols.append(self.lhs)
-        if self.right_on:
-            rhs_keys.append(self.right_on)
-            rhs_cols.append(self.rhs)
-
-        for l_key_grp, r_key_grp, l_col_grp, r_col_grp in zip(
-            lhs_keys, rhs_keys, lhs_cols, rhs_cols
-        ):
-            for l_key, r_key in zip(l_key_grp, r_key_grp):
-                to_dtype = _input_to_libcudf_castrules_any(
-                    l_col_grp._data[l_key], r_col_grp._data[r_key], self.how
-                )
-                l_col_grp._data[l_key] = l_col_grp._data[l_key].astype(
-                    to_dtype
-                )
-                r_col_grp._data[r_key] = r_col_grp._data[r_key].astype(
-                    to_dtype
-                )
-
-    def compute_output_dtypes(self):
-        """
-        Determine what datatypes should be applied to the result
-        of a libcudf join, baesd on the original left and right
-        frames.
-        """
-
-        index_dtypes = {}
-        l_data_join_cols = {}
-        r_data_join_cols = {}
-
-        data_dtypes = {
-            name: col.dtype
-            for name, col in itertools.chain(
-                self.lhs._data.items(), self.rhs._data.items()
+    def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]:
+        # Match the dtypes of the key columns from lhs and rhs
+        out_lhs = lhs.copy(deep=False)
+        out_rhs = rhs.copy(deep=False)
+        for left_key, right_key in zip(*self._keys):
+            lcol, rcol = left_key.get(lhs), right_key.get(rhs)
+            lcol_casted, rcol_casted = _match_join_keys(
+                lcol, rcol, how=self.how
             )
-        }
-
-        if self.left_index and self.right_index:
-            l_idx_join_cols = list(self.lhs.index._data.values())
-            r_idx_join_cols = list(self.rhs.index._data.values())
-        elif self.left_on and self.right_index:
-            # Keep the orignal dtypes in the LEFT index if possible
-            # should trigger a bunch of no-ops
-            l_idx_join_cols = list(self.lhs.index._data.values())
-            r_idx_join_cols = list(self.lhs.index._data.values())
-            for i, name in enumerate(self.left_on):
-                l_data_join_cols[name] = self.lhs._data[name]
-                r_data_join_cols[name] = list(self.rhs.index._data.values())[i]
-
-        elif self.left_index and self.right_on:
-            # see above
-            l_idx_join_cols = list(self.rhs.index._data.values())
-            r_idx_join_cols = list(self.rhs.index._data.values())
-            for i, name in enumerate(self.right_on):
-                l_data_join_cols[name] = list(self.lhs.index._data.values())[i]
-                r_data_join_cols[name] = self.rhs._data[name]
-
-        if self.left_on and self.right_on:
-            l_data_join_cols = self.lhs._data
-            r_data_join_cols = self.rhs._data
-
-        if self.left_index or self.right_index:
-            for i in range(len(self.lhs.index._data.items())):
-                index_dtypes[i] = _libcudf_to_output_castrules(
-                    l_idx_join_cols[i], r_idx_join_cols[i], self.how
-                )
-
-        for name in itertools.chain(self.left_on, self.right_on):
-            if name in self.left_on and name in self.right_on:
-                data_dtypes[name] = _libcudf_to_output_castrules(
-                    l_data_join_cols[name], r_data_join_cols[name], self.how
-                )
-        return (index_dtypes, data_dtypes)
+            if lcol is not lcol_casted:
+                left_key.set(out_lhs, lcol_casted, validate=False)
+            if rcol is not rcol_casted:
+                right_key.set(out_rhs, rcol_casted, validate=False)
+        return out_lhs, out_rhs
+
+    def _restore_categorical_keys(
+        self, lhs: Frame, rhs: Frame
+    ) -> Tuple[Frame, Frame]:
+        # For inner joins, any categorical keys in `self.lhs` and `self.rhs`
+        # were casted to their category type to produce `lhs` and `rhs`.
+        # Here, we cast them back.
+        out_lhs = lhs.copy(deep=False)
+        out_rhs = rhs.copy(deep=False)
+        if self.how == "inner":
+            for left_key, right_key in zip(*self._keys):
+                if isinstance(
+                    left_key.get(self.lhs).dtype, cudf.CategoricalDtype
+                ) and isinstance(
+                    right_key.get(self.rhs).dtype, cudf.CategoricalDtype
+                ):
+                    left_key.set(
+                        out_lhs,
+                        left_key.get(out_lhs).astype("category"),
+                        validate=False,
+                    )
+                    right_key.set(
+                        out_rhs,
+                        right_key.get(out_rhs).astype("category"),
+                        validate=False,
+                    )
+        return out_lhs, out_rhs
 
-    def typecast_libcudf_to_output(self, output, output_dtypes):
-        """
-        Apply precomputed output index and data column data types
-        to the output of a libcudf join.
-        """
 
-        index_dtypes, data_dtypes = output_dtypes
-        if output._index and len(index_dtypes) > 0:
-            for index_dtype, index_col_lbl, index_col in zip(
-                index_dtypes.values(),
-                output._index._data.keys(),
-                output._index._data.values(),
-            ):
-                if index_dtype:
-                    output._index._data[
-                        index_col_lbl
-                    ] = self._build_output_col(index_col, index_dtype)
-            # reconstruct the Index object as the underlying data types
-            # have changed:
-            output._index = cudf.core.index.Index._from_table(output._index)
-
-        for data_col_lbl, data_col in output._data.items():
-            data_dtype = data_dtypes[data_col_lbl]
-            if data_dtype:
-                output._data[data_col_lbl] = self._build_output_col(
-                    data_col, data_dtype
-                )
-        return output
+class MergeSemi(Merge):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._joiner = functools.partial(
+            libcudf.join.semi_join, how=kwargs["how"]
+        )
 
-    def _build_output_col(self, col, dtype):
-        if isinstance(
-            dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype)
-        ):
-            outcol = cudf.core.column.build_categorical_column(
-                categories=dtype.categories,
-                codes=col.set_mask(None),
-                mask=col.base_mask,
-                ordered=dtype.ordered,
-            )
+    def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame:
+        # semi-join result includes only lhs columns
+        if issubclass(self._out_class, cudf.Index):
+            return self._out_class._from_data(lhs._data)
         else:
-            outcol = col.astype(dtype)
-        return outcol
+            return self._out_class._from_data(lhs._data, index=lhs._index)
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 82e89bb00f4..1c1e48e7372 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import itertools
 import numbers
@@ -18,6 +19,7 @@
 from cudf._typing import DataFrameOrSeries
 from cudf.core._compat import PANDAS_GE_120
 from cudf.core.column import column
+from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import Index, as_index
 
@@ -188,6 +190,19 @@ def names(self):
     def names(self, value):
         value = [None] * self.nlevels if value is None else value
         assert len(value) == self.nlevels
+
+        if len(value) == len(set(value)):
+            # IMPORTANT: if the provided names are unique,
+            # we reconstruct self._data with the names as keys.
+            # If they are not unique, the keys of self._data
+            # and self._names will be different, which can lead
+            # to unexpected behaviour in some cases. This is
+            # definitely buggy, but we can't disallow non-unique
+            # names either...
+            self._data = self._data.__class__._create_unsafe(
+                dict(zip(value, self._data.values())),
+                level_names=self._data.level_names,
+            )
         self._names = pd.core.indexes.frozen.FrozenList(value)
 
     def rename(self, names, inplace=False):
@@ -234,7 +249,6 @@ def rename(self, names, inplace=False):
         ValueError: Length of names must match number of levels in MultiIndex.
 
         """
-
         return self.set_names(names, level=None, inplace=inplace)
 
     def set_names(self, names, level=None, inplace=False):
@@ -278,6 +292,10 @@ def set_names(self, names, level=None, inplace=False):
 
         return self._set_names(names=names, inplace=inplace)
 
+    @classmethod
+    def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex:
+        return cls.from_frame(cudf.DataFrame._from_data(data))
+
     @classmethod
     def _from_table(cls, table, names=None):
         df = cudf.DataFrame(table._data)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index a664c4fb182..71a4a48a07a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -6299,17 +6299,24 @@ def merge(
         method="hash",
         suffixes=("_x", "_y"),
     ):
-
         if left_on not in (self.name, None):
             raise ValueError(
                 "Series to other merge uses series name as key implicitly"
             )
 
-        lhs = self.copy(deep=False)
-        rhs = other.copy(deep=False)
+        if lsuffix or rsuffix:
+            raise ValueError(
+                "The lsuffix and rsuffix keywords have been replaced with the "
+                "``suffixes=`` keyword.  "
+                "Please provide the following instead: \n\n"
+                "    suffixes=('%s', '%s')"
+                % (lsuffix or "_x", rsuffix or "_y")
+            )
+        else:
+            lsuffix, rsuffix = suffixes
 
-        result = super(Series, lhs)._merge(
-            rhs,
+        result = super()._merge(
+            other,
             on=on,
             left_on=left_on,
             right_on=right_on,
@@ -6317,8 +6324,6 @@ def merge(
             right_index=right_index,
             how=how,
             sort=sort,
-            lsuffix=lsuffix,
-            rsuffix=rsuffix,
             method=method,
             indicator=False,
             suffixes=suffixes,
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index 969cf1bf549..9164bfe98d1 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -14,11 +14,13 @@
     assert_exceptions_equal,
 )
 
+_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi")
+
 
 def make_params():
     np.random.seed(0)
 
-    hows = "left,inner,outer,right,leftanti,leftsemi".split(",")
+    hows = _JOIN_TYPES
     methods = "hash,sort".split(",")
 
     # Test specific cases (1)
@@ -69,6 +71,37 @@ def pd_odd_joins(left, right, join_type):
         return left[left.index.isin(right.index)][left.columns]
 
 
+def assert_join_results_equal(expect, got, how, **kwargs):
+    if how not in _JOIN_TYPES:
+        raise ValueError(f"Unrecognized join type {how}")
+    if how == "right":
+        got = got[expect.columns]
+
+    if isinstance(expect, (pd.Series, cudf.Series)):
+        return assert_eq(
+            expect.sort_values().reset_index(drop=True),
+            got.sort_values().reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
+        if not len(
+            expect.columns
+        ):  # can't sort_values() on a df without columns
+            return assert_eq(expect, got, **kwargs)
+
+        return assert_eq(
+            expect.sort_values(expect.columns.to_list()).reset_index(
+                drop=True
+            ),
+            got.sort_values(got.columns.to_list()).reset_index(drop=True),
+            **kwargs,
+        )
+    elif isinstance(expect, (pd.Index, cudf.Index)):
+        return assert_eq(expect.sort_values(), got.sort_values(), **kwargs)
+    else:
+        raise ValueError(f"Not a join result: {type(expect).__name__}")
+
+
 @pytest.mark.parametrize("aa,bb,how,method", make_params())
 def test_dataframe_join_how(aa, bb, how, method):
     df = cudf.DataFrame()
@@ -113,12 +146,7 @@ def work_gdf(df):
             # TODO: What is the less hacky way?
             expect.index.name = "bob"
             got.index.name = "mary"
-            assert_eq(
-                got.sort_values(got.columns.to_list()).reset_index(drop=True),
-                expect.sort_values(expect.columns.to_list()).reset_index(
-                    drop=True
-                ),
-            )
+            assert_join_results_equal(expect, got, how=how)
         # if(how=='right'):
         #     _sorted_check_series(expect['a'], expect['b'],
         #                          got['a'], got['b'])
@@ -187,10 +215,7 @@ def test_dataframe_join_cats():
     expect = lhs.to_pandas().join(rhs.to_pandas())
 
     # Note: pandas make an object Index after joining
-    assert_eq(
-        got.sort_values(by="b").sort_index().reset_index(drop=True),
-        expect.reset_index(drop=True),
-    )
+    assert_join_results_equal(expect, got, how="inner")
 
     # Just do some rough checking here.
     assert list(got.columns) == ["b", "c"]
@@ -264,7 +289,7 @@ def test_dataframe_join_mismatch_cats(how):
     expect.data_col_right = expect.data_col_right.astype(np.int64)
     expect.data_col_left = expect.data_col_left.astype(np.int64)
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how, check_categorical=False)
 
 
 @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None])
@@ -323,7 +348,7 @@ def test_dataframe_merge_on(on):
         list(pddf_joined.columns)
     ).reset_index(drop=True)
 
-    assert_eq(cdf_result, pdf_result, check_like=True)
+    assert_join_results_equal(cdf_result, pdf_result, how="left")
 
     merge_func_result_cdf = (
         join_result_cudf.to_pandas()
@@ -331,7 +356,7 @@ def test_dataframe_merge_on(on):
         .reset_index(drop=True)
     )
 
-    assert_eq(merge_func_result_cdf, cdf_result, check_like=True)
+    assert_join_results_equal(merge_func_result_cdf, cdf_result, how="left")
 
 
 def test_dataframe_merge_on_unknown_column():
@@ -383,7 +408,7 @@ def test_dataframe_empty_merge():
     expect = cudf.DataFrame({"a": [], "b": [], "c": []})
     got = gdf1.merge(gdf2, how="left", on=["a"])
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 def test_dataframe_merge_order():
@@ -408,7 +433,7 @@ def test_dataframe_merge_order():
     df2["a"] = [7, 8]
 
     df = df1.merge(df2, how="left", on=["id", "a"])
-    assert_eq(gdf, df)
+    assert_join_results_equal(df, gdf, how="left")
 
 
 @pytest.mark.parametrize(
@@ -550,7 +575,7 @@ def test_merge_left_index_zero():
     pd_merge = left.merge(right, left_on="x", right_on="y")
     gd_merge = gleft.merge(gright, left_on="x", right_on="y")
 
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 @pytest.mark.parametrize(
@@ -571,7 +596,7 @@ def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs):
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, **kwargs)
     gd_merge = gleft.merge(gright, **kwargs)
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 @pytest.mark.parametrize(
@@ -592,7 +617,7 @@ def test_merge_left_right_index_left_right_on_kwargs(kwargs):
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, **kwargs)
     gd_merge = gleft.merge(gright, **kwargs)
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 def test_indicator():
@@ -608,9 +633,10 @@ def test_indicator():
 def test_merge_suffixes():
     pdf = cudf.DataFrame({"x": [1, 2, 1]})
     gdf = cudf.DataFrame({"x": [1, 2, 1]})
-    assert_eq(
+    assert_join_results_equal(
         gdf.merge(gdf, suffixes=("left", "right")),
         pdf.merge(pdf, suffixes=("left", "right")),
+        how="left",
     )
 
     assert_exceptions_equal(
@@ -628,11 +654,14 @@ def test_merge_left_on_right_on():
     gleft = cudf.from_pandas(left)
     gright = cudf.from_pandas(right)
 
-    assert_eq(left.merge(right, on="xx"), gleft.merge(gright, on="xx"))
+    assert_join_results_equal(
+        left.merge(right, on="xx"), gleft.merge(gright, on="xx"), how="left"
+    )
 
-    assert_eq(
+    assert_join_results_equal(
         left.merge(right, left_on="xx", right_on="xx"),
         gleft.merge(gright, left_on="xx", right_on="xx"),
+        how="left",
     )
 
 
@@ -708,7 +737,9 @@ def test_merge_sort(ons, hows):
     pd_merge = left.merge(right, **kwargs)
     # require the join keys themselves to be sorted correctly
     # the non-key columns will NOT match pandas ordering
-    assert_eq(pd_merge[kwargs["on"]], gd_merge[kwargs["on"]])
+    assert_join_results_equal(
+        pd_merge[kwargs["on"]], gd_merge[kwargs["on"]], how="left"
+    )
     pd_merge = pd_merge.drop(kwargs["on"], axis=1)
     gd_merge = gd_merge.drop(kwargs["on"], axis=1)
     if not pd_merge.empty:
@@ -720,7 +751,7 @@ def test_merge_sort(ons, hows):
             drop=True
         )
 
-    assert_eq(pd_merge, gd_merge)
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 @pytest.mark.parametrize(
@@ -781,7 +812,7 @@ def test_join_datetimes_index(dtype):
 
     assert gdf["d"].dtype == np.dtype(dtype)
 
-    assert_eq(pdf, gdf)
+    assert_join_results_equal(pdf, gdf, how="inner")
 
 
 def test_join_with_different_names():
@@ -791,7 +822,7 @@ def test_join_with_different_names():
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"])
     gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"])
-    assert_eq(pd_merge, gd_merge.sort_values(by=["a"]).reset_index(drop=True))
+    assert_join_results_equal(pd_merge, gd_merge, how="outer")
 
 
 def test_join_same_name_different_order():
@@ -801,9 +832,7 @@ def test_join_same_name_different_order():
     gright = cudf.from_pandas(right)
     pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"])
     gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"])
-    assert_eq(
-        pd_merge, gd_merge.sort_values(by=["a_x"]).reset_index(drop=True)
-    )
+    assert_join_results_equal(pd_merge, gd_merge, how="left")
 
 
 def test_join_empty_table_dtype():
@@ -874,10 +903,7 @@ def test_join_multi(how, column_a, column_b, column_c):
     gdf_result = gdf_result[columns]
     pdf_result = pdf_result[columns]
 
-    assert_eq(
-        gdf_result.reset_index(drop=True).fillna(-1),
-        pdf_result.sort_index().reset_index(drop=True).fillna(-1),
-    )
+    assert_join_results_equal(pdf_result, gdf_result, how="inner")
 
 
 @pytest.mark.parametrize(
@@ -967,7 +993,7 @@ def test_merge_multi(kwargs):
     expect.index = range(len(expect))
     got.index = range(len(got))
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize("dtype_l", INTEGER_TYPES)
@@ -997,7 +1023,7 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize("dtype_l", ["float32", "float64"])
@@ -1032,7 +1058,7 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize("dtype_l", NUMERIC_TYPES)
@@ -1068,7 +1094,7 @@ def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def test_typecast_on_join_no_float_round():
@@ -1092,7 +1118,7 @@ def test_typecast_on_join_no_float_round():
 
     got = gdf_l.merge(gdf_r, on="join_col", how="left")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize(
@@ -1121,10 +1147,7 @@ def test_typecast_on_join_overflow_unsafe(dtypes):
 
     with pytest.warns(
         UserWarning,
-        match=(
-            f"can't safely cast column"
-            f" from right with type {dtype_r} to {dtype_l}"
-        ),
+        match=(f"Can't safely cast column" f" from {dtype_r} to {dtype_l}"),
     ):
         merged = lhs.merge(rhs, on="a", how="left")  # noqa: F841
 
@@ -1165,7 +1188,7 @@ def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r):
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize("dtype_l", ["category", "str", "int32", "float32"])
@@ -1200,7 +1223,7 @@ def test_typecast_on_join_categorical(dtype_l, dtype_r):
     )
 
     got = gdf_l.merge(gdf_r, on="join_col", how="inner")
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def make_categorical_dataframe(categories, ordered=False):
@@ -1220,7 +1243,7 @@ def test_categorical_typecast_inner():
     expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False)
     expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key")
 
-    assert_eq(expect_data, result["key"])
+    assert_eq(expect_data, result["key"], check_categorical=False)
 
     # Equal categories, unequal ordering -> error
     left = make_categorical_dataframe([1, 2, 3], ordered=False)
@@ -1238,7 +1261,7 @@ def test_categorical_typecast_inner():
 
     expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False)
     expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key")
-    assert_eq(expect_data, result["key"])
+    assert_eq(expect_data, result["key"], check_categorical=False)
 
     # One is ordered -> error
     left = make_categorical_dataframe([1, 2, 3], ordered=False)
@@ -1427,20 +1450,10 @@ def test_index_join(lhs, rhs, how, level):
     g_lhs = l_df.set_index(lhs).index
     g_rhs = r_df.set_index(rhs).index
 
-    expected = (
-        p_lhs.join(p_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
-    got = (
-        g_lhs.join(g_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
+    expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False)
+    got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
 
 def test_index_join_corner_cases():
@@ -1461,20 +1474,10 @@ def test_index_join_corner_cases():
     p_rhs = r_pdf.set_index(rhs).index
     g_lhs = l_df.set_index(lhs).index
     g_rhs = r_df.set_index(rhs).index
-    expected = (
-        p_lhs.join(p_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
-    got = (
-        g_lhs.join(g_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
+    expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False)
+    got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
     # sort is supported only in case of two non-MultiIndex join
     # Join when column name doesn't match with level
@@ -1490,7 +1493,7 @@ def test_index_join_corner_cases():
     expected = p_lhs.join(p_rhs, how=how, sort=True)
     got = g_lhs.join(g_rhs, how=how, sort=True)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
     # Pandas Index.join on categorical column returns generic column
     # but cudf will be returning a categorical column itself.
@@ -1504,22 +1507,12 @@ def test_index_join_corner_cases():
     p_rhs = r_pdf.set_index(rhs).index
     g_lhs = l_df.set_index(lhs).index
     g_rhs = r_df.set_index(rhs).index
-    expected = (
-        p_lhs.join(p_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
-    got = (
-        g_lhs.join(g_rhs, level=level, how=how)
-        .to_frame(index=False)
-        .sort_values(by=lhs)
-        .reset_index(drop=True)
-    )
+    expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False)
+    got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False)
 
     got["a"] = got["a"].astype(expected["a"].dtype)
 
-    assert_eq(expected, got)
+    assert_join_results_equal(expected, got, how=how)
 
 
 def test_index_join_exception_cases():
@@ -1573,7 +1566,7 @@ def test_typecast_on_join_indexes():
 
     got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def test_typecast_on_join_multiindices():
@@ -1624,7 +1617,7 @@ def test_typecast_on_join_multiindices():
     expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"])
     got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 def test_typecast_on_join_indexes_matching_categorical():
@@ -1651,7 +1644,7 @@ def test_typecast_on_join_indexes_matching_categorical():
     expect = expect.set_index("join_col")
     got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="inner")
 
 
 @pytest.mark.parametrize(
@@ -1703,9 +1696,10 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs):
     expect = check_lhs.merge(check_rhs, how=how, **kwargs)
     got = lhs.merge(rhs, how=how, **kwargs)
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how)
 
 
+@pytest.mark.xfail(reason="Cannot sort values of list dtype")
 @pytest.mark.parametrize(
     "how", ["left", "inner", "right", "leftanti", "leftsemi"]
 )
@@ -1730,4 +1724,17 @@ def test_merge_with_lists(how):
     expect = pd_left.merge(pd_right, on="a")
     got = gd_left.merge(gd_right, on="a")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how)
+
+
+def test_join_renamed_index():
+    df = cudf.DataFrame(
+        {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]}
+    ).set_index([0, 1])
+    df.index.names = ["a", "b"]  # doesn't actually change df._index._data
+    # Self-merge on the index; pandas supplies the expected result.
+    expect = df.to_pandas().merge(
+        df.to_pandas(), left_index=True, right_index=True
+    )
+    got = df.merge(df, left_index=True, right_index=True, how="inner")
+    assert_join_results_equal(expect, got, how="inner")
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 8b1ad696f04..2ca6bc622be 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -17,6 +17,7 @@
 from cudf.core._compat import PANDAS_GE_110
 from cudf.core.column.string import StringColumn
 from cudf.core.index import StringIndex, as_index
+from cudf.tests.test_joining import assert_join_results_equal
 from cudf.tests.utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -919,16 +920,12 @@ def test_string_split(data, pat, n, expand):
 
 
 @pytest.mark.parametrize(
-    "str_data,str_data_raise",
-    [
-        ([], 0),
-        (["a", "b", "c", "d", "e"], 0),
-        ([None, None, None, None, None], 1),
-    ],
+    "str_data",
+    [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]],
 )
 @pytest.mark.parametrize("num_keys", [1, 2, 3])
 @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"])
-def test_string_join_key(str_data, str_data_raise, num_keys, how):
+def test_string_join_key(str_data, num_keys, how):
     other_data = [1, 2, 3, 4, 5][: len(str_data)]
 
     pdf = pd.DataFrame()
@@ -942,19 +939,17 @@ def test_string_join_key(str_data, str_data_raise, num_keys, how):
     pdf2 = pdf.copy()
     gdf2 = gdf.copy()
 
-    expectation = raise_builder(
-        [0 if how == "right" else str_data_raise], (AssertionError)
-    )
+    expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how)
+    got = gdf.merge(gdf2, on=list(range(num_keys)), how=how)
 
-    with expectation:
-        expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how)
-        got = gdf.merge(gdf2, on=list(range(num_keys)), how=how)
+    if len(expect) == 0 and len(got) == 0:
+        expect = expect.reset_index(drop=True)
+        got = got[expect.columns]  # reorder columns
 
-        if len(expect) == 0 and len(got) == 0:
-            expect = expect.reset_index(drop=True)
-            got = got[expect.columns]
+    if how == "right":
+        got = got[expect.columns]  # reorder columns
 
-        assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how=how)
 
 
 @pytest.mark.parametrize(
@@ -998,7 +993,7 @@ def test_string_join_key_nulls(str_data_nulls):
 
     expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64")
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize(
@@ -1027,7 +1022,10 @@ def test_string_join_non_key(str_data, num_cols, how):
         expect = expect.reset_index(drop=True)
         got = got[expect.columns]
 
-    assert_eq(expect, got)
+    if how == "right":
+        got = got[expect.columns]  # reorder columns
+
+    assert_join_results_equal(expect, got, how=how)
 
 
 @pytest.mark.parametrize(
@@ -1068,7 +1066,7 @@ def test_string_join_non_key_nulls(str_data_nulls):
         expect = expect.reset_index(drop=True)
         got = got[expect.columns]
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 def test_string_join_values_nulls():
@@ -1108,7 +1106,7 @@ def test_string_join_values_nulls():
     expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
     got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True)
 
-    assert_eq(expect, got)
+    assert_join_results_equal(expect, got, how="left")
 
 
 @pytest.mark.parametrize(