diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
index b6c47afe19d..1c229d2e971 100644
--- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh
+++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
@@ -57,8 +57,11 @@ struct expression_result {
   /**
    * Helper function to get the subclass type to dispatch methods to.
    */
-  Subclass& subclass() { return static_cast<Subclass&>(*this); }
-  Subclass const& subclass() const { return static_cast<Subclass const&>(*this); }
+  CUDA_DEVICE_CALLABLE Subclass& subclass() { return static_cast<Subclass&>(*this); }
+  CUDA_DEVICE_CALLABLE Subclass const& subclass() const
+  {
+    return static_cast<Subclass const&>(*this);
+  }
 
   // TODO: The index is ignored by the value subclass, but is included in this
   // signature because it is required by the implementation in the template
@@ -70,14 +73,15 @@ struct expression_result {
   // used, whereas passing it as a parameter keeps it in registers for fast
   // access at the point where indexing occurs.
   template <typename Element>
-  __device__ void set_value(cudf::size_type index, possibly_null_value_t<Element, has_nulls> result)
+  CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index,
+                                      possibly_null_value_t<Element, has_nulls> const& result)
   {
-    subclass()->set_value();
+    subclass().template set_value<Element>(index, result);
   }
 
-  __device__ bool is_valid() const { subclass()->is_valid(); }
+  CUDA_DEVICE_CALLABLE bool is_valid() const { return subclass().is_valid(); }
 
-  __device__ T value() const { subclass()->value(); }
+  CUDA_DEVICE_CALLABLE T value() const { return subclass().value(); }
 };
 
 /**
@@ -93,10 +97,11 @@ struct expression_result {
 template <typename T, bool has_nulls>
 struct value_expression_result
   : public expression_result<value_expression_result<T, has_nulls>, T, has_nulls> {
-  __device__ value_expression_result() {}
+  CUDA_DEVICE_CALLABLE value_expression_result() {}
 
   template <typename Element>
-  __device__ void set_value(cudf::size_type index, possibly_null_value_t<Element, has_nulls> result)
+  CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index,
+                                      possibly_null_value_t<Element, has_nulls> const& result)
   {
     if constexpr (std::is_same_v<Element, T>) {
       _obj = result;
@@ -108,7 +113,7 @@ struct value_expression_result
   /**
    * @brief Returns true if the underlying data is valid and false otherwise.
    */
-  __device__ bool is_valid() const
+  CUDA_DEVICE_CALLABLE bool is_valid() const
   {
     if constexpr (has_nulls) { return _obj.has_value(); }
     return true;
@@ -120,7 +125,7 @@ struct value_expression_result
    * If the underlying data is not valid, behavior is undefined. Callers should
    * use is_valid to check for validity before accessing the value.
    */
-  __device__ T value() const
+  CUDA_DEVICE_CALLABLE T value() const
   {
     // Using two separate constexprs silences compiler warnings, whereas an
     // if/else does not. An unconditional return is not ignored by the compiler
@@ -151,10 +156,13 @@ struct mutable_column_expression_result
   : public expression_result<mutable_column_expression_result<has_nulls>,
                              mutable_column_device_view,
                              has_nulls> {
-  __device__ mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) {}
+  CUDA_DEVICE_CALLABLE mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj)
+  {
+  }
 
   template <typename Element>
-  __device__ void set_value(cudf::size_type index, possibly_null_value_t<Element, has_nulls> result)
+  CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index,
+                                      possibly_null_value_t<Element, has_nulls> const& result)
   {
     if constexpr (has_nulls) {
       if (result.has_value()) {
@@ -171,17 +179,19 @@ struct mutable_column_expression_result
   /**
    * @brief Not implemented for this specialization.
    */
-  __device__ bool is_valid() const
+  CUDA_DEVICE_CALLABLE bool is_valid() const
   {
     // Not implemented since it would require modifying the API in the parent class to accept an
     // index.
     cudf_assert(false && "This method is not implemented.");
+    // Unreachable return used to silence compiler warnings.
+    return {};
   }
 
   /**
    * @brief Not implemented for this specialization.
    */
-  __device__ mutable_column_device_view value() const
+  CUDA_DEVICE_CALLABLE mutable_column_device_view value() const
   {
     // Not implemented since it would require modifying the API in the parent class to accept an
     // index.
@@ -237,11 +247,10 @@ struct expression_evaluator {
    * storing intermediates.
 
    */
-  __device__ expression_evaluator(table_device_view const& left,
-                                  table_device_view const& right,
-                                  expression_device_view const& plan,
-                                  IntermediateDataType<has_nulls>* thread_intermediate_storage)
-    : left(left), right(right), plan(plan), thread_intermediate_storage(thread_intermediate_storage)
+  CUDA_DEVICE_CALLABLE expression_evaluator(table_device_view const& left,
+                                            table_device_view const& right,
+                                            expression_device_view const& plan)
+    : left(left), right(right), plan(plan)
   {
   }
 
@@ -253,13 +262,9 @@ struct expression_evaluator {
    * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for
    * storing intermediates.
    */
-  __device__ expression_evaluator(table_device_view const& table,
-                                  expression_device_view const& plan,
-                                  IntermediateDataType<has_nulls>* thread_intermediate_storage)
-    : left(table),
-      right(table),
-      plan(plan),
-      thread_intermediate_storage(thread_intermediate_storage)
+  CUDA_DEVICE_CALLABLE expression_evaluator(table_device_view const& table,
+                                            expression_device_view const& plan)
+    : expression_evaluator(table, table, plan)
   {
   }
 
@@ -277,48 +282,47 @@ struct expression_evaluator {
    * @return Element The type- and null-resolved data.
    */
   template <typename Element, CUDF_ENABLE_IF(column_device_view::has_element_accessor<Element>())>
-  __device__ possibly_null_value_t<Element, has_nulls> resolve_input(
-    detail::device_data_reference device_data_reference,
+  CUDA_DEVICE_CALLABLE possibly_null_value_t<Element, has_nulls> resolve_input(
+    detail::device_data_reference const& input_reference,
+    IntermediateDataType<has_nulls>* thread_intermediate_storage,
     cudf::size_type left_row_index,
     thrust::optional<cudf::size_type> right_row_index = {}) const
   {
-    auto const data_index = device_data_reference.data_index;
-    auto const ref_type   = device_data_reference.reference_type;
     // TODO: Everywhere in the code assumes that the table reference is either
     // left or right. Should we error-check somewhere to prevent
     // table_reference::OUTPUT from being specified?
     using ReturnType = possibly_null_value_t<Element, has_nulls>;
-    if (ref_type == detail::device_data_reference_type::COLUMN) {
+    if (input_reference.reference_type == detail::device_data_reference_type::COLUMN) {
       // If we have nullable data, return an empty nullable type with no value if the data is null.
-      auto const& table =
-        (device_data_reference.table_source == table_reference::LEFT) ? left : right;
+      auto const& table = (input_reference.table_source == table_reference::LEFT) ? left : right;
       // Note that the code below assumes that a right index has been passed in
-      // any case where device_data_reference.table_source == table_reference::RIGHT.
+      // any case where input_reference.table_source == table_reference::RIGHT.
       // Otherwise, behavior is undefined.
-      auto const row_index = (device_data_reference.table_source == table_reference::LEFT)
-                               ? left_row_index
-                               : *right_row_index;
+      auto const row_index =
+        (input_reference.table_source == table_reference::LEFT) ? left_row_index : *right_row_index;
       if constexpr (has_nulls) {
-        return table.column(data_index).is_valid(row_index)
-                 ? ReturnType(table.column(data_index).element<Element>(row_index))
+        return table.column(input_reference.data_index).is_valid(row_index)
+                 ? ReturnType(table.column(input_reference.data_index).element<Element>(row_index))
                  : ReturnType();
 
       } else {
-        return ReturnType(table.column(data_index).element<Element>(row_index));
+        return ReturnType(table.column(input_reference.data_index).element<Element>(row_index));
       }
-    } else if (ref_type == detail::device_data_reference_type::LITERAL) {
+    } else if (input_reference.reference_type == detail::device_data_reference_type::LITERAL) {
       if constexpr (has_nulls) {
-        return plan.literals[data_index].is_valid()
-                 ? ReturnType(plan.literals[data_index].value<Element>())
+        return plan.literals[input_reference.data_index].is_valid()
+                 ? ReturnType(plan.literals[input_reference.data_index].value<Element>())
                  : ReturnType();
 
       } else {
-        return ReturnType(plan.literals[data_index].value<Element>());
+        return ReturnType(plan.literals[input_reference.data_index].value<Element>());
       }
-    } else {  // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE
+    } else {  // Assumes input_reference.reference_type ==
+              // detail::device_data_reference_type::INTERMEDIATE
       // Using memcpy instead of reinterpret_cast<Element*> for safe type aliasing
       // Using a temporary variable ensures that the compiler knows the result is aligned
-      IntermediateDataType<has_nulls> intermediate = thread_intermediate_storage[data_index];
+      IntermediateDataType<has_nulls> intermediate =
+        thread_intermediate_storage[input_reference.data_index];
       ReturnType tmp;
       memcpy(&tmp, &intermediate, sizeof(ReturnType));
       return tmp;
@@ -329,8 +333,9 @@ struct expression_evaluator {
 
   template <typename Element,
             CUDF_ENABLE_IF(not column_device_view::has_element_accessor<Element>())>
-  __device__ possibly_null_value_t<Element, has_nulls> resolve_input(
-    detail::device_data_reference device_data_reference,
+  CUDA_DEVICE_CALLABLE possibly_null_value_t<Element, has_nulls> resolve_input(
+    detail::device_data_reference const& device_data_reference,
+    IntermediateDataType<has_nulls>* thread_intermediate_storage,
     cudf::size_type left_row_index,
     thrust::optional<cudf::size_type> right_row_index = {}) const
   {
@@ -352,25 +357,29 @@ struct expression_evaluator {
    * @param output_row_index The row in the output to insert the result.
    * @param op The operator to act with.
    */
-  template <typename Input, typename OutputType>
-  __device__ void operator()(OutputType& output_object,
-                             cudf::size_type const input_row_index,
-                             detail::device_data_reference const input,
-                             detail::device_data_reference const output,
-                             cudf::size_type const output_row_index,
-                             ast_operator const op) const
+  template <typename Input, typename ResultSubclass, typename T, bool result_has_nulls>
+  CUDA_DEVICE_CALLABLE void operator()(
+    expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+    cudf::size_type const input_row_index,
+    detail::device_data_reference const& input,
+    detail::device_data_reference const& output,
+    cudf::size_type const output_row_index,
+    ast_operator const op,
+    IntermediateDataType<has_nulls>* thread_intermediate_storage) const
   {
-    auto const typed_input = resolve_input<Input>(input, input_row_index);
+    auto const typed_input =
+      resolve_input<Input>(input, thread_intermediate_storage, input_row_index);
     ast_operator_dispatcher(op,
-                            unary_expression_output_handler<Input>(*this),
+                            unary_expression_output_handler<Input>{},
                             output_object,
                             output_row_index,
                             typed_input,
-                            output);
+                            output,
+                            thread_intermediate_storage);
   }
 
   /**
-   * @brief Callable to perform a unary operation.
+   * @brief Callable to perform a binary operation.
    *
    * @tparam LHS Type of the left input value.
    * @tparam RHS Type of the right input value.
@@ -385,42 +394,30 @@ struct expression_evaluator {
    * @param output_row_index The row in the output to insert the result.
    * @param op The operator to act with.
    */
-  template <typename LHS, typename RHS, typename OutputType>
-  __device__ void operator()(OutputType& output_object,
-                             cudf::size_type const left_row_index,
-                             cudf::size_type const right_row_index,
-                             detail::device_data_reference const lhs,
-                             detail::device_data_reference const rhs,
-                             detail::device_data_reference const output,
-                             cudf::size_type const output_row_index,
-                             ast_operator const op) const
+  template <typename LHS, typename RHS, typename ResultSubclass, typename T, bool result_has_nulls>
+  CUDA_DEVICE_CALLABLE void operator()(
+    expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+    cudf::size_type const left_row_index,
+    cudf::size_type const right_row_index,
+    detail::device_data_reference const& lhs,
+    detail::device_data_reference const& rhs,
+    detail::device_data_reference const& output,
+    cudf::size_type const output_row_index,
+    ast_operator const op,
+    IntermediateDataType<has_nulls>* thread_intermediate_storage) const
   {
-    auto const typed_lhs = resolve_input<LHS>(lhs, left_row_index, right_row_index);
-    auto const typed_rhs = resolve_input<RHS>(rhs, left_row_index, right_row_index);
+    auto const typed_lhs =
+      resolve_input<LHS>(lhs, thread_intermediate_storage, left_row_index, right_row_index);
+    auto const typed_rhs =
+      resolve_input<RHS>(rhs, thread_intermediate_storage, left_row_index, right_row_index);
     ast_operator_dispatcher(op,
-                            binary_expression_output_handler<LHS, RHS>(*this),
+                            binary_expression_output_handler<LHS, RHS>{},
                             output_object,
                             output_row_index,
                             typed_lhs,
                             typed_rhs,
-                            output);
-  }
-
-  template <typename OperatorFunctor,
-            typename LHS,
-            typename RHS,
-            typename OutputType,
-            std::enable_if_t<!detail::is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
-  __device__ void operator()(OutputType& output_object,
-                             cudf::size_type left_row_index,
-                             cudf::size_type right_row_index,
-                             detail::device_data_reference const lhs,
-                             detail::device_data_reference const rhs,
-                             detail::device_data_reference const output,
-                             cudf::size_type output_row_index,
-                             ast_operator const op) const
-  {
-    cudf_assert(false && "Invalid binary dispatch operator for the provided input.");
+                            output,
+                            thread_intermediate_storage);
   }
 
   /**
@@ -433,10 +430,13 @@ struct expression_evaluator {
    * @param output_object The container that data will be inserted into.
    * @param row_index Row index of all input and output data column(s).
    */
-  template <typename OutputType>
-  __device__ void evaluate(OutputType& output_object, cudf::size_type const row_index)
+  template <typename ResultSubclass, typename T, bool result_has_nulls>
+  CUDA_DEVICE_CALLABLE void evaluate(
+    expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+    cudf::size_type const row_index,
+    IntermediateDataType<has_nulls>* thread_intermediate_storage)
   {
-    evaluate(output_object, row_index, row_index, row_index);
+    evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage);
   }
 
   /**
@@ -451,11 +451,13 @@ struct expression_evaluator {
    * @param right_row_index The row to pull the data from the right table.
    * @param output_row_index The row in the output to insert the result.
    */
-  template <typename OutputType>
-  __device__ void evaluate(OutputType& output_object,
-                           cudf::size_type const left_row_index,
-                           cudf::size_type const right_row_index,
-                           cudf::size_type const output_row_index)
+  template <typename ResultSubclass, typename T, bool result_has_nulls>
+  CUDA_DEVICE_CALLABLE void evaluate(
+    expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+    cudf::size_type const left_row_index,
+    cudf::size_type const right_row_index,
+    cudf::size_type const output_row_index,
+    IntermediateDataType<has_nulls>* thread_intermediate_storage)
   {
     cudf::size_type operator_source_index{0};
     for (cudf::size_type operator_index = 0; operator_index < plan.operators.size();
@@ -465,9 +467,9 @@ struct expression_evaluator {
       auto const arity = ast_operator_arity(op);
       if (arity == 1) {
         // Unary operator
-        auto const input =
+        auto const& input =
           plan.data_references[plan.operator_source_indices[operator_source_index++]];
-        auto const output =
+        auto const& output =
           plan.data_references[plan.operator_source_indices[operator_source_index++]];
         auto input_row_index =
           input.table_source == table_reference::LEFT ? left_row_index : right_row_index;
@@ -478,14 +480,15 @@ struct expression_evaluator {
                         input,
                         output,
                         output_row_index,
-                        op);
+                        op,
+                        thread_intermediate_storage);
       } else if (arity == 2) {
         // Binary operator
-        auto const lhs =
+        auto const& lhs =
           plan.data_references[plan.operator_source_indices[operator_source_index++]];
-        auto const rhs =
+        auto const& rhs =
           plan.data_references[plan.operator_source_indices[operator_source_index++]];
-        auto const output =
+        auto const& output =
           plan.data_references[plan.operator_source_indices[operator_source_index++]];
         type_dispatcher(lhs.data_type,
                         detail::single_dispatch_binary_operator{},
@@ -497,7 +500,8 @@ struct expression_evaluator {
                         rhs,
                         output,
                         output_row_index,
-                        op);
+                        op,
+                        thread_intermediate_storage);
       } else {
         cudf_assert(false && "Invalid operator arity.");
       }
@@ -515,10 +519,7 @@ struct expression_evaluator {
    */
   struct expression_output_handler {
    public:
-    __device__ expression_output_handler(expression_evaluator<has_nulls> const& evaluator)
-      : evaluator(evaluator)
-    {
-    }
+    CUDA_DEVICE_CALLABLE expression_output_handler() {}
 
     /**
      * @brief Resolves an output data reference and assigns result value.
@@ -536,38 +537,43 @@ struct expression_evaluator {
      * @param result Value to assign to output.
      */
     template <typename Element,
-              typename OutputType,
+              typename ResultSubclass,
+              typename T,
+              bool result_has_nulls,
               CUDF_ENABLE_IF(is_rep_layout_compatible<Element>())>
-    __device__ void resolve_output(OutputType& output_object,
-                                   detail::device_data_reference const device_data_reference,
-                                   cudf::size_type const row_index,
-                                   possibly_null_value_t<Element, has_nulls> const result) const
+    CUDA_DEVICE_CALLABLE void resolve_output(
+      expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+      detail::device_data_reference const& device_data_reference,
+      cudf::size_type const row_index,
+      IntermediateDataType<has_nulls>* thread_intermediate_storage,
+      possibly_null_value_t<Element, has_nulls> const& result) const
     {
-      auto const ref_type = device_data_reference.reference_type;
-      if (ref_type == detail::device_data_reference_type::COLUMN) {
+      if (device_data_reference.reference_type == detail::device_data_reference_type::COLUMN) {
         output_object.template set_value<Element>(row_index, result);
-      } else {  // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE
+      } else {  // Assumes device_data_reference.reference_type ==
+                // detail::device_data_reference_type::INTERMEDIATE
         // Using memcpy instead of reinterpret_cast<Element*> for safe type aliasing.
         // Using a temporary variable ensures that the compiler knows the result is aligned.
         IntermediateDataType<has_nulls> tmp;
         memcpy(&tmp, &result, sizeof(possibly_null_value_t<Element, has_nulls>));
-        evaluator.thread_intermediate_storage[device_data_reference.data_index] = tmp;
+        thread_intermediate_storage[device_data_reference.data_index] = tmp;
       }
     }
 
     template <typename Element,
-              typename OutputType,
-              CUDF_ENABLE_IF(not is_rep_layout_compatible<Element>())>
-    __device__ void resolve_output(OutputType& output_object,
-                                   detail::device_data_reference const device_data_reference,
-                                   cudf::size_type const row_index,
-                                   possibly_null_value_t<Element, has_nulls> const result) const
+              typename ResultSubclass,
+              typename T,
+              bool result_has_nulls,
+              CUDF_ENABLE_IF(!is_rep_layout_compatible<Element>())>
+    CUDA_DEVICE_CALLABLE void resolve_output(
+      expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+      detail::device_data_reference const& device_data_reference,
+      cudf::size_type const row_index,
+      IntermediateDataType<has_nulls>* thread_intermediate_storage,
+      possibly_null_value_t<Element, has_nulls> const& result) const
     {
       cudf_assert(false && "Invalid type in resolve_output.");
     }
-
-   protected:
-    expression_evaluator<has_nulls> const& evaluator;
   };
 
   /**
@@ -578,10 +584,7 @@ struct expression_evaluator {
    */
   template <typename Input>
   struct unary_expression_output_handler : public expression_output_handler {
-    __device__ unary_expression_output_handler(expression_evaluator<has_nulls> const& evaluator)
-      : expression_output_handler(evaluator)
-    {
-    }
+    CUDA_DEVICE_CALLABLE unary_expression_output_handler() {}
 
     /**
      * @brief Callable to perform a unary operation.
@@ -595,31 +598,42 @@ struct expression_evaluator {
      * @param output Output data reference.
      */
     template <ast_operator op,
-              typename OutputType,
+              typename ResultSubclass,
+              typename T,
+              bool result_has_nulls,
               std::enable_if_t<
                 detail::is_valid_unary_op<detail::operator_functor<op, has_nulls>,
                                           possibly_null_value_t<Input, has_nulls>>>* = nullptr>
-    __device__ void operator()(OutputType& output_object,
-                               cudf::size_type const output_row_index,
-                               possibly_null_value_t<Input, has_nulls> const input,
-                               detail::device_data_reference const output) const
+    CUDA_DEVICE_CALLABLE void operator()(
+      expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+      cudf::size_type const output_row_index,
+      possibly_null_value_t<Input, has_nulls> const& input,
+      detail::device_data_reference const& output,
+      IntermediateDataType<has_nulls>* thread_intermediate_storage) const
     {
       // The output data type is the same whether or not nulls are present, so
       // pull from the non-nullable operator.
       using Out = cuda::std::invoke_result_t<detail::operator_functor<op, false>, Input>;
-      this->template resolve_output<Out>(
-        output_object, output, output_row_index, detail::operator_functor<op, has_nulls>{}(input));
+      this->template resolve_output<Out>(output_object,
+                                         output,
+                                         output_row_index,
+                                         thread_intermediate_storage,
+                                         detail::operator_functor<op, has_nulls>{}(input));
     }
 
     template <ast_operator op,
-              typename OutputType,
+              typename ResultSubclass,
+              typename T,
+              bool result_has_nulls,
               std::enable_if_t<
                 !detail::is_valid_unary_op<detail::operator_functor<op, has_nulls>,
                                            possibly_null_value_t<Input, has_nulls>>>* = nullptr>
-    __device__ void operator()(OutputType& output_object,
-                               cudf::size_type const output_row_index,
-                               possibly_null_value_t<Input, has_nulls> const input,
-                               detail::device_data_reference const output) const
+    CUDA_DEVICE_CALLABLE void operator()(
+      expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+      cudf::size_type const output_row_index,
+      possibly_null_value_t<Input, has_nulls> const& input,
+      detail::device_data_reference const& output,
+      IntermediateDataType<has_nulls>* thread_intermediate_storage) const
     {
       cudf_assert(false && "Invalid unary dispatch operator for the provided input.");
     }
@@ -633,10 +647,7 @@ struct expression_evaluator {
    */
   template <typename LHS, typename RHS>
   struct binary_expression_output_handler : public expression_output_handler {
-    __device__ binary_expression_output_handler(expression_evaluator<has_nulls> const& evaluator)
-      : expression_output_handler(evaluator)
-    {
-    }
+    CUDA_DEVICE_CALLABLE binary_expression_output_handler() {}
 
     /**
      * @brief Callable to perform a binary operation.
@@ -651,16 +662,20 @@ struct expression_evaluator {
      * @param output Output data reference.
      */
     template <ast_operator op,
-              typename OutputType,
+              typename ResultSubclass,
+              typename T,
+              bool result_has_nulls,
               std::enable_if_t<detail::is_valid_binary_op<detail::operator_functor<op, has_nulls>,
                                                           possibly_null_value_t<LHS, has_nulls>,
                                                           possibly_null_value_t<RHS, has_nulls>>>* =
                 nullptr>
-    __device__ void operator()(OutputType& output_object,
-                               cudf::size_type const output_row_index,
-                               possibly_null_value_t<LHS, has_nulls> const lhs,
-                               possibly_null_value_t<RHS, has_nulls> const rhs,
-                               detail::device_data_reference const output) const
+    CUDA_DEVICE_CALLABLE void operator()(
+      expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+      cudf::size_type const output_row_index,
+      possibly_null_value_t<LHS, has_nulls> const& lhs,
+      possibly_null_value_t<RHS, has_nulls> const& rhs,
+      detail::device_data_reference const& output,
+      IntermediateDataType<has_nulls>* thread_intermediate_storage) const
     {
       // The output data type is the same whether or not nulls are present, so
       // pull from the non-nullable operator.
@@ -668,20 +683,25 @@ struct expression_evaluator {
       this->template resolve_output<Out>(output_object,
                                          output,
                                          output_row_index,
+                                         thread_intermediate_storage,
                                          detail::operator_functor<op, has_nulls>{}(lhs, rhs));
     }
 
     template <ast_operator op,
-              typename OutputType,
+              typename ResultSubclass,
+              typename T,
+              bool result_has_nulls,
               std::enable_if_t<
                 !detail::is_valid_binary_op<detail::operator_functor<op, has_nulls>,
                                             possibly_null_value_t<LHS, has_nulls>,
                                             possibly_null_value_t<RHS, has_nulls>>>* = nullptr>
-    __device__ void operator()(OutputType& output_object,
-                               cudf::size_type const output_row_index,
-                               possibly_null_value_t<LHS, has_nulls> const lhs,
-                               possibly_null_value_t<RHS, has_nulls> const rhs,
-                               detail::device_data_reference output) const
+    CUDA_DEVICE_CALLABLE void operator()(
+      expression_result<ResultSubclass, T, result_has_nulls>& output_object,
+      cudf::size_type const output_row_index,
+      possibly_null_value_t<LHS, has_nulls> const& lhs,
+      possibly_null_value_t<RHS, has_nulls> const& rhs,
+      detail::device_data_reference const& output,
+      IntermediateDataType<has_nulls>* thread_intermediate_storage) const
     {
       cudf_assert(false && "Invalid binary dispatch operator for the provided input.");
     }
@@ -691,9 +711,6 @@ struct expression_evaluator {
   table_device_view const& right;  ///< The right table to operate on.
   expression_device_view const&
     plan;  ///< The container of device data representing the expression to evaluate.
-  IntermediateDataType<has_nulls>*
-    thread_intermediate_storage;  ///< The shared memory store of intermediates produced during
-                                  ///< evaluation.
 };
 
 }  // namespace detail
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index 1f35b54ea61..dc800bde527 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -107,7 +107,6 @@ struct expression_device_view {
   device_span<ast_operator const> operators;
   device_span<cudf::size_type const> operator_source_indices;
   cudf::size_type num_intermediates;
-  int shmem_per_thread;
 };
 
 /**
@@ -230,6 +229,7 @@ class expression_parser {
 
   expression_device_view device_expression_data;  ///< The collection of data required to evaluate
                                                   ///< the expression on the device.
+  int shmem_per_thread;
 
  private:
   /**
@@ -292,7 +292,7 @@ class expression_parser {
       reinterpret_cast<cudf::size_type const*>(device_data_buffer_ptr + buffer_offsets[3]),
       _operator_source_indices.size());
     device_expression_data.num_intermediates = _intermediate_counter.get_max_used();
-    device_expression_data.shmem_per_thread  = static_cast<int>(
+    shmem_per_thread                         = static_cast<int>(
       (_has_nulls ? sizeof(IntermediateDataType<true>) : sizeof(IntermediateDataType<false>)) *
       device_expression_data.num_intermediates);
   }
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 1f49ee749ec..f4fe670d3cd 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -94,8 +94,7 @@ conditional_join(table_view const& left,
 
   // Allocate storage for the counter used to get the size of the join output
   detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block =
-    parser.device_expression_data.shmem_per_thread * config.num_threads_per_block;
+  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
   join_kind kernel_join_type = join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type;
 
   // If the join size was not provided as an input, compute it here.
@@ -229,8 +228,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
   rmm::device_scalar<std::size_t> size(0, stream, mr);
   CHECK_CUDA(stream.value());
   detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block =
-    parser.device_expression_data.shmem_per_thread * config.num_threads_per_block;
+  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
   // Determine number of output rows without actually building the output to simply
   // find what the size of the output will be.
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index 2ad7c6ad8b8..6dc204441b3 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -70,14 +70,15 @@ __global__ void compute_conditional_join_output_size(
   cudf::size_type const right_num_rows = right_table.num_rows();
 
   auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
-    left_table, right_table, device_expression_data, thread_intermediate_storage);
+    left_table, right_table, device_expression_data);
 
   for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows;
        left_row_index += left_stride) {
     bool found_match = false;
     for (cudf::size_type right_row_index = 0; right_row_index < right_num_rows; right_row_index++) {
       auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
-      evaluator.evaluate(output_dest, left_row_index, right_row_index, 0);
+      evaluator.evaluate(
+        output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage);
       if (output_dest.is_valid() && output_dest.value()) {
         if ((join_type != join_kind::LEFT_ANTI_JOIN) &&
             !(join_type == join_kind::LEFT_SEMI_JOIN && found_match)) {
@@ -161,13 +162,14 @@ __global__ void conditional_join(table_device_view left_table,
   unsigned int const activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows);
 
   auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
-    left_table, right_table, device_expression_data, thread_intermediate_storage);
+    left_table, right_table, device_expression_data);
 
   if (left_row_index < left_num_rows) {
     bool found_match = false;
     for (size_type right_row_index(0); right_row_index < right_num_rows; ++right_row_index) {
       auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
-      evaluator.evaluate(output_dest, left_row_index, right_row_index, 0);
+      evaluator.evaluate(
+        output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage);
 
       if (output_dest.is_valid() && output_dest.value()) {
         // If the rows are equal, then we have found a true match
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 742c01b9c60..bf109dbe1e5 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -69,12 +69,12 @@ __launch_bounds__(max_block_size) __global__
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
   auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
   auto const stride    = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
-  auto evaluator       = cudf::ast::detail::expression_evaluator<has_nulls>(
-    table, device_expression_data, thread_intermediate_storage);
+  auto evaluator =
+    cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);
 
   for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
     auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
-    evaluator.evaluate(output_dest, row_index);
+    evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
   }
 }
 
@@ -107,12 +107,11 @@ std::unique_ptr<column> compute_column(table_view const& table,
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   auto constexpr MAX_BLOCK_SIZE = 128;
   auto const block_size =
-    device_expression_data.shmem_per_thread != 0
-      ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / device_expression_data.shmem_per_thread)
+    parser.shmem_per_thread != 0
+      ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / parser.shmem_per_thread)
       : MAX_BLOCK_SIZE;
-  auto const config = cudf::detail::grid_1d{table.num_rows(), block_size};
-  auto const shmem_per_block =
-    device_expression_data.shmem_per_thread * config.num_threads_per_block;
+  auto const config          = cudf::detail::grid_1d{table.num_rows(), block_size};
+  auto const shmem_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
   // Execute the kernel
   auto table_device = table_device_view::create(table, stream);