From 496aa47973aa89daf5a3f0b0b76405a8c7f2b92e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 10 Jan 2022 16:33:15 -0600 Subject: [PATCH] Refactor host device macros (#9797) This PR is a follow-up to #9530 to standardize the names of the macros used for the `__host__ __device__` attributes. Aliases for `__device__` and combinations with inlining have been removed, and the only remaining macro is `CUDF_HOST_DEVICE` which is `__host__ __device__` in device code and empty in host code. See https://github.com/rapidsai/cudf/pull/9530#discussion_r739597422 for more discussion. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Jake Hemstad (https://github.com/jrhemstad) - Bradley Dice (https://github.com/bdice) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9797 --- .../cudf/ast/detail/expression_evaluator.cuh | 93 ++++++----- cpp/include/cudf/ast/detail/operators.hpp | 152 +++++++++--------- .../cudf/column/column_device_view.cuh | 13 +- .../cudf/detail/aggregation/aggregation.hpp | 18 +-- cpp/include/cudf/detail/indexalator.cuh | 44 ++--- cpp/include/cudf/detail/iterator.cuh | 22 +-- cpp/include/cudf/detail/null_mask.cuh | 6 +- .../cudf/detail/reduction_operators.cuh | 41 +++-- .../detail/utilities/device_operators.cuh | 28 ++-- .../cudf/detail/utilities/hash_functions.cuh | 100 ++++++------ .../utilities/transform_unary_functions.cuh | 25 ++- cpp/include/cudf/fixed_point/fixed_point.hpp | 126 +++++++-------- cpp/include/cudf/fixed_point/temporary.hpp | 4 +- .../cudf/lists/detail/scatter_helper.cuh | 24 +-- cpp/include/cudf/lists/list_device_view.cuh | 43 +++-- .../cudf/lists/lists_column_device_view.cuh | 12 +- cpp/include/cudf/strings/json.hpp | 4 +- cpp/include/cudf/strings/string_view.cuh | 4 +- cpp/include/cudf/strings/string_view.hpp | 124 +++++++------- cpp/include/cudf/types.hpp | 14 +- 
cpp/include/cudf/utilities/bit.hpp | 22 +-- .../cudf/utilities/type_dispatcher.hpp | 16 +- cpp/include/cudf/wrappers/dictionary.hpp | 36 ++--- cpp/src/binaryop/compiled/binary_ops.cu | 18 +-- cpp/src/binaryop/compiled/binary_ops.cuh | 12 +- cpp/src/binaryop/compiled/operation.cuh | 74 ++++----- cpp/src/datetime/datetime_ops.cu | 16 +- cpp/src/groupby/sort/group_correlation.cu | 2 +- .../sort/group_single_pass_reduction_util.cuh | 2 +- cpp/src/groupby/sort/sort_helper.cu | 3 +- cpp/src/hash/md5_hash.cu | 29 ++-- cpp/src/io/orc/timezone.cuh | 8 +- cpp/src/quantiles/quantiles_util.hpp | 33 ++-- cpp/src/reductions/scan/scan_inclusive.cu | 3 +- cpp/src/rolling/rolling_detail.hpp | 8 +- cpp/src/strings/json/json_path.cu | 36 ++--- cpp/src/strings/regex/regex.cuh | 2 +- cpp/src/strings/regex/regex.inl | 10 +- cpp/src/unary/cast_ops.cu | 18 +-- .../optional_iterator_test_numeric.cu | 5 +- .../iterator/pair_iterator_test_numeric.cu | 7 +- 41 files changed, 610 insertions(+), 647 deletions(-) diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index f974088c8e7..0b739482c4d 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -57,11 +57,8 @@ struct expression_result { /** * Helper function to get the subclass type to dispatch methods to. 
*/ - CUDA_DEVICE_CALLABLE Subclass& subclass() { return static_cast(*this); } - CUDA_DEVICE_CALLABLE Subclass const& subclass() const - { - return static_cast(*this); - } + __device__ inline Subclass& subclass() { return static_cast(*this); } + __device__ inline Subclass const& subclass() const { return static_cast(*this); } // TODO: The index is ignored by the value subclass, but is included in this // signature because it is required by the implementation in the template @@ -73,15 +70,15 @@ struct expression_result { // used, whereas passing it as a parameter keeps it in registers for fast // access at the point where indexing occurs. template - CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index, - possibly_null_value_t const& result) + __device__ inline void set_value(cudf::size_type index, + possibly_null_value_t const& result) { subclass().template set_value(index, result); } - CUDA_DEVICE_CALLABLE bool is_valid() const { return subclass().is_valid(); } + __device__ inline bool is_valid() const { return subclass().is_valid(); } - CUDA_DEVICE_CALLABLE T value() const { return subclass().value(); } + __device__ inline T value() const { return subclass().value(); } }; /** @@ -97,11 +94,11 @@ struct expression_result { template struct value_expression_result : public expression_result, T, has_nulls> { - CUDA_DEVICE_CALLABLE value_expression_result() {} + __device__ inline value_expression_result() {} template - CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index, - possibly_null_value_t const& result) + __device__ inline void set_value(cudf::size_type index, + possibly_null_value_t const& result) { if constexpr (std::is_same_v) { _obj = result; @@ -113,7 +110,7 @@ struct value_expression_result /** * @brief Returns true if the underlying data is valid and false otherwise. 
*/ - CUDA_DEVICE_CALLABLE bool is_valid() const + __device__ inline bool is_valid() const { if constexpr (has_nulls) { return _obj.has_value(); } return true; @@ -125,7 +122,7 @@ struct value_expression_result * If the underlying data is not valid, behavior is undefined. Callers should * use is_valid to check for validity before accessing the value. */ - CUDA_DEVICE_CALLABLE T value() const + __device__ inline T value() const { // Using two separate constexprs silences compiler warnings, whereas an // if/else does not. An unconditional return is not ignored by the compiler @@ -156,13 +153,11 @@ struct mutable_column_expression_result : public expression_result, mutable_column_device_view, has_nulls> { - CUDA_DEVICE_CALLABLE mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) - { - } + __device__ inline mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) {} template - CUDA_DEVICE_CALLABLE void set_value(cudf::size_type index, - possibly_null_value_t const& result) + __device__ inline void set_value(cudf::size_type index, + possibly_null_value_t const& result) { if constexpr (has_nulls) { if (result.has_value()) { @@ -179,7 +174,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - CUDA_DEVICE_CALLABLE bool is_valid() const + __device__ inline bool is_valid() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. @@ -191,7 +186,7 @@ struct mutable_column_expression_result /** * @brief Not implemented for this specialization. */ - CUDA_DEVICE_CALLABLE mutable_column_device_view value() const + __device__ inline mutable_column_device_view value() const { // Not implemented since it would require modifying the API in the parent class to accept an // index. @@ -222,7 +217,7 @@ struct single_dispatch_binary_operator { * @param args Forwarded arguments to `operator()` of `f`. 
*/ template - CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) + __device__ inline auto operator()(F&& f, Ts&&... args) { f.template operator()(std::forward(args)...); } @@ -247,9 +242,9 @@ struct expression_evaluator { * storing intermediates. */ - CUDA_DEVICE_CALLABLE expression_evaluator(table_device_view const& left, - table_device_view const& right, - expression_device_view const& plan) + __device__ inline expression_evaluator(table_device_view const& left, + table_device_view const& right, + expression_device_view const& plan) : left(left), right(right), plan(plan) { } @@ -262,8 +257,8 @@ struct expression_evaluator { * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for * storing intermediates. */ - CUDA_DEVICE_CALLABLE expression_evaluator(table_device_view const& table, - expression_device_view const& plan) + __device__ inline expression_evaluator(table_device_view const& table, + expression_device_view const& plan) : expression_evaluator(table, table, plan) { } @@ -282,7 +277,7 @@ struct expression_evaluator { * @return Element The type- and null-resolved data. */ template ())> - CUDA_DEVICE_CALLABLE possibly_null_value_t resolve_input( + __device__ inline possibly_null_value_t resolve_input( detail::device_data_reference const& input_reference, IntermediateDataType* thread_intermediate_storage, cudf::size_type left_row_index, @@ -333,7 +328,7 @@ struct expression_evaluator { template ())> - CUDA_DEVICE_CALLABLE possibly_null_value_t resolve_input( + __device__ inline possibly_null_value_t resolve_input( detail::device_data_reference const& device_data_reference, IntermediateDataType* thread_intermediate_storage, cudf::size_type left_row_index, @@ -358,7 +353,7 @@ struct expression_evaluator { * @param op The operator to act with. 
*/ template - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const input_row_index, detail::device_data_reference const& input, @@ -395,7 +390,7 @@ struct expression_evaluator { * @param op The operator to act with. */ template - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const left_row_index, cudf::size_type const right_row_index, @@ -431,9 +426,10 @@ struct expression_evaluator { * @param row_index Row index of all input and output data column(s). */ template - CUDF_DFI void evaluate(expression_result& output_object, - cudf::size_type const row_index, - IntermediateDataType* thread_intermediate_storage) + __device__ __forceinline__ void evaluate( + expression_result& output_object, + cudf::size_type const row_index, + IntermediateDataType* thread_intermediate_storage) { evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage); } @@ -451,11 +447,12 @@ struct expression_evaluator { * @param output_row_index The row in the output to insert the result. 
*/ template - CUDF_DFI void evaluate(expression_result& output_object, - cudf::size_type const left_row_index, - cudf::size_type const right_row_index, - cudf::size_type const output_row_index, - IntermediateDataType* thread_intermediate_storage) + __device__ __forceinline__ void evaluate( + expression_result& output_object, + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + cudf::size_type const output_row_index, + IntermediateDataType* thread_intermediate_storage) { cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); @@ -517,7 +514,7 @@ struct expression_evaluator { */ struct expression_output_handler { public: - CUDA_DEVICE_CALLABLE expression_output_handler() {} + __device__ inline expression_output_handler() {} /** * @brief Resolves an output data reference and assigns result value. @@ -539,7 +536,7 @@ struct expression_evaluator { typename T, bool result_has_nulls, CUDF_ENABLE_IF(is_rep_layout_compatible())> - CUDA_DEVICE_CALLABLE void resolve_output( + __device__ inline void resolve_output( expression_result& output_object, detail::device_data_reference const& device_data_reference, cudf::size_type const row_index, @@ -563,7 +560,7 @@ struct expression_evaluator { typename T, bool result_has_nulls, CUDF_ENABLE_IF(!is_rep_layout_compatible())> - CUDA_DEVICE_CALLABLE void resolve_output( + __device__ inline void resolve_output( expression_result& output_object, detail::device_data_reference const& device_data_reference, cudf::size_type const row_index, @@ -582,7 +579,7 @@ struct expression_evaluator { */ template struct unary_expression_output_handler : public expression_output_handler { - CUDA_DEVICE_CALLABLE unary_expression_output_handler() {} + __device__ inline unary_expression_output_handler() {} /** * @brief Callable to perform a unary operation. 
@@ -602,7 +599,7 @@ struct expression_evaluator { std::enable_if_t< detail::is_valid_unary_op, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& input, @@ -626,7 +623,7 @@ struct expression_evaluator { std::enable_if_t< !detail::is_valid_unary_op, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& input, @@ -645,7 +642,7 @@ struct expression_evaluator { */ template struct binary_expression_output_handler : public expression_output_handler { - CUDA_DEVICE_CALLABLE binary_expression_output_handler() {} + __device__ inline binary_expression_output_handler() {} /** * @brief Callable to perform a binary operation. @@ -667,7 +664,7 @@ struct expression_evaluator { possibly_null_value_t, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& lhs, @@ -693,7 +690,7 @@ struct expression_evaluator { !detail::is_valid_binary_op, possibly_null_value_t, possibly_null_value_t>>* = nullptr> - CUDA_DEVICE_CALLABLE void operator()( + __device__ inline void operator()( expression_result& output_object, cudf::size_type const output_row_index, possibly_null_value_t const& lhs, diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index cffefcaf9cd..d7fd109f12a 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -49,9 +49,7 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable::value; * @param args Forwarded arguments to `operator()` of `f`. 
*/ template -CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op, - F&& f, - Ts&&... args) +CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args) { switch (op) { case ast_operator::ADD: @@ -234,7 +232,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs + rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -245,7 +243,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs - rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs - rhs) { return lhs - rhs; } @@ -256,7 +254,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs * rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -267,7 +265,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs / rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs / rhs) { return lhs / rhs; } @@ -278,7 +276,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(static_cast(lhs) / static_cast(rhs)) { return static_cast(lhs) / static_cast(rhs); @@ -290,7 +288,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(floor(static_cast(lhs) / static_cast(rhs))) { return floor(static_cast(lhs) / static_cast(rhs)); @@ -305,7 +303,7 @@ struct operator_functor { typename RHS, typename 
CommonType = std::common_type_t, std::enable_if_t::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(static_cast(lhs) % static_cast(rhs)) { return static_cast(lhs) % static_cast(rhs); @@ -315,7 +313,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmodf(static_cast(lhs), static_cast(rhs))) { return fmodf(static_cast(lhs), static_cast(rhs)); @@ -325,7 +323,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmod(static_cast(lhs), static_cast(rhs))) { return fmod(static_cast(lhs), static_cast(rhs)); @@ -340,7 +338,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(((static_cast(lhs) % static_cast(rhs)) + static_cast(rhs)) % static_cast(rhs)) @@ -354,7 +352,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + static_cast(rhs), static_cast(rhs))) @@ -368,7 +366,7 @@ struct operator_functor { typename RHS, typename CommonType = std::common_type_t, std::enable_if_t>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(fmod(fmod(static_cast(lhs), static_cast(rhs)) + static_cast(rhs), static_cast(rhs))) @@ 
-384,7 +382,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(std::pow(lhs, rhs)) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(std::pow(lhs, rhs)) { return std::pow(lhs, rhs); } @@ -395,7 +393,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs == rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs == rhs) { return lhs == rhs; } @@ -412,7 +410,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs != rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs != rhs) { return lhs != rhs; } @@ -423,7 +421,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs < rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs < rhs) { return lhs < rhs; } @@ -434,7 +432,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs > rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs > rhs) { return lhs > rhs; } @@ -445,7 +443,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs <= rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs <= rhs) { return lhs <= rhs; } @@ -456,7 +454,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs >= rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs >= rhs) { return lhs >= rhs; } @@ -467,7 +465,7 @@ struct operator_functor { static constexpr auto arity{2}; template - 
CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs & rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs & rhs) { return lhs & rhs; } @@ -478,7 +476,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs | rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs | rhs) { return lhs | rhs; } @@ -489,7 +487,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs ^ rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs ^ rhs) { return lhs ^ rhs; } @@ -500,7 +498,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs && rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs && rhs) { return lhs && rhs; } @@ -517,7 +515,7 @@ struct operator_functor { static constexpr auto arity{2}; template - CUDA_DEVICE_CALLABLE auto operator()(LHS lhs, RHS rhs) -> decltype(lhs || rhs) + __device__ inline auto operator()(LHS lhs, RHS rhs) -> decltype(lhs || rhs) { return lhs || rhs; } @@ -534,7 +532,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(input) + __device__ inline auto operator()(InputT input) -> decltype(input) { return input; } @@ -545,7 +543,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::sin(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::sin(input)) { return std::sin(input); } @@ -556,7 +554,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::cos(input)) + __device__ inline 
auto operator()(InputT input) -> decltype(std::cos(input)) { return std::cos(input); } @@ -567,7 +565,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::tan(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::tan(input)) { return std::tan(input); } @@ -578,7 +576,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::asin(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::asin(input)) { return std::asin(input); } @@ -589,7 +587,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::acos(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::acos(input)) { return std::acos(input); } @@ -600,7 +598,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::atan(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::atan(input)) { return std::atan(input); } @@ -611,7 +609,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::sinh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::sinh(input)) { return std::sinh(input); } @@ -622,7 +620,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::cosh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::cosh(input)) { return std::cosh(input); } @@ -633,7 +631,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - 
CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::tanh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::tanh(input)) { return std::tanh(input); } @@ -644,7 +642,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::asinh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::asinh(input)) { return std::asinh(input); } @@ -655,7 +653,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::acosh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::acosh(input)) { return std::acosh(input); } @@ -666,7 +664,7 @@ struct operator_functor { static constexpr auto arity{1}; template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::atanh(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::atanh(input)) { return std::atanh(input); } @@ -677,7 +675,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::exp(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::exp(input)) { return std::exp(input); } @@ -688,7 +686,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::log(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::log(input)) { return std::log(input); } @@ -699,7 +697,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::sqrt(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::sqrt(input)) { return std::sqrt(input); } @@ -710,7 +708,7 @@ struct operator_functor { static constexpr auto 
arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::cbrt(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::cbrt(input)) { return std::cbrt(input); } @@ -721,7 +719,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::ceil(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::ceil(input)) { return std::ceil(input); } @@ -732,7 +730,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::floor(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::floor(input)) { return std::floor(input); } @@ -744,13 +742,13 @@ struct operator_functor { // Only accept signed or unsigned types (both require is_arithmetic to be true) template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::abs(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::abs(input)) { return std::abs(input); } template ::value>* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(input) + __device__ inline auto operator()(InputT input) -> decltype(input) { return input; } @@ -761,7 +759,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(std::rint(input)) + __device__ inline auto operator()(InputT input) -> decltype(std::rint(input)) { return std::rint(input); } @@ -772,7 +770,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(~input) + __device__ inline auto operator()(InputT input) -> decltype(~input) { return ~input; } @@ -783,7 +781,7 @@ struct operator_functor { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(InputT input) -> decltype(!input) + 
__device__ inline auto operator()(InputT input) -> decltype(!input) { return !input; } @@ -793,7 +791,7 @@ template struct cast { static constexpr auto arity{1}; template - CUDA_DEVICE_CALLABLE auto operator()(From f) -> decltype(static_cast(f)) + __device__ inline auto operator()(From f) -> decltype(static_cast(f)) { return static_cast(f); } @@ -822,7 +820,7 @@ struct operator_functor { typename RHS, std::size_t arity_placeholder = arity, std::enable_if_t* = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { using Out = possibly_null_value_t; @@ -832,7 +830,7 @@ struct operator_functor { template * = nullptr> - CUDA_DEVICE_CALLABLE auto operator()(Input const input) + __device__ inline auto operator()(Input const input) -> possibly_null_value_t { using Out = possibly_null_value_t; @@ -848,7 +846,7 @@ struct operator_functor { static constexpr auto arity = NonNullOperator::arity; template - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { // Case 1: Neither is null, so the output is given by the operation. @@ -869,7 +867,7 @@ struct operator_functor { static constexpr auto arity = NonNullOperator::arity; template - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { // Case 1: Neither is null, so the output is given by the operation. @@ -892,7 +890,7 @@ struct operator_functor { static constexpr auto arity = NonNullOperator::arity; template - CUDA_DEVICE_CALLABLE auto operator()(LHS const lhs, RHS const rhs) + __device__ inline auto operator()(LHS const lhs, RHS const rhs) -> possibly_null_value_t { // Case 1: Neither is null, so the output is given by the operation. 
@@ -922,7 +920,7 @@ struct single_dispatch_binary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { f.template operator()(std::forward(args)...); } @@ -931,7 +929,7 @@ struct single_dispatch_binary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation."); @@ -960,10 +958,10 @@ struct type_dispatch_binary_op { * @param args Forwarded arguments to `operator()` of `f`. */ template - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type lhs_type, - cudf::data_type rhs_type, - F&& f, - Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type, + cudf::data_type rhs_type, + F&& f, + Ts&&... args) { // Single dispatch (assume lhs_type == rhs_type) type_dispatcher( @@ -986,7 +984,7 @@ struct type_dispatch_binary_op { * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDA_HOST_DEVICE_CALLABLE constexpr void binary_operator_dispatcher( +CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher( ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) { ast_operator_dispatcher(op, @@ -1011,7 +1009,7 @@ struct dispatch_unary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) { f.template operator()(std::forward(args)...); } @@ -1020,7 +1018,7 @@ struct dispatch_unary_operator_types { typename F, typename... Ts, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... 
args) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid unary operation."); @@ -1035,7 +1033,7 @@ struct dispatch_unary_operator_types { */ struct type_dispatch_unary_op { template - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type input_type, F&& f, Ts&&... args) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) { type_dispatcher( input_type, @@ -1056,10 +1054,10 @@ struct type_dispatch_unary_op { * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDA_HOST_DEVICE_CALLABLE constexpr void unary_operator_dispatcher(ast_operator op, - cudf::data_type input_type, - F&& f, - Ts&&... args) +CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op, + cudf::data_type input_type, + F&& f, + Ts&&... args) { ast_operator_dispatcher(op, detail::type_dispatch_unary_op{}, @@ -1084,7 +1082,7 @@ struct return_type_functor { typename LHS, typename RHS, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { using Out = cuda::std::invoke_result_t; result = cudf::data_type(cudf::type_to_id()); @@ -1094,7 +1092,7 @@ struct return_type_functor { typename LHS, typename RHS, std::enable_if_t>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid binary operation. 
Return type cannot be determined."); @@ -1113,7 +1111,7 @@ struct return_type_functor { template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { using Out = cuda::std::invoke_result_t; result = cudf::data_type(cudf::type_to_id()); @@ -1122,7 +1120,7 @@ struct return_type_functor { template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::data_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); @@ -1161,7 +1159,7 @@ inline cudf::data_type ast_operator_return_type(ast_operator op, */ struct arity_functor { template - CUDA_HOST_DEVICE_CALLABLE void operator()(cudf::size_type& result) + CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result) { // Arity is not dependent on null handling, so just use the false implementation here. result = operator_functor::arity; @@ -1174,7 +1172,7 @@ struct arity_functor { * @param op Operator used to determine arity. * @return Arity of the operator. */ -CUDA_HOST_DEVICE_CALLABLE cudf::size_type ast_operator_arity(ast_operator op) +CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) { auto result = cudf::size_type(0); ast_operator_dispatcher(op, detail::arity_functor{}, result); diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index a15f20ef52d..b29df1852b2 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1151,8 +1151,7 @@ struct optional_accessor { if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } - CUDA_DEVICE_CALLABLE - thrust::optional operator()(cudf::size_type i) const + __device__ inline thrust::optional operator()(cudf::size_type i) const { if (has_nulls) { return (col.is_valid_nocheck(i)) ? 
thrust::optional{col.element(i)} @@ -1196,8 +1195,7 @@ struct pair_accessor { if (has_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {col.element(i), (has_nulls ? col.is_valid_nocheck(i) : true)}; } @@ -1237,21 +1235,20 @@ struct pair_rep_accessor { if (has_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {get_rep(i), (has_nulls ? col.is_valid_nocheck(i) : true)}; } private: template , void>* = nullptr> - CUDA_DEVICE_CALLABLE auto get_rep(cudf::size_type i) const + __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - CUDA_DEVICE_CALLABLE auto get_rep(cudf::size_type i) const + __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index c2bd7a4893c..3674efbcc7b 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1324,9 +1324,9 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); */ #pragma nv_exec_check_disable template -CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kind k, - F&& f, - Ts&&... args) +CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind k, + F&& f, + Ts&&... args) { switch (k) { case aggregation::SUM: @@ -1418,7 +1418,7 @@ template struct dispatch_aggregation { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... 
args) const + CUDF_HOST_DEVICE inline decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -1427,9 +1427,7 @@ struct dispatch_aggregation { struct dispatch_source { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(aggregation::Kind k, - F&& f, - Ts&&... args) const + CUDF_HOST_DEVICE inline decltype(auto) operator()(aggregation::Kind k, F&& f, Ts&&... args) const { return aggregation_dispatcher( k, dispatch_aggregation{}, std::forward(f), std::forward(args)...); @@ -1453,8 +1451,10 @@ struct dispatch_source { */ #pragma nv_exec_check_disable template -CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) dispatch_type_and_aggregation( - data_type type, aggregation::Kind k, F&& f, Ts&&... args) +CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(data_type type, + aggregation::Kind k, + F&& f, + Ts&&... args) { return type_dispatcher(type, dispatch_source{}, k, std::forward(f), std::forward(args)...); } diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index d0fa4e02440..11c82da8097 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -55,7 +55,7 @@ struct base_indexalator { /** * @brief Prefix increment operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator++() + CUDF_HOST_DEVICE inline T& operator++() { T& derived = static_cast(*this); derived.p_ += width_; @@ -65,7 +65,7 @@ struct base_indexalator { /** * @brief Postfix increment operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator++(int) + CUDF_HOST_DEVICE inline T operator++(int) { T tmp{static_cast(*this)}; operator++(); @@ -75,7 +75,7 @@ struct base_indexalator { /** * @brief Prefix decrement operator. 
*/ - CUDA_HOST_DEVICE_CALLABLE T& operator--() + CUDF_HOST_DEVICE inline T& operator--() { T& derived = static_cast(*this); derived.p_ -= width_; @@ -85,7 +85,7 @@ struct base_indexalator { /** * @brief Postfix decrement operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator--(int) + CUDF_HOST_DEVICE inline T operator--(int) { T tmp{static_cast(*this)}; operator--(); @@ -95,7 +95,7 @@ struct base_indexalator { /** * @brief Compound assignment by sum operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator+=(difference_type offset) + CUDF_HOST_DEVICE inline T& operator+=(difference_type offset) { T& derived = static_cast(*this); derived.p_ += offset * width_; @@ -105,7 +105,7 @@ struct base_indexalator { /** * @brief Increment by offset operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator+(difference_type offset) const + CUDF_HOST_DEVICE inline T operator+(difference_type offset) const { auto tmp = T{static_cast(*this)}; tmp.p_ += (offset * width_); @@ -115,7 +115,7 @@ struct base_indexalator { /** * @brief Addition assignment operator. */ - CUDA_HOST_DEVICE_CALLABLE friend T operator+(difference_type offset, T const& rhs) + CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs) { T tmp{rhs}; tmp.p_ += (offset * rhs.width_); @@ -125,7 +125,7 @@ struct base_indexalator { /** * @brief Compound assignment by difference operator. */ - CUDA_HOST_DEVICE_CALLABLE T& operator-=(difference_type offset) + CUDF_HOST_DEVICE inline T& operator-=(difference_type offset) { T& derived = static_cast(*this); derived.p_ -= offset * width_; @@ -135,7 +135,7 @@ struct base_indexalator { /** * @brief Decrement by offset operator. */ - CUDA_HOST_DEVICE_CALLABLE T operator-(difference_type offset) const + CUDF_HOST_DEVICE inline T operator-(difference_type offset) const { auto tmp = T{static_cast(*this)}; tmp.p_ -= (offset * width_); @@ -145,7 +145,7 @@ struct base_indexalator { /** * @brief Subtraction assignment operator. 
*/ - CUDA_HOST_DEVICE_CALLABLE friend T operator-(difference_type offset, T const& rhs) + CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs) { T tmp{rhs}; tmp.p_ -= (offset * rhs.width_); @@ -155,7 +155,7 @@ struct base_indexalator { /** * @brief Compute offset from iterator difference operator. */ - CUDA_HOST_DEVICE_CALLABLE difference_type operator-(T const& rhs) const + CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const { return (static_cast(*this).p_ - rhs.p_) / width_; } @@ -163,42 +163,42 @@ struct base_indexalator { /** * @brief Equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator==(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const { return rhs.p_ == static_cast(*this).p_; } /** * @brief Not equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator!=(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const { return rhs.p_ != static_cast(*this).p_; } /** * @brief Less than operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator<(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const { return static_cast(*this).p_ < rhs.p_; } /** * @brief Greater than operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator>(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const { return static_cast(*this).p_ > rhs.p_; } /** * @brief Less than or equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator<=(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const { return static_cast(*this).p_ <= rhs.p_; } /** * @brief Greater than or equals to operator. */ - CUDA_HOST_DEVICE_CALLABLE bool operator>=(T const& rhs) const + CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const { return static_cast(*this).p_ >= rhs.p_; } @@ -253,7 +253,7 @@ struct input_indexalator : base_indexalator { /** * @brief Indirection operator returns the value at the current iterator position. 
*/ - CUDA_DEVICE_CALLABLE size_type operator*() const { return operator[](0); } + __device__ inline size_type operator*() const { return operator[](0); } /** * @brief Dispatch functor for resolving a size_type value from any index type. @@ -275,7 +275,7 @@ struct input_indexalator : base_indexalator { * @brief Array subscript operator returns a value at the input * `idx` position as a `size_type` value. */ - CUDA_DEVICE_CALLABLE size_type operator[](size_type idx) const + __device__ inline size_type operator[](size_type idx) const { void const* tp = p_ + (idx * width_); return type_dispatcher(dtype_, index_as_size_type{}, tp); @@ -339,14 +339,14 @@ struct output_indexalator : base_indexalator { * @brief Indirection operator returns this iterator instance in order * to capture the `operator=(size_type)` calls. */ - CUDA_DEVICE_CALLABLE output_indexalator const& operator*() const { return *this; } + __device__ inline output_indexalator const& operator*() const { return *this; } /** * @brief Array subscript operator returns an iterator instance at the specified `idx` position. * * This allows capturing the subsequent `operator=(size_type)` call in this class. */ - CUDA_DEVICE_CALLABLE output_indexalator const operator[](size_type idx) const + __device__ inline output_indexalator const operator[](size_type idx) const { output_indexalator tmp{*this}; tmp.p_ += (idx * width_); @@ -372,7 +372,7 @@ struct output_indexalator : base_indexalator { /** * @brief Assign a size_type value to the current iterator position. 
*/ - CUDA_DEVICE_CALLABLE output_indexalator const& operator=(size_type const value) const + __device__ inline output_indexalator const& operator=(size_type const value) const { void* tp = p_; type_dispatcher(dtype_, size_type_to_index{}, tp, value); diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 01742384972..10d9cda55dd 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -107,8 +107,7 @@ struct null_replaced_value_accessor { if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); } - CUDA_DEVICE_CALLABLE - Element operator()(cudf::size_type i) const + __device__ inline Element operator()(cudf::size_type i) const { return has_nulls && col.is_null_nocheck(i) ? null_replacement : col.element(i); } @@ -135,8 +134,7 @@ struct validity_accessor { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } - CUDA_DEVICE_CALLABLE - bool operator()(cudf::size_type i) const { return col.is_valid_nocheck(i); } + __device__ inline bool operator()(cudf::size_type i) const { return col.is_valid_nocheck(i); } }; /** @@ -344,8 +342,7 @@ struct scalar_value_accessor { * * @return value of the scalar. */ - CUDA_DEVICE_CALLABLE - const Element operator()(size_type) const + __device__ inline const Element operator()(size_type) const { #if defined(__CUDA_ARCH__) return dscalar.value(); @@ -423,8 +420,7 @@ struct scalar_optional_accessor : public scalar_value_accessor { * * @return a thrust::optional for the scalar value. */ - CUDA_HOST_DEVICE_CALLABLE - const value_type operator()(size_type) const + CUDF_HOST_DEVICE inline const value_type operator()(size_type) const { if (has_nulls) { return (super_t::dscalar.is_valid()) ? Element{super_t::dscalar.value()} @@ -457,8 +453,7 @@ struct scalar_pair_accessor : public scalar_value_accessor { * * @return a pair with value and validity of the scalar. 
*/ - CUDA_HOST_DEVICE_CALLABLE - const value_type operator()(size_type) const + CUDF_HOST_DEVICE inline const value_type operator()(size_type) const { #if defined(__CUDA_ARCH__) return {Element(super_t::dscalar.value()), super_t::dscalar.is_valid()}; @@ -509,8 +504,7 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor::value, void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(DeviceScalar const& dscalar) const + __device__ inline rep_type get_rep(DeviceScalar const& dscalar) const { return dscalar.value(); } template ::value, void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(DeviceScalar const& dscalar) const + __device__ inline rep_type get_rep(DeviceScalar const& dscalar) const { return dscalar.rep(); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 6090477c28d..df06ad9e4f3 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -261,7 +261,7 @@ __global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bi */ struct bit_to_word_index { bit_to_word_index(bool inclusive) : inclusive(inclusive) {} - CUDA_DEVICE_CALLABLE size_type operator()(const size_type& bit_index) const + __device__ inline size_type operator()(const size_type& bit_index) const { return word_index(bit_index) + ((inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); } @@ -269,7 +269,7 @@ struct bit_to_word_index { }; struct popc { - CUDA_DEVICE_CALLABLE size_type operator()(bitmask_type word) const { return __popc(word); } + __device__ inline size_type operator()(bitmask_type word) const { return __popc(word); } }; // Count set/unset bits in a segmented null mask, using offset iterators accessible by the device. 
@@ -377,7 +377,7 @@ size_type validate_segmented_indices(IndexIterator indices_begin, IndexIterator } struct index_alternator { - CUDA_DEVICE_CALLABLE size_type operator()(const size_type& i) const + __device__ inline size_type operator()(const size_type& i) const { return *(d_indices + 2 * i + (is_end ? 1 : 0)); } diff --git a/cpp/include/cudf/detail/reduction_operators.cuh b/cpp/include/cudf/detail/reduction_operators.cuh index 866e26cd655..5a0cb4c1714 100644 --- a/cpp/include/cudf/detail/reduction_operators.cuh +++ b/cpp/include/cudf/detail/reduction_operators.cuh @@ -19,7 +19,7 @@ #include #include #include -#include //for CUDA_HOST_DEVICE_CALLABLE +#include //for CUDF_HOST_DEVICE #include #include @@ -32,14 +32,12 @@ struct var_std { ResultType value; /// the value ResultType value_squared; /// the value of squared - CUDA_HOST_DEVICE_CALLABLE - var_std(ResultType _value = 0, ResultType _value_squared = 0) + CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) : value(_value), value_squared(_value_squared){}; using this_t = var_std; - CUDA_HOST_DEVICE_CALLABLE - this_t operator+(this_t const& rhs) const + CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); }; @@ -50,8 +48,10 @@ template struct transformer_var_std { using OutputType = var_std; - CUDA_HOST_DEVICE_CALLABLE - OutputType operator()(ResultType const& value) { return OutputType(value, value * value); }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) + { + return OutputType(value, value * value); + }; }; // ------------------------------------------------------------------------ @@ -201,9 +201,9 @@ struct compound_op : public simple_op { * @return transformed output result of compound operator */ template - CUDA_HOST_DEVICE_CALLABLE static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const 
cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { // Enforced interface return Derived::template intermediate::compute_result(input, count, ddof); @@ -230,10 +230,9 @@ struct mean : public compound_op { using IntermediateType = ResultType; // sum value // compute `mean` from intermediate type `IntermediateType` - CUDA_HOST_DEVICE_CALLABLE - static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { return (input / count); }; @@ -252,10 +251,9 @@ struct variance : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `variance` from intermediate type `IntermediateType` - CUDA_HOST_DEVICE_CALLABLE - static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { ResultType mean = input.value / count; ResultType asum = input.value_squared; @@ -279,10 +277,9 @@ struct standard_deviation : public compound_op { using IntermediateType = var_std; // with sum of value, and sum of squared value // compute `standard deviation` from intermediate type `IntermediateType` - CUDA_HOST_DEVICE_CALLABLE - static ResultType compute_result(const IntermediateType& input, - const cudf::size_type& count, - const cudf::size_type& ddof) + CUDF_HOST_DEVICE inline static ResultType compute_result(const IntermediateType& input, + const cudf::size_type& count, + const cudf::size_type& ddof) { using intermediateOp = variance::template intermediate; ResultType var = 
intermediateOp::compute_result(input, count, ddof); diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 95605dc8a71..a59ad4c42ee 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -40,7 +40,7 @@ namespace detail { template ()>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE auto min(LHS const& lhs, RHS const& rhs) +CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { return std::min(lhs, rhs); } @@ -51,7 +51,7 @@ CUDA_HOST_DEVICE_CALLABLE auto min(LHS const& lhs, RHS const& rhs) template ()>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE auto max(LHS const& lhs, RHS const& rhs) +CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { return std::max(lhs, rhs); } @@ -62,7 +62,7 @@ CUDA_HOST_DEVICE_CALLABLE auto max(LHS const& lhs, RHS const& rhs) */ struct DeviceSum { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -94,13 +94,13 @@ struct DeviceSum { */ struct DeviceCount { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(const T& lhs, const T& rhs) { return T{DeviceCount{}(lhs.time_since_epoch(), rhs.time_since_epoch())}; } template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T&, const T& rhs) + CUDF_HOST_DEVICE inline T operator()(const T&, const T& rhs) { return rhs + T{1}; } @@ -117,7 +117,7 @@ struct DeviceCount { */ struct DeviceMin { template - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::min(lhs, rhs)) { return numeric::detail::min(lhs, rhs); @@ -142,7 +142,7 @@ struct DeviceMin { // 
@brief identity specialized for string_view template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE static constexpr T identity() + CUDF_HOST_DEVICE inline static constexpr T identity() { return string_view::max(); } @@ -159,7 +159,7 @@ struct DeviceMin { */ struct DeviceMax { template - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(cudf::detail::max(lhs, rhs)) { return numeric::detail::max(lhs, rhs); @@ -183,7 +183,7 @@ struct DeviceMax { } template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE static constexpr T identity() + CUDF_HOST_DEVICE inline static constexpr T identity() { return string_view::min(); } @@ -200,7 +200,7 @@ struct DeviceMax { */ struct DeviceProduct { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -224,7 +224,7 @@ struct DeviceProduct { */ struct DeviceAnd { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs) { return (lhs & rhs); } @@ -235,7 +235,7 @@ struct DeviceAnd { */ struct DeviceOr { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) { return (lhs | rhs); } @@ -246,7 +246,7 @@ struct DeviceOr { */ struct DeviceXor { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) + CUDF_HOST_DEVICE inline auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) { return (lhs ^ rhs); } @@ -258,7 +258,7 @@ struct DeviceXor { struct DeviceLeadLag { const 
size_type row_offset; - explicit CUDA_HOST_DEVICE_CALLABLE DeviceLeadLag(size_type offset_) : row_offset(offset_) {} + explicit CUDF_HOST_DEVICE inline DeviceLeadLag(size_type offset_) : row_offset(offset_) {} }; } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index f3390d9387b..c35d24ddeac 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -31,7 +31,7 @@ namespace detail { * Normalization of floating point NaNs and zeros, passthrough for all other values. */ template -T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros(T const& key) +T __device__ inline normalize_nans_and_zeros(T const& key) { if constexpr (is_floating_point()) { if (isnan(key)) { @@ -50,7 +50,7 @@ T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros(T const& key) * Licensed under the MIT license. * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT */ -void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destination) +void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) { // Transform 0xABCD1234 => 0x0000ABCD00001234 => 0x0B0A0D0C02010403 uint64_t x = num; @@ -86,12 +86,12 @@ struct MurmurHash3_32 { MurmurHash3_32() = default; constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const + __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const { return (x << r) | (x >> (32 - r)); } - CUDA_DEVICE_CALLABLE uint32_t fmix32(uint32_t h) const + __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -118,7 +118,7 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - CUDA_DEVICE_CALLABLE result_type hash_combine(result_type lhs, result_type rhs) + __device__ inline result_type hash_combine(result_type lhs, 
result_type rhs) { result_type combined{lhs}; @@ -127,11 +127,11 @@ struct MurmurHash3_32 { return combined; } - result_type CUDA_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); } + result_type __device__ inline operator()(Key const& key) const { return compute(key); } // compute wrapper for floating point types template ::value>* = nullptr> - hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const + hash_value_type __device__ inline compute_floating_point(T const& key) const { if (key == T{0.0}) { return compute(T{0.0}); @@ -144,7 +144,7 @@ struct MurmurHash3_32 { } template - result_type CUDA_DEVICE_CALLABLE compute(TKey const& key) const + result_type __device__ inline compute(TKey const& key) const { constexpr int len = sizeof(argument_type); uint8_t const* const data = reinterpret_cast(&key); @@ -191,7 +191,7 @@ struct MurmurHash3_32 { }; template <> -hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(bool const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(bool const& key) const { return this->compute(static_cast(key)); } @@ -200,8 +200,8 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(bool const * @brief Specialization of MurmurHash3_32 operator for strings. 
*/ template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(cudf::string_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::string_view const& key) const { auto const len = key.size_bytes(); uint8_t const* data = reinterpret_cast(key.data()); @@ -249,49 +249,49 @@ MurmurHash3_32::operator()(cudf::string_view const& key) cons } template <> -hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(float const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(float const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(double const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()(double const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(numeric::decimal32 const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + numeric::decimal32 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(numeric::decimal64 const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + numeric::decimal64 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(numeric::decimal128 const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + numeric::decimal128 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -MurmurHash3_32::operator()(cudf::list_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::list_view const& key) const { cudf_assert(false && "List column hashing is not supported"); return 0; } template <> -hash_value_type CUDA_DEVICE_CALLABLE 
-MurmurHash3_32::operator()(cudf::struct_view const& key) const +hash_value_type __device__ inline MurmurHash3_32::operator()( + cudf::struct_view const& key) const { cudf_assert(false && "Direct hashing of struct_view is not supported"); return 0; @@ -305,12 +305,12 @@ struct SparkMurmurHash3_32 { SparkMurmurHash3_32() = default; constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {} - CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const + __device__ inline uint32_t rotl32(uint32_t x, int8_t r) const { return (x << r) | (x >> (32 - r)); } - CUDA_DEVICE_CALLABLE uint32_t fmix32(uint32_t h) const + __device__ inline uint32_t fmix32(uint32_t h) const { h ^= h >> 16; h *= 0x85ebca6b; @@ -320,11 +320,11 @@ struct SparkMurmurHash3_32 { return h; } - result_type CUDA_DEVICE_CALLABLE operator()(Key const& key) const { return compute(key); } + result_type __device__ inline operator()(Key const& key) const { return compute(key); } // compute wrapper for floating point types template ::value>* = nullptr> - hash_value_type CUDA_DEVICE_CALLABLE compute_floating_point(T const& key) const + hash_value_type __device__ inline compute_floating_point(T const& key) const { if (isnan(key)) { T nan = std::numeric_limits::quiet_NaN(); @@ -335,7 +335,7 @@ struct SparkMurmurHash3_32 { } template - result_type CUDA_DEVICE_CALLABLE compute(TKey const& key) const + result_type __device__ inline compute(TKey const& key) const { constexpr int len = sizeof(TKey); int8_t const* const data = reinterpret_cast(&key); @@ -379,71 +379,68 @@ struct SparkMurmurHash3_32 { }; template <> -hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(bool const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(bool const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(int8_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int8_t const& 
key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(uint8_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(uint8_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(int16_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(int16_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(uint16_t const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + uint16_t const& key) const { return this->compute(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(numeric::decimal32 const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal32 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(numeric::decimal64 const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal64 const& key) const { return this->compute(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(numeric::decimal128 const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + numeric::decimal128 const& key) const { return this->compute<__int128_t>(key.value()); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(cudf::list_view const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::list_view const& key) const { cudf_assert(false && "List column hashing is not supported"); return 0; } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(cudf::struct_view const& key) const +hash_value_type 
__device__ inline SparkMurmurHash3_32::operator()( + cudf::struct_view const& key) const { cudf_assert(false && "Direct hashing of struct_view is not supported"); return 0; @@ -453,8 +450,8 @@ SparkMurmurHash3_32::operator()(cudf::struct_view const& key) * @brief Specialization of MurmurHash3_32 operator for strings. */ template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(cudf::string_view const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()( + cudf::string_view const& key) const { auto const len = key.size_bytes(); int8_t const* data = reinterpret_cast(key.data()); @@ -499,14 +496,13 @@ SparkMurmurHash3_32::operator()(cudf::string_view const& key) } template <> -hash_value_type CUDA_DEVICE_CALLABLE SparkMurmurHash3_32::operator()(float const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(float const& key) const { return this->compute_floating_point(key); } template <> -hash_value_type CUDA_DEVICE_CALLABLE -SparkMurmurHash3_32::operator()(double const& key) const +hash_value_type __device__ inline SparkMurmurHash3_32::operator()(double const& key) const { return this->compute_floating_point(key); } diff --git a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh index 05a788abd45..12774f57c6a 100644 --- a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh +++ b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh @@ -43,14 +43,13 @@ struct null_replacing_transformer { using type = ResultType; Functor f; type replacement; - CUDA_HOST_DEVICE_CALLABLE - null_replacing_transformer(type null_replacement, Functor transformer) + CUDF_HOST_DEVICE inline null_replacing_transformer(type null_replacement, Functor transformer) : f(transformer), replacement(null_replacement) { } template - CUDA_HOST_DEVICE_CALLABLE type operator()(thrust::pair const& pair_value) + CUDF_HOST_DEVICE 
inline type operator()(thrust::pair const& pair_value) { if (pair_value.second) return f(pair_value.first); @@ -76,22 +75,21 @@ struct meanvar { ElementType value_squared; /// the value of squared cudf::size_type count; /// the count - CUDA_HOST_DEVICE_CALLABLE - meanvar(ElementType _value = 0, ElementType _value_squared = 0, cudf::size_type _count = 0) + CUDF_HOST_DEVICE inline meanvar(ElementType _value = 0, + ElementType _value_squared = 0, + cudf::size_type _count = 0) : value(_value), value_squared(_value_squared), count(_count){}; using this_t = cudf::meanvar; - CUDA_HOST_DEVICE_CALLABLE - this_t operator+(this_t const& rhs) const + CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared), (this->count + rhs.count)); }; - CUDA_HOST_DEVICE_CALLABLE - bool operator==(this_t const& rhs) const + CUDF_HOST_DEVICE inline bool operator==(this_t const& rhs) const { return ((this->value == rhs.value) && (this->value_squared == rhs.value_squared) && (this->count == rhs.count)); @@ -113,8 +111,10 @@ struct meanvar { */ template struct transformer_squared { - CUDA_HOST_DEVICE_CALLABLE - ElementType operator()(ElementType const& value) { return (value * value); }; + CUDF_HOST_DEVICE inline ElementType operator()(ElementType const& value) + { + return (value * value); + }; }; /** @@ -130,8 +130,7 @@ template struct transformer_meanvar { using ResultType = meanvar; - CUDA_HOST_DEVICE_CALLABLE - ResultType operator()(thrust::pair const& pair) + CUDF_HOST_DEVICE inline ResultType operator()(thrust::pair const& pair) { ElementType v = pair.first; return meanvar(v, v * v, (pair.second) ? 
1 : 0); diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index e8223b53997..727dce0db9d 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -79,7 +79,7 @@ template && is_supported_representation_type())>* = nullptr> -CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent) +CUDF_HOST_DEVICE inline Rep ipow(T exponent) { cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); if (exponent == 0) return static_cast(1); @@ -108,7 +108,7 @@ CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent) * @return Shifted value of type T */ template -CUDA_HOST_DEVICE_CALLABLE constexpr T right_shift(T const& val, scale_type const& scale) +CUDF_HOST_DEVICE inline constexpr T right_shift(T const& val, scale_type const& scale) { return val / ipow(static_cast(scale)); } @@ -125,7 +125,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T right_shift(T const& val, scale_type const * @return Shifted value of type T */ template -CUDA_HOST_DEVICE_CALLABLE constexpr T left_shift(T const& val, scale_type const& scale) +CUDF_HOST_DEVICE inline constexpr T left_shift(T const& val, scale_type const& scale) { return val * ipow(static_cast(-scale)); } @@ -144,7 +144,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T left_shift(T const& val, scale_type const& * @return Shifted value of type T */ template -CUDA_HOST_DEVICE_CALLABLE constexpr T shift(T const& val, scale_type const& scale) +CUDF_HOST_DEVICE inline constexpr T shift(T const& val, scale_type const& scale) { if (scale == 0) return val; @@ -179,7 +179,7 @@ template () && is_supported_representation_type()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale) + CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) : _value{static_cast(detail::shift(value, scale))}, _scale{scale} { } @@ -226,7 +226,7 @@ class fixed_point { template () && 
is_supported_representation_type()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE explicit fixed_point(T const& value, scale_type const& scale) + CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale) // `value` is cast to `Rep` to avoid overflow in cases where // constructing to `Rep` that is wider than `T` : _value{detail::shift(static_cast(value), scale)}, _scale{scale} @@ -238,8 +238,10 @@ class fixed_point { * * @param s scaled_integer that contains scale and already shifted value */ - CUDA_HOST_DEVICE_CALLABLE - explicit fixed_point(scaled_integer s) : _value{s.value}, _scale{s.scale} {} + CUDF_HOST_DEVICE inline explicit fixed_point(scaled_integer s) + : _value{s.value}, _scale{s.scale} + { + } /** * @brief "Scale-less" constructor that constructs `fixed_point` number with a specified @@ -247,7 +249,7 @@ class fixed_point { */ template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE fixed_point(T const& value) + CUDF_HOST_DEVICE inline fixed_point(T const& value) : _value{static_cast(value)}, _scale{scale_type{0}} { } @@ -256,8 +258,7 @@ class fixed_point { * @brief Default constructor that constructs `fixed_point` number with a * value and scale of zero */ - CUDA_HOST_DEVICE_CALLABLE - fixed_point() : _value{0}, _scale{scale_type{0}} {} + CUDF_HOST_DEVICE inline fixed_point() : _value{0}, _scale{scale_type{0}} {} /** * @brief Explicit conversion operator for casting to floating point types @@ -289,7 +290,7 @@ class fixed_point { return static_cast(detail::shift(value, scale_type{-_scale})); } - CUDA_HOST_DEVICE_CALLABLE operator scaled_integer() const + CUDF_HOST_DEVICE inline operator scaled_integer() const { return scaled_integer{_value, _scale}; } @@ -299,21 +300,21 @@ class fixed_point { * * @return The underlying value of the `fixed_point` number */ - CUDA_HOST_DEVICE_CALLABLE rep value() const { return _value; } + CUDF_HOST_DEVICE inline rep value() const { return _value; } /** * @brief Method that returns the scale of the 
`fixed_point` number * * @return The scale of the `fixed_point` number */ - CUDA_HOST_DEVICE_CALLABLE scale_type scale() const { return _scale; } + CUDF_HOST_DEVICE inline scale_type scale() const { return _scale; } /** * @brief Explicit conversion operator to `bool` * * @return The `fixed_point` value as a boolean (zero is `false`, nonzero is `true`) */ - CUDA_HOST_DEVICE_CALLABLE explicit constexpr operator bool() const + CUDF_HOST_DEVICE inline explicit constexpr operator bool() const { return static_cast(_value); } @@ -326,7 +327,7 @@ class fixed_point { * @return The sum */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator+=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator+=(fixed_point const& rhs) { *this = *this + rhs; return *this; @@ -340,7 +341,7 @@ class fixed_point { * @return The product */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator*=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator*=(fixed_point const& rhs) { *this = *this * rhs; return *this; @@ -354,7 +355,7 @@ class fixed_point { * @return The difference */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator-=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator-=(fixed_point const& rhs) { *this = *this - rhs; return *this; @@ -368,7 +369,7 @@ class fixed_point { * @return The quotient */ template - CUDA_HOST_DEVICE_CALLABLE fixed_point& operator/=(fixed_point const& rhs) + CUDF_HOST_DEVICE inline fixed_point& operator/=(fixed_point const& rhs) { *this = *this / rhs; return *this; @@ -379,8 +380,7 @@ class fixed_point { * * @return The incremented result */ - CUDA_HOST_DEVICE_CALLABLE - fixed_point& operator++() + CUDF_HOST_DEVICE inline fixed_point& operator++() { *this = *this + fixed_point{1, scale_type{_scale}}; return *this; @@ -398,7 +398,7 @@ class fixed_point { * @return The resulting `fixed_point` sum */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator+( + CUDF_HOST_DEVICE 
inline friend fixed_point operator+( fixed_point const& lhs, fixed_point const& rhs); /** @@ -413,7 +413,7 @@ class fixed_point { * @return The resulting `fixed_point` difference */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator-( + CUDF_HOST_DEVICE inline friend fixed_point operator-( fixed_point const& lhs, fixed_point const& rhs); /** @@ -426,7 +426,7 @@ class fixed_point { * @return The resulting `fixed_point` product */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator*( + CUDF_HOST_DEVICE inline friend fixed_point operator*( fixed_point const& lhs, fixed_point const& rhs); /** @@ -439,7 +439,7 @@ class fixed_point { * @return The resulting `fixed_point` quotient */ template - CUDA_HOST_DEVICE_CALLABLE friend fixed_point operator/( + CUDF_HOST_DEVICE inline friend fixed_point operator/( fixed_point const& lhs, fixed_point const& rhs); /** @@ -454,8 +454,8 @@ class fixed_point { * @return true if `lhs` and `rhs` are equal, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator==(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator==(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator != (for comparing two `fixed_point` numbers) @@ -469,8 +469,8 @@ class fixed_point { * @return true if `lhs` and `rhs` are not equal, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator!=(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator!=(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator <= (for comparing two `fixed_point` numbers) @@ -484,8 +484,8 @@ class fixed_point { * @return true if `lhs` less than or equal to `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator<=(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator<=(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator >= 
(for comparing two `fixed_point` numbers) @@ -499,8 +499,8 @@ class fixed_point { * @return true if `lhs` greater than or equal to `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator>=(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator>=(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator < (for comparing two `fixed_point` numbers) @@ -514,8 +514,8 @@ class fixed_point { * @return true if `lhs` less than `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator<(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator<(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief operator > (for comparing two `fixed_point` numbers) @@ -529,8 +529,8 @@ class fixed_point { * @return true if `lhs` greater than `rhs`, false if not */ template - CUDA_HOST_DEVICE_CALLABLE friend bool operator>(fixed_point const& lhs, - fixed_point const& rhs); + CUDF_HOST_DEVICE inline friend bool operator>(fixed_point const& lhs, + fixed_point const& rhs); /** * @brief Method for creating a `fixed_point` number with a new `scale` @@ -541,7 +541,7 @@ class fixed_point { * @param scale The `scale` of the returned `fixed_point` number * @return `fixed_point` number with a new `scale` */ - CUDA_HOST_DEVICE_CALLABLE fixed_point rescaled(scale_type scale) const + CUDF_HOST_DEVICE inline fixed_point rescaled(scale_type scale) const { if (scale == _scale) return *this; Rep const value = detail::shift(_value, scale_type{scale - _scale}); @@ -580,7 +580,7 @@ class fixed_point { * @return true if addition causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto addition_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto addition_overflow(T lhs, T rhs) { return rhs > 0 ? 
lhs > cuda::std::numeric_limits::max() - rhs : lhs < cuda::std::numeric_limits::min() - rhs; @@ -595,7 +595,7 @@ CUDA_HOST_DEVICE_CALLABLE auto addition_overflow(T lhs, T rhs) * @return true if subtraction causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto subtraction_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto subtraction_overflow(T lhs, T rhs) { return rhs > 0 ? lhs < cuda::std::numeric_limits::min() + rhs : lhs > cuda::std::numeric_limits::max() + rhs; @@ -610,7 +610,7 @@ CUDA_HOST_DEVICE_CALLABLE auto subtraction_overflow(T lhs, T rhs) * @return true if division causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto division_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto division_overflow(T lhs, T rhs) { return lhs == cuda::std::numeric_limits::min() && rhs == -1; } @@ -624,7 +624,7 @@ CUDA_HOST_DEVICE_CALLABLE auto division_overflow(T lhs, T rhs) * @return true if multiplication causes overflow, false otherwise */ template -CUDA_HOST_DEVICE_CALLABLE auto multiplication_overflow(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto multiplication_overflow(T lhs, T rhs) { auto const min = cuda::std::numeric_limits::min(); auto const max = cuda::std::numeric_limits::max(); @@ -638,8 +638,8 @@ CUDA_HOST_DEVICE_CALLABLE auto multiplication_overflow(T lhs, T rhs) // PLUS Operation template -CUDA_HOST_DEVICE_CALLABLE fixed_point operator+(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator+(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); auto const sum = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value; @@ -656,8 +656,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator+(fixed_point -CUDA_HOST_DEVICE_CALLABLE fixed_point operator-(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator-(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = 
std::min(lhs._scale, rhs._scale); auto const diff = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value; @@ -674,8 +674,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator-(fixed_point -CUDA_HOST_DEVICE_CALLABLE fixed_point operator*(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator*(fixed_point const& lhs, + fixed_point const& rhs) { #if defined(__CUDACC_DEBUG__) @@ -689,8 +689,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator*(fixed_point -CUDA_HOST_DEVICE_CALLABLE fixed_point operator/(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline fixed_point operator/(fixed_point const& lhs, + fixed_point const& rhs) { #if defined(__CUDACC_DEBUG__) @@ -704,8 +704,8 @@ CUDA_HOST_DEVICE_CALLABLE fixed_point operator/(fixed_point -CUDA_HOST_DEVICE_CALLABLE bool operator==(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator==(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value; @@ -713,8 +713,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator==(fixed_point const& lhs, // EQUALITY NOT COMPARISON Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator!=(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator!=(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value; @@ -722,8 +722,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator!=(fixed_point const& lhs, // LESS THAN OR EQUAL TO Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator<=(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator<=(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value; @@ -731,8 +731,8 @@ 
CUDA_HOST_DEVICE_CALLABLE bool operator<=(fixed_point const& lhs, // GREATER THAN OR EQUAL TO Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator>=(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator>=(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value; @@ -740,8 +740,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator>=(fixed_point const& lhs, // LESS THAN Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator<(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator<(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value; @@ -749,8 +749,8 @@ CUDA_HOST_DEVICE_CALLABLE bool operator<(fixed_point const& lhs, // GREATER THAN Operation template -CUDA_HOST_DEVICE_CALLABLE bool operator>(fixed_point const& lhs, - fixed_point const& rhs) +CUDF_HOST_DEVICE inline bool operator>(fixed_point const& lhs, + fixed_point const& rhs) { auto const scale = std::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 2b50e273517..be900f252f6 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -62,13 +62,13 @@ constexpr auto abs(T value) } template -CUDA_HOST_DEVICE_CALLABLE auto min(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto min(T lhs, T rhs) { return lhs < rhs ? lhs : rhs; } template -CUDA_HOST_DEVICE_CALLABLE auto max(T lhs, T rhs) +CUDF_HOST_DEVICE inline auto max(T lhs, T rhs) { return lhs > rhs ? 
lhs : rhs; } diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh index 7d0586ed6a6..bdf68037944 100644 --- a/cpp/include/cudf/lists/detail/scatter_helper.cuh +++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh @@ -65,9 +65,9 @@ struct unbound_list_view { * @param lists_column The actual source/target lists column * @param row_index Index of the row in lists_column that this instance represents */ - CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label, - cudf::detail::lists_column_device_view const& lists_column, - size_type const& row_index) + __device__ inline unbound_list_view(label_type scatter_source_label, + cudf::detail::lists_column_device_view const& lists_column, + size_type const& row_index) : _label{scatter_source_label}, _row_index{row_index} { _size = list_device_view{lists_column, row_index}.size(); @@ -81,9 +81,9 @@ struct unbound_list_view { * @param row_index Index of the row that this instance represents in the source/target column * @param size The number of elements in this list row */ - CUDA_DEVICE_CALLABLE unbound_list_view(label_type scatter_source_label, - size_type const& row_index, - size_type const& size) + __device__ inline unbound_list_view(label_type scatter_source_label, + size_type const& row_index, + size_type const& size) : _label{scatter_source_label}, _row_index{row_index}, _size{size} { } @@ -91,17 +91,17 @@ struct unbound_list_view { /** * @brief Returns number of elements in this list row. 
*/ - CUDA_DEVICE_CALLABLE size_type size() const { return _size; } + __device__ inline size_type size() const { return _size; } /** * @brief Returns whether this row came from the `scatter()` source or target */ - CUDA_DEVICE_CALLABLE label_type label() const { return _label; } + __device__ inline label_type label() const { return _label; } /** * @brief Returns the index in the source/target column */ - CUDA_DEVICE_CALLABLE size_type row_index() const { return _row_index; } + __device__ inline size_type row_index() const { return _row_index; } /** * @brief Binds to source/target column (depending on SOURCE/TARGET labels), @@ -111,9 +111,9 @@ struct unbound_list_view { * @param scatter_target Target column for the scatter operation * @return A (bound) list_view for the row that this object represents */ - CUDA_DEVICE_CALLABLE list_device_view - bind_to_column(lists_column_device_view const& scatter_source, - lists_column_device_view const& scatter_target) const + __device__ inline list_device_view bind_to_column( + lists_column_device_view const& scatter_source, + lists_column_device_view const& scatter_target) const { return list_device_view(_label == label_type::SOURCE ? 
scatter_source : scatter_target, _row_index); diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 659fb1e6b2a..5071f046e0c 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -32,8 +32,8 @@ class list_device_view { public: list_device_view() = default; - CUDA_DEVICE_CALLABLE list_device_view(lists_column_device_view const& lists_column, - size_type const& row_index) + __device__ inline list_device_view(lists_column_device_view const& lists_column, + size_type const& row_index) : lists_column(lists_column), _row_index(row_index) { column_device_view const& offsets = lists_column.offsets(); @@ -69,7 +69,7 @@ class list_device_view { * The offset of this element as stored in the child column (i.e. 5) * may be fetched using this method. */ - CUDA_DEVICE_CALLABLE size_type element_offset(size_type idx) const + __device__ inline size_type element_offset(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "idx out of bounds"); return begin_offset + idx; @@ -83,7 +83,7 @@ class list_device_view { * @return The element at the specified index of the list row. */ template - CUDA_DEVICE_CALLABLE T element(size_type idx) const + __device__ inline T element(size_type idx) const { return lists_column.child().element(element_offset(idx)); } @@ -91,7 +91,7 @@ class list_device_view { /** * @brief Checks whether element is null at specified index in the list row. */ - CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const + __device__ inline bool is_null(size_type idx) const { cudf_assert(idx >= 0 && idx < size() && "Index out of bounds."); auto element_offset = begin_offset + idx; @@ -101,17 +101,17 @@ class list_device_view { /** * @brief Checks whether this list row is null. 
*/ - CUDA_DEVICE_CALLABLE bool is_null() const { return lists_column.is_null(_row_index); } + __device__ inline bool is_null() const { return lists_column.is_null(_row_index); } /** * @brief Fetches the number of elements in this list row. */ - CUDA_DEVICE_CALLABLE size_type size() const { return _size; } + __device__ inline size_type size() const { return _size; } /** * @brief Fetches the lists_column_device_view that contains this list. */ - CUDA_DEVICE_CALLABLE lists_column_device_view const& get_column() const { return lists_column; } + __device__ inline lists_column_device_view const& get_column() const { return lists_column; } template struct pair_accessor; @@ -141,7 +141,7 @@ class list_device_view { * 2. `p.second == false` */ template - CUDA_DEVICE_CALLABLE const_pair_iterator pair_begin() const + __device__ inline const_pair_iterator pair_begin() const { return const_pair_iterator{thrust::counting_iterator(0), pair_accessor{*this}}; } @@ -151,7 +151,7 @@ class list_device_view { * list_device_view. */ template - CUDA_DEVICE_CALLABLE const_pair_iterator pair_end() const + __device__ inline const_pair_iterator pair_end() const { return const_pair_iterator{thrust::counting_iterator(size()), pair_accessor{*this}}; @@ -173,7 +173,7 @@ class list_device_view { * 2. `p.second == false` */ template - CUDA_DEVICE_CALLABLE const_pair_rep_iterator pair_rep_begin() const + __device__ inline const_pair_rep_iterator pair_rep_begin() const { return const_pair_rep_iterator{thrust::counting_iterator(0), pair_rep_accessor{*this}}; @@ -184,7 +184,7 @@ class list_device_view { * list_device_view. */ template - CUDA_DEVICE_CALLABLE const_pair_rep_iterator pair_rep_end() const + __device__ inline const_pair_rep_iterator pair_rep_end() const { return const_pair_rep_iterator{thrust::counting_iterator(size()), pair_rep_accessor{*this}}; @@ -215,7 +215,7 @@ class list_device_view { * * @param _list The `list_device_view` whose rows are being accessed. 
*/ - explicit CUDA_HOST_DEVICE_CALLABLE pair_accessor(list_device_view const& _list) : list{_list} {} + explicit CUDF_HOST_DEVICE inline pair_accessor(list_device_view const& _list) : list{_list} {} /** * @brief Accessor for the {data, validity} pair at the specified index @@ -223,8 +223,7 @@ class list_device_view { * @param i Index into the list_device_view * @return A pair of data element and its validity flag. */ - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {list.element(i), !list.is_null(i)}; } @@ -253,8 +252,7 @@ class list_device_view { * * @param _list The `list_device_view` whose rows are being accessed. */ - explicit CUDA_HOST_DEVICE_CALLABLE pair_rep_accessor(list_device_view const& _list) - : list{_list} + explicit CUDF_HOST_DEVICE inline pair_rep_accessor(list_device_view const& _list) : list{_list} { } @@ -264,21 +262,20 @@ class list_device_view { * @param i Index into the list_device_view * @return A pair of data element and its validity flag. 
*/ - CUDA_DEVICE_CALLABLE - thrust::pair operator()(cudf::size_type i) const + __device__ inline thrust::pair operator()(cudf::size_type i) const { return {get_rep(i), !list.is_null(i)}; } private: template , void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(cudf::size_type i) const + __device__ inline rep_type get_rep(cudf::size_type i) const { return list.element(i); } template , void>* = nullptr> - CUDA_DEVICE_CALLABLE rep_type get_rep(cudf::size_type i) const + __device__ inline rep_type get_rep(cudf::size_type i) const { return list.element(i).value(); } @@ -291,7 +288,7 @@ class list_device_view { */ struct list_size_functor { column_device_view const d_column; - CUDA_HOST_DEVICE_CALLABLE list_size_functor(column_device_view const& d_col) : d_column(d_col) + CUDF_HOST_DEVICE inline list_size_functor(column_device_view const& d_col) : d_column(d_col) { #if defined(__CUDA_ARCH__) cudf_assert(d_col.type().id() == type_id::LIST && "Only list type column is supported"); @@ -299,7 +296,7 @@ struct list_size_functor { CUDF_EXPECTS(d_col.type().id() == type_id::LIST, "Only list type column is supported"); #endif } - CUDA_DEVICE_CALLABLE size_type operator()(size_type idx) + __device__ inline size_type operator()(size_type idx) { if (d_column.is_null(idx)) return size_type{0}; auto d_offsets = diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh index d8f082c9a42..aff088a7f44 100644 --- a/cpp/include/cudf/lists/lists_column_device_view.cuh +++ b/cpp/include/cudf/lists/lists_column_device_view.cuh @@ -46,12 +46,12 @@ class lists_column_device_view { /** * @brief Fetches number of rows in the lists column */ - CUDA_HOST_DEVICE_CALLABLE cudf::size_type size() const { return underlying.size(); } + CUDF_HOST_DEVICE inline cudf::size_type size() const { return underlying.size(); } /** * @brief Fetches the offsets column of the underlying list column. 
*/ - CUDA_DEVICE_CALLABLE column_device_view offsets() const + __device__ inline column_device_view offsets() const { return underlying.child(lists_column_view::offsets_column_index); } @@ -59,7 +59,7 @@ class lists_column_device_view { /** * @brief Fetches the child column of the underlying list column. */ - CUDA_DEVICE_CALLABLE column_device_view child() const + __device__ inline column_device_view child() const { return underlying.child(lists_column_view::child_column_index); } @@ -67,19 +67,19 @@ class lists_column_device_view { /** * @brief Indicates whether the list column is nullable. */ - CUDA_DEVICE_CALLABLE bool nullable() const { return underlying.nullable(); } + __device__ inline bool nullable() const { return underlying.nullable(); } /** * @brief Indicates whether the row (i.e. list) at the specified * index is null. */ - CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const { return underlying.is_null(idx); } + __device__ inline bool is_null(size_type idx) const { return underlying.is_null(idx); } /** * @brief Fetches the offset of the underlying column_device_view, * in case it is a sliced/offset column. */ - CUDA_DEVICE_CALLABLE size_type offset() const { return underlying.offset(); } + __device__ inline size_type offset() const { return underlying.offset(); } private: column_device_view underlying; diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index 9081fa23eec..8435c47eaf5 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -48,7 +48,7 @@ class get_json_object_options { * @brief Returns true/false depending on whether single-quotes for representing strings * are allowed. 
*/ - CUDA_HOST_DEVICE_CALLABLE bool get_allow_single_quotes() const { return allow_single_quotes; } + CUDF_HOST_DEVICE inline bool get_allow_single_quotes() const { return allow_single_quotes; } /** * @brief Returns true/false depending on whether individually returned string values have @@ -72,7 +72,7 @@ class get_json_object_options { * * @endcode */ - CUDA_HOST_DEVICE_CALLABLE bool get_strip_quotes_from_single_strings() const + CUDF_HOST_DEVICE inline bool get_strip_quotes_from_single_strings() const { return strip_quotes_from_single_strings; } diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 238d55d580e..43a90997c86 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -80,7 +80,7 @@ static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; * * @return An empty string */ -CUDA_HOST_DEVICE_CALLABLE string_view string_view::min() { return string_view(); } +CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); } /** * @brief Return maximum value associated with the string type @@ -91,7 +91,7 @@ CUDA_HOST_DEVICE_CALLABLE string_view string_view::min() { return string_view(); * @return A string value which represents the highest possible valid UTF-8 encoded * character. 
*/ -CUDA_HOST_DEVICE_CALLABLE string_view string_view::max() +CUDF_HOST_DEVICE inline string_view string_view::max() { const char* psentinel{nullptr}; #if defined(__CUDA_ARCH__) diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index be182cb0e9d..22409ab3dc7 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -51,20 +51,20 @@ class string_view { /** * @brief Return the number of bytes in this string */ - CUDA_HOST_DEVICE_CALLABLE size_type size_bytes() const { return _bytes; } + CUDF_HOST_DEVICE inline size_type size_bytes() const { return _bytes; } /** * @brief Return the number of characters in this string */ - CUDA_DEVICE_CALLABLE size_type length() const; + __device__ inline size_type length() const; /** * @brief Return a pointer to the internal device array */ - CUDA_HOST_DEVICE_CALLABLE const char* data() const { return _data; } + CUDF_HOST_DEVICE inline const char* data() const { return _data; } /** * @brief Return true if string has no characters */ - CUDA_HOST_DEVICE_CALLABLE bool empty() const { return size_bytes() == 0; } + CUDF_HOST_DEVICE inline bool empty() const { return size_bytes() == 0; } /** * @brief Handy iterator for navigating through encoded characters. 
@@ -76,28 +76,28 @@ class string_view { using reference = char_utf8&; using pointer = char_utf8*; using iterator_category = std::input_iterator_tag; - CUDA_DEVICE_CALLABLE const_iterator(const string_view& str, size_type pos); + __device__ inline const_iterator(const string_view& str, size_type pos); const_iterator(const const_iterator& mit) = default; const_iterator(const_iterator&& mit) = default; const_iterator& operator=(const const_iterator&) = default; const_iterator& operator=(const_iterator&&) = default; - CUDA_DEVICE_CALLABLE const_iterator& operator++(); - CUDA_DEVICE_CALLABLE const_iterator operator++(int); - CUDA_DEVICE_CALLABLE const_iterator& operator+=(difference_type); - CUDA_DEVICE_CALLABLE const_iterator operator+(difference_type); - CUDA_DEVICE_CALLABLE const_iterator& operator--(); - CUDA_DEVICE_CALLABLE const_iterator operator--(int); - CUDA_DEVICE_CALLABLE const_iterator& operator-=(difference_type); - CUDA_DEVICE_CALLABLE const_iterator operator-(difference_type); - CUDA_DEVICE_CALLABLE bool operator==(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator!=(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator<(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator<=(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator>(const const_iterator&) const; - CUDA_DEVICE_CALLABLE bool operator>=(const const_iterator&) const; - CUDA_DEVICE_CALLABLE char_utf8 operator*() const; - CUDA_DEVICE_CALLABLE size_type position() const; - CUDA_DEVICE_CALLABLE size_type byte_offset() const; + __device__ inline const_iterator& operator++(); + __device__ inline const_iterator operator++(int); + __device__ inline const_iterator& operator+=(difference_type); + __device__ inline const_iterator operator+(difference_type); + __device__ inline const_iterator& operator--(); + __device__ inline const_iterator operator--(int); + __device__ inline const_iterator& operator-=(difference_type); + __device__ inline 
const_iterator operator-(difference_type); + __device__ inline bool operator==(const const_iterator&) const; + __device__ inline bool operator!=(const const_iterator&) const; + __device__ inline bool operator<(const const_iterator&) const; + __device__ inline bool operator<=(const const_iterator&) const; + __device__ inline bool operator>(const const_iterator&) const; + __device__ inline bool operator>=(const const_iterator&) const; + __device__ inline char_utf8 operator*() const; + __device__ inline size_type position() const; + __device__ inline size_type byte_offset() const; private: const char* p{}; @@ -109,24 +109,24 @@ class string_view { /** * @brief Return new iterator pointing to the beginning of this string */ - CUDA_DEVICE_CALLABLE const_iterator begin() const; + __device__ inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string */ - CUDA_DEVICE_CALLABLE const_iterator end() const; + __device__ inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position * * @param pos Character position */ - CUDA_DEVICE_CALLABLE char_utf8 operator[](size_type pos) const; + __device__ inline char_utf8 operator[](size_type pos) const; /** * @brief Return the byte offset from data() for a given character position * * @param pos Character position */ - CUDA_DEVICE_CALLABLE size_type byte_offset(size_type pos) const; + __device__ inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. Each character is compared @@ -141,7 +141,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - CUDA_DEVICE_CALLABLE int compare(const string_view& str) const; + __device__ inline int compare(const string_view& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. 
@@ -156,32 +156,32 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - CUDA_DEVICE_CALLABLE int compare(const char* str, size_type bytes) const; + __device__ inline int compare(const char* str, size_type bytes) const; /** * @brief Returns true if rhs matches this string exactly. */ - CUDA_DEVICE_CALLABLE bool operator==(const string_view& rhs) const; + __device__ inline bool operator==(const string_view& rhs) const; /** * @brief Returns true if rhs does not match this string. */ - CUDA_DEVICE_CALLABLE bool operator!=(const string_view& rhs) const; + __device__ inline bool operator!=(const string_view& rhs) const; /** * @brief Returns true if this string is ordered before rhs. */ - CUDA_DEVICE_CALLABLE bool operator<(const string_view& rhs) const; + __device__ inline bool operator<(const string_view& rhs) const; /** * @brief Returns true if rhs is ordered before this string. */ - CUDA_DEVICE_CALLABLE bool operator>(const string_view& rhs) const; + __device__ inline bool operator>(const string_view& rhs) const; /** * @brief Returns true if this string matches or is ordered before rhs. */ - CUDA_DEVICE_CALLABLE bool operator<=(const string_view& rhs) const; + __device__ inline bool operator<=(const string_view& rhs) const; /** * @brief Returns true if rhs matches or is ordered before this string. */ - CUDA_DEVICE_CALLABLE bool operator>=(const string_view& rhs) const; + __device__ inline bool operator>=(const string_view& rhs) const; /** * @brief Returns the character position of the first occurrence where the @@ -193,9 +193,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if str is not found in this string. 
*/ - CUDA_DEVICE_CALLABLE size_type find(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type find(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the first occurrence where the * array str is found in this string within the character range [pos,pos+n). @@ -207,10 +207,10 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type find(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type find(const char* str, + size_type bytes, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the first occurrence where * character is found in this string within the character range [pos,pos+n). @@ -221,9 +221,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type find(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type find(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * argument str is found in this string within the character range [pos,pos+n). @@ -234,9 +234,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type rfind(const string_view& str, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type rfind(const string_view& str, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where the * array str is found in this string within the character range [pos,pos+n). 
@@ -248,10 +248,10 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type rfind(const char* str, - size_type bytes, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type rfind(const char* str, + size_type bytes, + size_type pos = 0, + size_type count = -1) const; /** * @brief Returns the character position of the last occurrence where * character is found in this string within the character range [pos,pos+n). @@ -262,9 +262,9 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. */ - CUDA_DEVICE_CALLABLE size_type rfind(char_utf8 character, - size_type pos = 0, - size_type count = -1) const; + __device__ inline size_type rfind(char_utf8 character, + size_type pos = 0, + size_type count = -1) const; /** * @brief Return a sub-string of this string. The original string and device @@ -274,7 +274,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const; + __device__ inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -284,7 +284,7 @@ class string_view { * * @return An empty string */ - CUDA_HOST_DEVICE_CALLABLE static string_view min(); + CUDF_HOST_DEVICE inline static string_view min(); /** * @brief Return maximum value associated with the string type @@ -295,12 +295,12 @@ class string_view { * @return A string value which represents the highest possible valid UTF-8 encoded * character. */ - CUDA_HOST_DEVICE_CALLABLE static string_view max(); + CUDF_HOST_DEVICE inline static string_view max(); /** * @brief Default constructor represents an empty string. 
 */ - CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0) {} + CUDF_HOST_DEVICE inline string_view() : _data(""), _bytes(0), _length(0) {} /** * @brief Create instance from existing device char array. @@ -308,7 +308,7 @@ class string_view { * @param data Device char array encoded in UTF8. * @param bytes Number of bytes in data array. */ - CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes) + CUDF_HOST_DEVICE inline string_view(const char* data, size_type bytes) : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH) { } @@ -330,7 +330,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - CUDA_DEVICE_CALLABLE size_type character_offset(size_type bytepos) const; + __device__ inline size_type character_offset(size_type bytepos) const; }; namespace strings { @@ -386,7 +386,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& character) +CUDF_HOST_DEVICE inline size_type to_char_utf8(const char* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str)); @@ -413,7 +413,7 @@ CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& cha * @param[out] str Allocated char array with enough space to hold the encoded character.
 * @return The number of bytes in the character */ -CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 13d5f8e06bc..459a4182aa0 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -17,19 +17,9 @@ #pragma once #ifdef __CUDACC__ -#define CUDA_HOST_DEVICE_CALLABLE __host__ __device__ inline -#define CUDA_DEVICE_CALLABLE __device__ inline - -// This version of the macro maximizes the chances of inlining when applied to -// a callable that is called on the GPU. -#define CUDF_HDFI __host__ __device__ __forceinline__ -#define CUDF_DFI __device__ __forceinline__ +#define CUDF_HOST_DEVICE __host__ __device__ #else -#define CUDA_HOST_DEVICE_CALLABLE inline -#define CUDA_DEVICE_CALLABLE inline - -#define CUDF_HDFI inline -#define CUDF_DFI inline +#define CUDF_HOST_DEVICE #endif #include diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp index cbd09fa7b0d..f4a70463de3 100644 --- a/cpp/include/cudf/utilities/bit.hpp +++ b/cpp/include/cudf/utilities/bit.hpp @@ -42,7 +42,7 @@ namespace detail { #endif template <typename T> -constexpr CUDA_HOST_DEVICE_CALLABLE std::size_t size_in_bits() +constexpr CUDF_HOST_DEVICE inline std::size_t size_in_bits() { static_assert(CHAR_BIT == 8, "Size of a byte must be 8 bits."); return sizeof(T) * CHAR_BIT; @@ -58,7 +58,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE std::size_t size_in_bits() /** * @brief Returns the index of the word containing the specified bit.
 */ -constexpr CUDA_HOST_DEVICE_CALLABLE size_type word_index(size_type bit_index) +constexpr CUDF_HOST_DEVICE inline size_type word_index(size_type bit_index) { return bit_index / detail::size_in_bits<bitmask_type>(); } @@ -66,7 +66,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE size_type word_index(size_type bit_index) /** * @brief Returns the position within a word of the specified bit. */ -constexpr CUDA_HOST_DEVICE_CALLABLE size_type intra_word_index(size_type bit_index) +constexpr CUDF_HOST_DEVICE inline size_type intra_word_index(size_type bit_index) { return bit_index % detail::size_in_bits<bitmask_type>(); } @@ -80,7 +80,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE size_type intra_word_index(size_type bit_ind * @param bitmask The bitmask containing the bit to set * @param bit_index Index of the bit to set */ -CUDA_HOST_DEVICE_CALLABLE void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index) +CUDF_HOST_DEVICE inline void set_bit_unsafe(bitmask_type* bitmask, size_type bit_index) { assert(nullptr != bitmask); bitmask[word_index(bit_index)] |= (bitmask_type{1} << intra_word_index(bit_index)); @@ -95,7 +95,7 @@ CUDA_HOST_DEVICE_CALLABLE void set_bit_unsafe(bitmask_type* bitmask, size_type b * * @param bitmask The bitmask containing the bit to clear * @param bit_index The index of the bit to clear */ -CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index) +CUDF_HOST_DEVICE inline void clear_bit_unsafe(bitmask_type* bitmask, size_type bit_index) { assert(nullptr != bitmask); bitmask[word_index(bit_index)] &= ~(bitmask_type{1} << intra_word_index(bit_index)); @@ -109,7 +109,7 @@ CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type * * @return true The specified bit is `1` * @return false The specified bit is `0` */ -CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type bit_index) +CUDF_HOST_DEVICE inline bool bit_is_set(bitmask_type const* bitmask, size_type bit_index) { assert(nullptr != bitmask); return
 bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index)); @@ -125,9 +125,9 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type * @return false The specified bit is `0` * @return `default_value` if `bitmask` is nullptr */ -CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, - size_type bit_index, - bool default_value) +CUDF_HOST_DEVICE inline bool bit_value_or(bitmask_type const* bitmask, + size_type bit_index, + bool default_value) { return bitmask != nullptr ? bit_is_set(bitmask, bit_index) : default_value; } @@ -140,7 +140,7 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, * * @param n The number of least significant bits to set * @return A bitmask word with `n` least significant bits set */ -constexpr CUDA_HOST_DEVICE_CALLABLE bitmask_type set_least_significant_bits(size_type n) +constexpr CUDF_HOST_DEVICE inline bitmask_type set_least_significant_bits(size_type n) { constexpr_assert(0 <= n && n < static_cast<size_type>(detail::size_in_bits<bitmask_type>())); return ((bitmask_type{1} << n) - 1); @@ -154,7 +154,7 @@ constexpr CUDA_HOST_DEVICE_CALLABLE bitmask_type set_least_significant_bits(size * * @param n The number of most significant bits to set * @return A bitmask word with `n` most significant bits set */ -constexpr CUDA_HOST_DEVICE_CALLABLE bitmask_type set_most_significant_bits(size_type n) +constexpr CUDF_HOST_DEVICE inline bitmask_type set_most_significant_bits(size_type n) { constexpr size_type word_size{detail::size_in_bits<bitmask_type>()}; constexpr_assert(0 <= n && n < word_size); diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index d7d38aba4f3..0c6a6ee244c 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -420,7 +420,9 @@ using scalar_device_type_t = typename type_to_scalar_type_impl<T>::ScalarDeviceT template