diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c686cd0fd39..c246eb3b266 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -63,6 +63,7 @@
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import DataFrameResampler
 from cudf.core.series import Series
+from cudf.core.udf.row_function import _get_row_kernel
 from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
@@ -3926,10 +3927,8 @@ def apply(
             raise ValueError("The `raw` kwarg is not yet supported.")
         if result_type is not None:
             raise ValueError("The `result_type` kwarg is not yet supported.")
-        if kwargs:
-            raise ValueError("UDFs using **kwargs are not yet supported.")
 
-        return self._apply(func, *args)
+        return self._apply(func, _get_row_kernel, *args, **kwargs)
 
     @applyutils.doc_apply()
     def apply_rows(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 69dc5389e7a..891f58657b0 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -45,7 +45,6 @@
 )
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.join import Merge, MergeSemi
-from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame
 from cudf.core.window import Rolling
 from cudf.utils import ioutils
 from cudf.utils.docutils import copy_docstring
@@ -1367,39 +1366,6 @@ def _quantiles(
         result._copy_type_metadata(self)
         return result
 
-    @annotate("APPLY", color="purple", domain="cudf_python")
-    def _apply(self, func, *args):
-        """
-        Apply `func` across the rows of the frame.
-        """
-        kernel, retty = compile_or_get(self, func, args)
-
-        # Mask and data column preallocated
-        ans_col = cupy.empty(len(self), dtype=retty)
-        ans_mask = cudf.core.column.column_empty(len(self), dtype="bool")
-        launch_args = [(ans_col, ans_mask), len(self)]
-        offsets = []
-
-        # if compile_or_get succeeds, it is safe to create a kernel that only
-        # consumes the columns that are of supported dtype
-        for col in supported_cols_from_frame(self).values():
-            data = col.data
-            mask = col.mask
-            if mask is None:
-                launch_args.append(data)
-            else:
-                launch_args.append((data, mask))
-            offsets.append(col.offset)
-        launch_args += offsets
-        launch_args += list(args)
-        kernel.forall(len(self))(*launch_args)
-
-        col = as_column(ans_col)
-        col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask))
-        result = cudf.Series._from_data({None: col}, self._index)
-
-        return result
-
     def rank(
         self,
         axis=0,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 9b42aca00d0..59040e3ecbb 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -24,11 +24,12 @@
     is_integer_dtype,
     is_list_like,
 )
-from cudf.core.column import arange
+from cudf.core.column import arange, as_column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import Index, RangeIndex, _index_from_columns
 from cudf.core.multiindex import MultiIndex
+from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame
 from cudf.utils.utils import cached_property
 
 doc_reset_index_template = """
@@ -756,6 +757,51 @@ def add_suffix(self, suffix):
                 Use `Series.add_suffix` or `DataFrame.add_suffix`"
         )
 
+    @annotate("APPLY", color="purple", domain="cudf_python")
+    def _apply(self, func, kernel_getter, *args, **kwargs):
+        """Apply `func` across the rows of the frame."""
+        if kwargs:
+            raise ValueError("UDFs using **kwargs are not yet supported.")
+
+        try:
+            kernel, retty = _compile_or_get(
+                self, func, args, kernel_getter=kernel_getter
+            )
+        except Exception as e:
+            raise ValueError(
+                "user defined function compilation failed."
+            ) from e
+
+        # Preallocate the output values and one validity bool per row
+        ans_col = cp.empty(len(self), dtype=retty)
+        ans_mask = cudf.core.column.column_empty(len(self), dtype="bool")
+        launch_args = [(ans_col, ans_mask), len(self)]
+        offsets = []
+
+        # _compile_or_get succeeded, so the kernel only takes the supported
+        # columns: each is passed as data, or (data, mask) if it has a mask
+        for col in _supported_cols_from_frame(self).values():
+            data = col.data
+            mask = col.mask
+            if mask is None:
+                launch_args.append(data)
+            else:
+                launch_args.append((data, mask))
+            offsets.append(col.offset)
+        launch_args += offsets
+        launch_args += list(args)
+
+        try:
+            kernel.forall(len(self))(*launch_args)
+        except Exception as e:
+            raise RuntimeError("UDF kernel execution failed.") from e
+
+        col = as_column(ans_col)
+        col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask))
+        result = cudf.Series._from_data({None: col}, self._index)
+
+        return result
+
     def sort_values(
         self,
         by,
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 0371c40274f..61975d47af2 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -14,7 +14,6 @@
 import cupy
 import numpy as np
 import pandas as pd
-from numba import cuda
 from pandas._config import get_option
 
 import cudf
@@ -67,6 +66,7 @@
     doc_reset_index_template,
 )
 from cudf.core.single_column_frame import SingleColumnFrame
+from cudf.core.udf.scalar_function import _get_scalar_kernel
 from cudf.utils import cudautils, docutils
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
@@ -2374,7 +2374,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
             by numba based on the function logic and argument types.
             See examples for details.
         args : tuple
-            Not supported
+            Positional arguments passed to func after the series value.
         **kwargs
             Not supported
 
@@ -2440,20 +2440,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
         2     4.5
         dtype: float64
         """
-        if args or kwargs:
-            raise ValueError(
-                "UDFs using *args or **kwargs are not yet supported."
-            )
-
-        # these functions are generally written as functions of scalar
-        # values rather than rows. Rather than writing an entirely separate
-        # numba kernel that is not built around a row object, its simpler
-        # to just turn this into the equivalent single column dataframe case
-        name = self.name or "__temp_srname"
-        df = cudf.DataFrame({name: self})
-        f_ = cuda.jit(device=True)(func)
-
-        return df.apply(lambda row: f_(row[name]))
+        if convert_dtype is not True:
+            raise ValueError("Series.apply only supports convert_dtype=True")
+        return self._apply(func, _get_scalar_kernel, *args, **kwargs)
 
     def applymap(self, udf, out_dtype=None):
         """Apply an elementwise function to transform the values in the Column.
diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py
deleted file mode 100644
index 8e798de3bfe..00000000000
--- a/python/cudf/cudf/core/udf/pipeline.py
+++ /dev/null
@@ -1,390 +0,0 @@
-import math
-from typing import Callable
-
-import cachetools
-import numpy as np
-from numba import cuda, typeof
-from numba.np import numpy_support
-from numba.types import Poison, Record, Tuple, boolean, int64, void
-from nvtx import annotate
-
-from cudf.core.dtypes import CategoricalDtype
-from cudf.core.udf.api import Masked, pack_return
-from cudf.core.udf.typing import MaskedType
-from cudf.utils import cudautils
-from cudf.utils.dtypes import (
-    BOOL_TYPES,
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    TIMEDELTA_TYPES,
-)
-
-libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32"))
-MASK_BITSIZE = np.dtype("int32").itemsize * 8
-precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
-
-JIT_SUPPORTED_TYPES = (
-    NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES
-)
-
-
-def _is_jit_supported_type(dtype):
-    # category dtype isn't hashable
-    if isinstance(dtype, CategoricalDtype):
-        return False
-    return str(dtype) in JIT_SUPPORTED_TYPES
-
-
-def all_dtypes_from_frame(frame):
-    return {
-        colname: col.dtype
-        if _is_jit_supported_type(col.dtype)
-        else np.dtype("O")
-        for colname, col in frame._data.items()
-    }
-
-
-def supported_dtypes_from_frame(frame):
-    return {
-        colname: col.dtype
-        for colname, col in frame._data.items()
-        if _is_jit_supported_type(col.dtype)
-    }
-
-
-def supported_cols_from_frame(frame):
-    return {
-        colname: col
-        for colname, col in frame._data.items()
-        if _is_jit_supported_type(col.dtype)
-    }
-
-
-def generate_cache_key(frame, func: Callable):
-    """Create a cache key that uniquely identifies a compilation.
-
-    A new compilation is needed any time any of the following things change:
-    - The UDF itself as defined in python by the user
-    - The types of the columns utilized by the UDF
-    - The existence of the input columns masks
-    """
-    return (
-        *cudautils.make_cache_key(
-            func, tuple(all_dtypes_from_frame(frame).values())
-        ),
-        *(col.mask is None for col in frame._data.values()),
-        *frame._data.keys(),
-    )
-
-
-def get_frame_row_type(dtype):
-    """
-    Get the numba `Record` type corresponding to a frame.
-    Models each column and its mask as a MaskedType and
-    models the row as a dictionary like data structure
-    containing these MaskedTypes.
-
-    Large parts of this function are copied with comments
-    from the Numba internals and slightly modified to
-    account for validity bools to be present in the final
-    struct.
-    """
-
-    # Create the numpy structured type corresponding to the numpy dtype.
-
-    fields = []
-    offset = 0
-
-    sizes = [val[0].itemsize for val in dtype.fields.values()]
-    for i, (name, info) in enumerate(dtype.fields.items()):
-        # *info* consists of the element dtype, its offset from the beginning
-        # of the record, and an optional "title" containing metadata.
-        # We ignore the offset in info because its value assumes no masking;
-        # instead, we compute the correct offset based on the masked type.
-        elemdtype = info[0]
-        title = info[2] if len(info) == 3 else None
-        ty = numpy_support.from_dtype(elemdtype)
-        infos = {
-            "type": MaskedType(ty),
-            "offset": offset,
-            "title": title,
-        }
-        fields.append((name, infos))
-
-        # increment offset by itemsize plus one byte for validity
-        offset += elemdtype.itemsize + 1
-
-        # Align the next member of the struct to be a multiple of the
-        # memory access size, per PTX ISA 7.4/5.4.5
-        if i < len(sizes) - 1:
-            next_itemsize = sizes[i + 1]
-            offset = int(math.ceil(offset / next_itemsize) * next_itemsize)
-
-    # Numba requires that structures are aligned for the CUDA target
-    _is_aligned_struct = True
-    return Record(fields, offset, _is_aligned_struct)
-
-
-@annotate("NUMBA JIT", color="green", domain="cudf_python")
-def get_udf_return_type(frame, func: Callable, args=()):
-
-    """
-    Get the return type of a masked UDF for a given set of argument dtypes. It
-    is assumed that the function consumes a dictionary whose keys are strings
-    and whose values are of MaskedType. Initially assume that the UDF may be
-    written to utilize any field in the row - including those containing an
-    unsupported dtype. If an unsupported dtype is actually used in the function
-    the compilation should fail at `compile_udf`. If compilation succeeds, one
-    can infer that the function does not use any of the columns of unsupported
-    dtype - meaning we can drop them going forward and the UDF will still end
-    up getting fed rows containing all the fields it actually needs to use to
-    compute the answer for that row.
-    """
-
-    # present a row containing all fields to the UDF and try and compile
-    row_type = get_frame_row_type(
-        np.dtype(list(all_dtypes_from_frame(frame).items()))
-    )
-    compile_sig = (row_type, *(typeof(arg) for arg in args))
-
-    # Get the return type. The PTX is also returned by compile_udf, but is not
-    # needed here.
-    ptx, output_type = cudautils.compile_udf(func, compile_sig)
-    if not isinstance(output_type, MaskedType):
-        numba_output_type = numpy_support.from_dtype(np.dtype(output_type))
-    else:
-        numba_output_type = output_type
-
-    return (
-        numba_output_type
-        if not isinstance(numba_output_type, MaskedType)
-        else numba_output_type.value_type
-    )
-
-
-def masked_array_type_from_col(col):
-    """
-    Return a type representing a tuple of arrays,
-    the first element an array of the numba type
-    corresponding to `dtype`, and the second an
-    array of bools representing a mask.
-    """
-    nb_scalar_ty = numpy_support.from_dtype(col.dtype)
-    if col.mask is None:
-        return nb_scalar_ty[::1]
-    else:
-        return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1]))
-
-
-def construct_signature(frame, return_type, args):
-    """
-    Build the signature of numba types that will be used to
-    actually JIT the kernel itself later, accounting for types
-    and offsets. Skips columns with unsupported dtypes.
-    """
-
-    # Tuple of arrays, first the output data array, then the mask
-    return_type = Tuple((return_type[::1], boolean[::1]))
-    offsets = []
-    sig = [return_type, int64]
-    for col in supported_cols_from_frame(frame).values():
-        sig.append(masked_array_type_from_col(col))
-        offsets.append(int64)
-
-    # return_type, size, data, masks, offsets, extra args
-    sig = void(*(sig + offsets + [typeof(arg) for arg in args]))
-
-    return sig
-
-
-@cuda.jit(device=True)
-def mask_get(mask, pos):
-    return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1
-
-
-kernel_template = """\
-def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}):
-    i = cuda.grid(1)
-    ret_data_arr, ret_mask_arr = retval
-    if i < size:
-        # Create a structured array with the desired fields
-        rows = cuda.local.array(1, dtype=row_type)
-
-        # one element of that array
-        row = rows[0]
-
-{masked_input_initializers}
-{row_initializers}
-
-        # pass the assembled row into the udf
-        ret = f_(row, {extra_args})
-
-        # pack up the return values and set them
-        ret_masked = pack_return(ret)
-        ret_data_arr[i] = ret_masked.value
-        ret_mask_arr[i] = ret_masked.valid
-"""
-
-unmasked_input_initializer_template = """\
-        d_{idx} = input_col_{idx}
-        masked_{idx} = Masked(d_{idx}[i], True)
-"""
-
-masked_input_initializer_template = """\
-        d_{idx}, m_{idx} = input_col_{idx}
-        masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx}))
-"""
-
-row_initializer_template = """\
-        row["{name}"] = masked_{idx}
-"""
-
-
-def _define_function(frame, row_type, args):
-    """
-    The kernel we want to JIT compile looks something like the following,
-    which is an example for two columns that both have nulls present
-
-    def _kernel(retval, input_col_0, input_col_1, offset_0, offset_1, size):
-        i = cuda.grid(1)
-        ret_data_arr, ret_mask_arr = retval
-        if i < size:
-            rows = cuda.local.array(1, dtype=row_type)
-            row = rows[0]
-
-            d_0, m_0 = input_col_0
-            masked_0 = Masked(d_0[i], mask_get(m_0, i + offset_0))
-            d_1, m_1 = input_col_1
-            masked_1 = Masked(d_1[i], mask_get(m_1, i + offset_1))
-
-            row["a"] = masked_0
-            row["b"] = masked_1
-
-            ret = f_(row)
-
-            ret_masked = pack_return(ret)
-            ret_data_arr[i] = ret_masked.value
-            ret_mask_arr[i] = ret_masked.valid
-
-    However we do not always have two columns and columns do not always have
-    an associated mask. Ideally, we would just write one kernel and make use
-    of `*args` - and then one function would work for any number of columns,
-    currently numba does not support `*args` and treats functions it JITs as
-    if `*args` is a singular argument. Thus we are forced to write the right
-    functions dynamically at runtime and define them using `exec`.
-    """
-    # Create argument list for kernel
-    frame = supported_cols_from_frame(frame)
-
-    input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))])
-    input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))])
-    extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))])
-
-    # Generate the initializers for each device function argument
-    initializers = []
-    row_initializers = []
-    for i, (colname, col) in enumerate(frame.items()):
-        idx = str(i)
-        if col.mask is not None:
-            template = masked_input_initializer_template
-        else:
-            template = unmasked_input_initializer_template
-
-        initializer = template.format(idx=idx)
-
-        initializers.append(initializer)
-
-        row_initializer = row_initializer_template.format(
-            idx=idx, name=colname
-        )
-        row_initializers.append(row_initializer)
-
-    # Incorporate all of the above into the kernel code template
-    d = {
-        "input_columns": input_columns,
-        "input_offsets": input_offsets,
-        "extra_args": extra_args,
-        "masked_input_initializers": "\n".join(initializers),
-        "row_initializers": "\n".join(row_initializers),
-        "numba_rectype": row_type,  # from global
-    }
-
-    return kernel_template.format(**d)
-
-
-@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python")
-def compile_or_get(frame, func, args):
-    """
-    Return a compiled kernel in terms of MaskedTypes that launches a
-    kernel equivalent of `f` for the dtypes of `df`. The kernel uses
-    a thread for each row and calls `f` using that rows data / mask
-    to produce an output value and output validity for each row.
-
-    If the UDF has already been compiled for this requested dtypes,
-    a cached version will be returned instead of running compilation.
-
-    CUDA kernels are void and do not return values. Thus, we need to
-    preallocate a column of the correct dtype and pass it in as one of
-    the kernel arguments. This creates a chicken-and-egg problem where
-    we need the column type to compile the kernel, but normally we would
-    be getting that type FROM compiling the kernel (and letting numba
-    determine it as a return value). As a workaround, we compile the UDF
-    itself outside the final kernel to invoke a full typing pass, which
-    unfortunately is difficult to do without running full compilation.
-    we then obtain the return type from that separate compilation and
-    use it to allocate an output column of the right dtype.
-    """
-
-    # check to see if we already compiled this function
-    cache_key = generate_cache_key(frame, func)
-    if precompiled.get(cache_key) is not None:
-        kernel, masked_or_scalar = precompiled[cache_key]
-        return kernel, masked_or_scalar
-
-    # precompile the user udf to get the right return type.
-    # could be a MaskedType or a scalar type.
-    scalar_return_type = get_udf_return_type(frame, func, args)
-
-    # get_udf_return_type will throw a TypingError if the user tries to use
-    # a field in the row containing an unsupported dtype, except in the
-    # edge case where all the function does is return that element:
-
-    # def f(row):
-    #    return row[<bad dtype key>]
-    # In this case numba is happy to return MaskedType(<bad dtype key>)
-    # because it relies on not finding overloaded operators for types to raise
-    # the exception, so we have to explicitly check for that case.
-    if isinstance(scalar_return_type, Poison):
-        raise TypeError(str(scalar_return_type))
-
-    # this is the signature for the final full kernel compilation
-    sig = construct_signature(frame, scalar_return_type, args)
-
-    # this row type is used within the kernel to pack up the column and
-    # mask data into the dict like data structure the user udf expects
-    np_field_types = np.dtype(list(supported_dtypes_from_frame(frame).items()))
-    row_type = get_frame_row_type(np_field_types)
-
-    f_ = cuda.jit(device=True)(func)
-    # Dict of 'local' variables into which `_kernel` is defined
-    local_exec_context = {}
-    global_exec_context = {
-        "f_": f_,
-        "cuda": cuda,
-        "Masked": Masked,
-        "mask_get": mask_get,
-        "pack_return": pack_return,
-        "row_type": row_type,
-    }
-    exec(
-        _define_function(frame, row_type, args),
-        global_exec_context,
-        local_exec_context,
-    )
-    # The python function definition representing the kernel
-    _kernel = local_exec_context["_kernel"]
-    kernel = cuda.jit(sig)(_kernel)
-    np_return_type = numpy_support.as_dtype(scalar_return_type)
-    precompiled[cache_key] = (kernel, np_return_type)
-
-    return kernel, np_return_type
diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py
new file mode 100644
index 00000000000..5cda9fb8218
--- /dev/null
+++ b/python/cudf/cudf/core/udf/row_function.py
@@ -0,0 +1,151 @@
+import math
+
+import numpy as np
+from numba import cuda
+from numba.np import numpy_support
+from numba.types import Record
+
+from cudf.core.udf.api import Masked, pack_return
+from cudf.core.udf.templates import (
+    masked_input_initializer_template,
+    row_initializer_template,
+    row_kernel_template,
+    unmasked_input_initializer_template,
+)
+from cudf.core.udf.typing import MaskedType
+from cudf.core.udf.utils import (
+    _all_dtypes_from_frame,
+    _construct_signature,
+    _get_kernel,
+    _get_udf_return_type,
+    _mask_get,
+    _supported_cols_from_frame,
+    _supported_dtypes_from_frame,
+)
+
+
+def _get_frame_row_type(dtype):
+    """
+    Get the numba `Record` type corresponding to a frame.
+    `dtype` is a numpy structured dtype with one field per column.
+    Each field is modeled as a MaskedType, so that the row
+    behaves like a dictionary of masked values.
+
+    Large parts of this function are copied with comments
+    from the Numba internals and slightly modified to
+    account for validity bools to be present in the final
+    struct.
+
+    See numba.np.numpy_support.from_struct_dtype for details.
+    """
+
+    # Build the numba Record fields from the fields of the numpy dtype.
+
+    fields = []
+    offset = 0
+
+    sizes = [val[0].itemsize for val in dtype.fields.values()]
+    for i, (name, info) in enumerate(dtype.fields.items()):
+        # *info* consists of the element dtype, its offset from the beginning
+        # of the record, and an optional "title" containing metadata.
+        # We ignore the offset in info because its value assumes no masking;
+        # instead, we compute the correct offset based on the masked type.
+        elemdtype = info[0]
+        title = info[2] if len(info) == 3 else None
+        ty = numpy_support.from_dtype(elemdtype)
+        infos = {
+            "type": MaskedType(ty),
+            "offset": offset,
+            "title": title,
+        }
+        fields.append((name, infos))
+
+        # increment offset by itemsize plus one byte for validity
+        offset += elemdtype.itemsize + 1
+
+        # Align the next member of the struct to be a multiple of the
+        # memory access size, per PTX ISA 7.4/5.4.5
+        if i < len(sizes) - 1:
+            next_itemsize = sizes[i + 1]
+            offset = int(math.ceil(offset / next_itemsize) * next_itemsize)
+
+    # Numba requires that structures are aligned for the CUDA target
+    _is_aligned_struct = True
+    return Record(fields, offset, _is_aligned_struct)
+
+
+def _row_kernel_string_from_template(frame, row_type, args):
+    """
+    Render the source text of the numba kernel behind `DataFrame.apply`.
+    Needed as a workaround while numba lacks support for `*args` functions.
+
+    `DataFrame.apply` takes a function of a dict like row, optionally
+    followed by one or more scalar arguments, for example
+
+    def f(row, c, k):
+        return (row['x'] + c) / k
+
+    The kernel text depends on how many supported columns there are,
+    which of them carry a mask, and how many scalar arguments are
+    passed. templates.py holds the kernel template that is filled in.
+    """
+    # Only columns of a supported dtype become kernel parameters
+    columns = _supported_cols_from_frame(frame)
+    ncols, nargs = len(columns), len(args)
+
+    # One statement block per column: wrap the input as a Masked value,
+    # then store that value in the row under the column name
+    masked_blocks, row_blocks = [], []
+    for pos, (colname, col) in enumerate(columns.items()):
+        if col.mask is None:
+            wrap = unmasked_input_initializer_template
+        else:
+            wrap = masked_input_initializer_template
+        masked_blocks.append(wrap.format(idx=str(pos)))
+        row_blocks.append(
+            row_initializer_template.format(idx=str(pos), name=colname)
+        )
+
+    # Parameter names are positional: input_col_0, offset_0, extra_arg_0
+    # and so on, joined into the comma separated kernel signature
+    def names(prefix, count):
+        return ", ".join([f"{prefix}_{n}" for n in range(count)])
+
+    return row_kernel_template.format(
+        input_columns=names("input_col", ncols),
+        input_offsets=names("offset", ncols),
+        extra_args=names("extra_arg", nargs),
+        masked_input_initializers="\n".join(masked_blocks),
+        row_initializers="\n".join(row_blocks),
+        numba_rectype=row_type,
+    )
+
+
+def _get_row_kernel(frame, func, args):
+    row_type = _get_frame_row_type(
+        np.dtype(list(_all_dtypes_from_frame(frame).items()))
+    )
+    scalar_return_type = _get_udf_return_type(row_type, func, args)
+
+    # this is the signature for the final full kernel compilation
+    sig = _construct_signature(frame, scalar_return_type, args)
+
+    # rebuild the row type from the supported columns only; the kernel uses
+    # it to pack column and mask data into the dict like row the udf expects
+    np_field_types = np.dtype(
+        list(_supported_dtypes_from_frame(frame).items())
+    )
+    row_type = _get_frame_row_type(np_field_types)
+
+    # Global names made available to the generated `_kernel` source
+    global_exec_context = {
+        "cuda": cuda,
+        "Masked": Masked,
+        "_mask_get": _mask_get,
+        "pack_return": pack_return,
+        "row_type": row_type,
+    }
+    kernel_string = _row_kernel_string_from_template(frame, row_type, args)
+    kernel = _get_kernel(kernel_string, global_exec_context, sig, func)
+
+    return kernel, scalar_return_type
diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py
new file mode 100644
index 00000000000..7f3b461a1f0
--- /dev/null
+++ b/python/cudf/cudf/core/udf/scalar_function.py
@@ -0,0 +1,64 @@
+from numba import cuda
+from numba.np import numpy_support
+
+from cudf.core.udf.api import Masked, pack_return
+from cudf.core.udf.templates import (
+    masked_input_initializer_template,
+    scalar_kernel_template,
+    unmasked_input_initializer_template,
+)
+from cudf.core.udf.typing import MaskedType
+from cudf.core.udf.utils import (
+    _construct_signature,
+    _get_kernel,
+    _get_udf_return_type,
+    _mask_get,
+)
+
+
+def _scalar_kernel_string_from_template(sr, args):
+    """
+    Function to write numba kernels for `Series.apply` as a string.
+    Workaround until numba supports functions that use `*args`
+
+    `Series.apply` expects functions of a single variable and possibly
+    one or more constants, such as:
+
+    def f(x, c, k):
+        return (x + c) / k
+
+    where the `x` are meant to be the values of the series. Since there
+    can be only one column, the kernels differ only in the number of
+    extra_args and in whether the column carries a null mask. See
+    templates.py for the full kernel template.
+    """
+    extra_args = ", ".join(f"extra_arg_{i}" for i in range(len(args)))
+
+    # Check presence, not truthiness, of the mask (as row_function.py does)
+    masked_initializer = (
+        masked_input_initializer_template
+        if sr._column.mask is not None
+        else unmasked_input_initializer_template
+    ).format(idx=0)
+    return scalar_kernel_template.format(
+        extra_args=extra_args, masked_initializer=masked_initializer
+    )
+
+
+def _get_scalar_kernel(sr, func, args):
+    # Type the UDF against one masked value of the series dtype
+    masked_ty = MaskedType(numpy_support.from_dtype(sr.dtype))
+    scalar_return_type = _get_udf_return_type(masked_ty, func, args)
+
+    kernel_sig = _construct_signature(sr, scalar_return_type, args=args)
+    exec_globals = dict(
+        f_=cuda.jit(device=True)(func),
+        cuda=cuda,
+        Masked=Masked,
+        _mask_get=_mask_get,
+        pack_return=pack_return,
+    )
+    source = _scalar_kernel_string_from_template(sr, args=args)
+    kernel = _get_kernel(source, exec_globals, kernel_sig, func)
+
+    return kernel, scalar_return_type
diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py
new file mode 100644
index 00000000000..8cb11133323
--- /dev/null
+++ b/python/cudf/cudf/core/udf/templates.py
@@ -0,0 +1,52 @@
+unmasked_input_initializer_template = """\
+        d_{idx} = input_col_{idx}
+        masked_{idx} = Masked(d_{idx}[i], True)
+"""
+
+masked_input_initializer_template = """\
+        d_{idx}, m_{idx} = input_col_{idx}
+        masked_{idx} = Masked(d_{idx}[i], _mask_get(m_{idx}, i + offset_{idx}))
+"""
+
+row_initializer_template = """\
+        row["{name}"] = masked_{idx}
+"""
+
+row_kernel_template = """\
+def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}):
+    i = cuda.grid(1)
+    ret_data_arr, ret_mask_arr = retval
+    if i < size:
+        # Create a structured array with the desired fields
+        rows = cuda.local.array(1, dtype=row_type)
+
+        # one element of that array
+        row = rows[0]
+
+{masked_input_initializers}
+{row_initializers}
+
+        # pass the assembled row into the udf
+        ret = f_(row, {extra_args})
+
+        # pack up the return values and set them
+        ret_masked = pack_return(ret)
+        ret_data_arr[i] = ret_masked.value
+        ret_mask_arr[i] = ret_masked.valid
+"""
+
+scalar_kernel_template = """
+def _kernel(retval, size, input_col_0, offset_0, {extra_args}):
+    i = cuda.grid(1)
+    ret_data_arr, ret_mask_arr = retval
+
+    if i < size:
+
+{masked_initializer}
+
+        ret = f_(masked_0, {extra_args})
+
+        ret_masked = pack_return(ret)
+        ret_data_arr[i] = ret_masked.value
+        ret_mask_arr[i] = ret_masked.valid
+"""
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
new file mode 100644
index 00000000000..a98ee40274e
--- /dev/null
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -0,0 +1,216 @@
+from typing import Callable
+
+import cachetools
+import numpy as np
+from numba import cuda, typeof
+from numba.core.errors import TypingError
+from numba.np import numpy_support
+from numba.types import Poison, Tuple, boolean, int64, void
+from nvtx import annotate
+
+from cudf.core.dtypes import CategoricalDtype
+from cudf.core.udf.typing import MaskedType
+from cudf.utils import cudautils
+from cudf.utils.dtypes import (
+    BOOL_TYPES,
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    TIMEDELTA_TYPES,
+)
+
+JIT_SUPPORTED_TYPES = (
+    NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES
+)
+# Bitmask words are typed as int32 here, so each word holds 32 validity bits
+libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32"))
+MASK_BITSIZE = np.dtype("int32").itemsize * 8
+# LRU cache (max 32 entries): compilation key -> (kernel, return dtype)
+precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
+
+
+@annotate("NUMBA JIT", color="green", domain="cudf_python")
+def _get_udf_return_type(argty, func: Callable, args=()):
+    """
+    Get the return type of a masked UDF for a given set of argument dtypes. It
+    is assumed that the function consumes a dictionary whose keys are strings
+    and whose values are of MaskedType. Initially assume that the UDF may be
+    written to utilize any field in the row - including those containing an
+    unsupported dtype. If an unsupported dtype is actually used in the function
+    the compilation should fail at `compile_udf`. If compilation succeeds, one
+    can infer that the function does not use any of the columns of unsupported
+    dtype - meaning we can drop them going forward and the UDF will still end
+    up getting fed rows containing all the fields it actually needs to use to
+    compute the answer for that row.
+    """
+
+    # present a row containing all fields to the UDF and try to compile
+    compile_sig = (argty, *(typeof(arg) for arg in args))
+
+    # Get the return type. The PTX is also returned by compile_udf, but is not
+    # needed here.
+    ptx, output_type = cudautils.compile_udf(func, compile_sig)
+    if not isinstance(output_type, MaskedType):
+        numba_output_type = numpy_support.from_dtype(np.dtype(output_type))
+    else:
+        numba_output_type = output_type
+
+    result = (
+        numba_output_type
+        if not isinstance(numba_output_type, MaskedType)
+        else numba_output_type.value_type  # unwrap to the scalar type
+    )
+
+    # Compilation raises a TypingError if the UDF uses a field in the row
+    # that has an unsupported dtype, except in the edge case where all the
+    # function does is return that element:
+    #
+    # def f(row):
+    #    return row[<bad dtype key>]
+    #
+    # Numba only raises when it fails to find an overloaded operator for a
+    # type, so here it happily returns MaskedType(<bad dtype>); check for it.
+    if isinstance(result, Poison):
+        raise TypingError(str(result))
+
+    return result
+
+
+def _is_jit_supported_type(dtype):
+    """Return True when `dtype` is listed in JIT_SUPPORTED_TYPES."""
+    # category dtype isn't hashable, so rule it out before the lookup
+    not_categorical = not isinstance(dtype, CategoricalDtype)
+    return not_categorical and str(dtype) in JIT_SUPPORTED_TYPES
+
+
+def _all_dtypes_from_frame(frame):
+    """Map each column name to its dtype, or object dtype if unsupported."""
+    placeholder = np.dtype("O")
+    return {
+        name: col.dtype if _is_jit_supported_type(col.dtype) else placeholder
+        for name, col in frame._data.items()
+    }
+
+
+def _supported_dtypes_from_frame(frame):
+    """Map the name of each JIT-supported column to its dtype."""
+    return {
+        name: col.dtype
+        for name, col in _supported_cols_from_frame(frame).items()
+    }
+
+
+def _supported_cols_from_frame(frame):
+    """Return the columns of `frame` that have supported dtypes, by name."""
+    columns = frame._data.items()
+    return {
+        name: col for name, col in columns if _is_jit_supported_type(col.dtype)
+    }
+
+
+def _masked_array_type_from_col(col):
+    """
+    Return the numba type used to pass `col` into a kernel. A column
+    without a mask maps to a 1D contiguous array of its scalar type;
+    otherwise it maps to a tuple of that data array type and a 1D
+    contiguous array of bitmask words (`libcudf_bitmask_type`).
+    """
+    nb_scalar_ty = numpy_support.from_dtype(col.dtype)
+    if col.mask is None:
+        return nb_scalar_ty[::1]
+    else:
+        return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1]))
+
+
+def _construct_signature(frame, return_type, args):
+    """
+    Build the signature of numba types that will be used to
+    actually JIT the kernel itself later, accounting for types
+    and offsets. Skips columns with unsupported dtypes.
+    """
+
+    # Output is a tuple of arrays: the data array, then a boolean mask array
+    return_type = Tuple((return_type[::1], boolean[::1]))
+    offsets = []
+    sig = [return_type, int64]  # int64 types the `size` argument
+    for col in _supported_cols_from_frame(frame).values():
+        sig.append(_masked_array_type_from_col(col))
+        offsets.append(int64)  # one offset argument per supported column
+
+    # order: retval, size, per-column data (and masks), offsets, extra args
+    sig = void(*(sig + offsets + [typeof(arg) for arg in args]))
+
+    return sig
+
+
+@cuda.jit(device=True)
+def _mask_get(mask, pos):
+    """Return bit `pos` of the bitmask `mask` (1 if set, 0 otherwise)."""
+    return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1
+
+
+def _generate_cache_key(frame, func: Callable):
+    """Create a cache key that uniquely identifies a compilation.
+
+    A new compilation is needed any time any of the following things change:
+    - The UDF itself as defined in python by the user
+    - The names or dtypes of the columns (unsupported dtypes map to object)
+    - The existence of the input columns' masks
+    """
+    return (
+        *cudautils.make_cache_key(
+            func, tuple(_all_dtypes_from_frame(frame).values())
+        ),
+        *(col.mask is None for col in frame._data.values()),
+        *frame._data.keys(),
+    )
+
+
+@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python")
+def _compile_or_get(frame, func, args, kernel_getter=None):
+    """
+    Return a compiled kernel in terms of MaskedTypes that launches a
+    kernel equivalent of `func` for the dtypes of `frame`. The kernel uses
+    a thread for each row and calls `func` using that row's data / mask
+    to produce an output value and output validity for each row.
+
+    If the UDF has already been compiled for the requested dtypes and
+    extra argument types, the cached version is returned instead.
+
+    CUDA kernels are void and do not return values. Thus, we need to
+    preallocate a column of the correct dtype and pass it in as one of
+    the kernel arguments. This creates a chicken-and-egg problem where
+    we need the column type to compile the kernel, but normally we would
+    be getting that type FROM compiling the kernel (and letting numba
+    determine it as a return value). As a workaround, we compile the UDF
+    itself outside the final kernel to invoke a full typing pass, which
+    unfortunately is difficult to do without running full compilation.
+    We then obtain the return type from that separate compilation and
+    use it to allocate an output column of the right dtype.
+    """
+
+    # The kernel is jitted for the concrete types of `args` as well, so
+    # they must be part of the key or a stale kernel could be reused.
+    cache_key = (*_generate_cache_key(frame, func), *map(typeof, args))
+    cached = precompiled.get(cache_key)
+    if cached is not None:
+        return cached
+
+    # `kernel_getter` compiles the kernel and reports the UDF's scalar
+    # return type, which is stored as a numpy dtype alongside the kernel.
+    kernel, scalar_return_type = kernel_getter(frame, func, args)
+    np_return_type = numpy_support.as_dtype(scalar_return_type)
+
+    precompiled[cache_key] = (kernel, np_return_type)
+
+    return kernel, np_return_type
+
+
+def _get_kernel(kernel_string, globals_, sig, func):
+    """Compile the templated `_kernel` source around `func` with `sig`."""
+    f_ = cuda.jit(device=True)(func)  # the UDF as a device function
+    globals_["f_"] = f_  # the template source calls the UDF as `f_`
+    exec(kernel_string, globals_)  # defines `_kernel` inside `globals_`
+    _kernel = globals_["_kernel"]
+    kernel = cuda.jit(sig)(_kernel)
+
+    return kernel
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 7f2925f2f06..56090c8eacf 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -13,7 +13,7 @@
     comparison_ops,
     unary_ops,
 )
-from cudf.core.udf.pipeline import precompiled
+from cudf.core.udf.utils import precompiled
 from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq
 
 
@@ -486,7 +486,7 @@ def outer(row):
         {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]}
     )
 
-    with pytest.raises(AttributeError):
+    with pytest.raises(ValueError):
         gdf.apply(outer, axis=1)
 
     pdf = gdf.to_pandas(nullable=True)
@@ -539,7 +539,7 @@ def func(row):
         return row["unsupported_col"]
 
     # check that we fail when an unsupported type is used within a function
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         data.apply(func, axis=1)
 
     # also check that a DF containing unsupported dtypes can still run a
@@ -596,6 +596,44 @@ def func(row, c, k):
     run_masked_udf_test(func, data, args=(1, 2), check_dtype=False)
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, cudf.NA, 3],
+        [0.5, 2.0, cudf.NA, cudf.NA, 5.0],
+        [True, False, cudf.NA],
+    ],
+)
+@pytest.mark.parametrize("op", arith_ops + comparison_ops)
+def test_mask_udf_scalar_args_binops_series(data, op):
+    data = cudf.Series(data)
+
+    def func(x, c):  # closes over the parametrized `op`
+        return op(x, c)
+
+    run_masked_udf_series(func, data, args=(1,), check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, cudf.NA, 3],
+        [0.5, 2.0, cudf.NA, cudf.NA, 5.0],
+        [True, False, cudf.NA],
+    ],
+)
+@pytest.mark.parametrize("op", arith_ops + comparison_ops)
+def test_masked_udf_scalar_args_binops_multiple_series(data, op):
+    """Chain the parametrized op over a Series and two scalar args."""
+    series = cudf.Series(data)
+
+    def func(x, c, k):
+        first = op(x, c)
+        return op(first, k)
+
+    run_masked_udf_series(func, series, args=(1, 2), check_dtype=False)
+
+
 def test_masked_udf_caching():
     # Make sure similar functions that differ
     # by simple things like constants actually