From a7bbfb429fc522805668f3cf795c46ba24891640 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 24 Feb 2021 04:06:28 -0800
Subject: [PATCH 001/132] just debugging info

---
 cpp/src/jit/launcher.cpp        |  2 ++
 cpp/src/transform/transform.cpp | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/cpp/src/jit/launcher.cpp b/cpp/src/jit/launcher.cpp
index 2ddcac7d5ba..a19b6a55503 100644
--- a/cpp/src/jit/launcher.cpp
+++ b/cpp/src/jit/launcher.cpp
@@ -35,6 +35,8 @@ launcher::launcher(const std::string& hash,
                    rmm::cuda_stream_view stream)
   : cache_instance{cudf::jit::cudfJitCache::Instance()}, stream(stream)
 {
+  std::cout << "launcher::launcher cuda_source:" << std::endl;
+  std::cout << cuda_source << std::endl;
   program = cache_instance.getProgram(
     hash, cuda_source.c_str(), header_names, compiler_flags, file_callback);
 }
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 6da0f78687b..f7fc7f832c4 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -59,11 +59,23 @@ void unary_operation(mutable_column_view output,
 {
   std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(udf));
 
+  std::cout << "The program's hash is:" << std::endl;
+  std::cout << hash << std::endl;
+
+  std::cout << "the actual udf string is: " << std::endl;
+  std::cout << udf << std::endl;
+
+
+  std::cout << "cuda_source is:" << std::endl;
   std::string cuda_source = code::kernel_header;
+  std::cout << cuda_source << std::endl;
+
   if (is_ptx) {
     cuda_source += cudf::jit::parse_single_function_ptx(
                      udf, "GENERIC_UNARY_OP", cudf::jit::get_type_name(output_type), {0}) +
                    code::kernel;
+    std::cout << "cuda_source after is_ptx condition: " << std::endl;
+    std::cout << cuda_source << std::endl;
   } else {
     cuda_source += cudf::jit::parse_single_function_cuda(udf, "GENERIC_UNARY_OP") + code::kernel;
   }

From 193f8e06677df7d4d5b2b70cb347db80b575b402 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 7 Mar 2021 12:18:18 -0800
Subject: [PATCH 002/132] initial python MaskedType

---
 python/cudf/cudf/core/__init__.py |   1 +
 python/cudf/cudf/core/udf.py      | 115 ++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 python/cudf/cudf/core/udf.py

diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py
index 91a369c31f8..22e8027b0ab 100644
--- a/python/cudf/cudf/core/__init__.py
+++ b/python/cudf/cudf/core/__init__.py
@@ -24,3 +24,4 @@
 from cudf.core.multiindex import MultiIndex
 from cudf.core.scalar import NA, Scalar
 from cudf.core.series import Series
+import cudf.core.udf
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
new file mode 100644
index 00000000000..a1660abe07b
--- /dev/null
+++ b/python/cudf/cudf/core/udf.py
@@ -0,0 +1,115 @@
+import numba
+from numba import cuda
+from numba import njit
+
+class Masked(object):
+    def __init__(self, value, valid):
+        self.value = value
+        self.valid = valid
+
+from numba.extending import types
+
+class MaskedType(types.Type):
+    def __init__(self):
+        super().__init__(name="Masked")
+
+numba_masked = MaskedType() # name this something more natural - GM
+
+from numba.core.extending import typeof_impl
+@typeof_impl.register(Masked)
+def typeof_masked(val, c):
+    # This can be dynamic and be based off the specifics of the instance
+    # which is why val is a parameter
+    # c is a typeof context and doesn't matter for this
+    return numba_masked
+
+# types.number -  more general numeric values
+from numba.core.extending import type_callable
+@type_callable(Masked)
+def type_masked(context):
+    def typer(value, valid):
+        if isinstance(value, types.Integer) and isinstance(valid, types.Boolean):
+            return numba_masked
+    return typer
+
+# {i8, ... }
+from numba.core.extending import models, register_model
+@register_model(MaskedType)
+class MaskedModel(models.StructModel):
+    def __init__(self, dmm, fe_type):
+        members = [
+            ('value', types.int64),
+            ('valid', types.bool_)
+        ]
+        models.StructModel.__init__(self, dmm, fe_type, members)
+
+from numba.core.extending import lower_builtin
+from numba.core import cgutils
+
+@lower_builtin(Masked, types.int64, types.bool_)
+def impl_masked_constructor(context, builder, sig, args):
+    
+    print("beginning:")
+    print(builder.module)
+    
+    typ = sig.return_type
+    value, valid = args    
+    
+    masked = cgutils.create_struct_proxy(typ)(context, builder)
+    masked.value = value
+    masked.valid = valid
+    print(masked._getvalue())
+    
+    print("end:")
+    print(builder.module)
+    return masked._getvalue() # return a pointer to the struct I created
+
+@njit
+def create_masked(value, valid):
+    Masked(value, valid)
+
+import numpy as np
+create_masked(1, True)
+
+from numba.core.typing import signature
+
+from numba.core.typing.templates import AbstractTemplate
+from numba.cuda.cudadecl import registry as cuda_registry
+
+import operator
+
+@cuda_registry.register_global(operator.add)
+class MaskedScalarAdd(AbstractTemplate):
+    # abstracttemplate vs concretetemplate
+    def generic(self, args, kws):
+        if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
+            return signature(numba_masked, numba_masked, numba_masked)
+
+from numba.cuda.cudaimpl import lower as cuda_lower
+
+@cuda_lower(operator.add, MaskedType, MaskedType)
+def masked_scalar_add_impl(context, builder, sig, args):
+    # get the types from the signature
+
+    masked_type_1, masked_type_2 = sig.args
+    masked_return_type = sig.return_type
+    
+    
+    # create LLVM IR structs
+    m1 = cgutils.create_struct_proxy(masked_type_1)(context, builder, value=args[0])
+    m2 = cgutils.create_struct_proxy(masked_type_2)(context, builder, value=args[1])
+    result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
+
+    valid = builder.or_(m1.valid, m2.valid)
+    result.valid = valid
+    with builder.if_then(valid):
+        result.value = builder.add(m1.value, m2.value)
+        
+    return result._getvalue()
+
+@cuda.jit(numba_masked(numba_masked, numba_masked), device=True)
+def masked_add_py(m1, m2):
+    return m1 + m2
+
+def masked_add_py_2(m1, m2):
+    return m1 + m2

From 91ae6a3d2ea71494075648398b09f75dc8f3044b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 7 Mar 2021 12:21:52 -0800
Subject: [PATCH 003/132] a little cleanup

---
 python/cudf/cudf/core/udf.py | 88 ++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 49 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index a1660abe07b..0caffb00768 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -1,82 +1,68 @@
+import operator
+
 import numba
-from numba import cuda
-from numba import njit
+import numpy as np
+from numba import cuda, njit
+from numba.core import cgutils
+from numba.core.extending import (
+    lower_builtin,
+    models,
+    register_model,
+    type_callable,
+    typeof_impl,
+)
+from numba.core.typing import signature
+from numba.core.typing.templates import AbstractTemplate
+from numba.cuda.cudadecl import registry as cuda_registry
+from numba.cuda.cudaimpl import lower as cuda_lower
+from numba.extending import types
+
 
 class Masked(object):
     def __init__(self, value, valid):
         self.value = value
         self.valid = valid
 
-from numba.extending import types
 
 class MaskedType(types.Type):
     def __init__(self):
         super().__init__(name="Masked")
 
-numba_masked = MaskedType() # name this something more natural - GM
+numba_masked = MaskedType()  # name this something more natural - GM
 
-from numba.core.extending import typeof_impl
 @typeof_impl.register(Masked)
 def typeof_masked(val, c):
-    # This can be dynamic and be based off the specifics of the instance
-    # which is why val is a parameter
-    # c is a typeof context and doesn't matter for this
     return numba_masked
 
-# types.number -  more general numeric values
-from numba.core.extending import type_callable
+
 @type_callable(Masked)
 def type_masked(context):
     def typer(value, valid):
-        if isinstance(value, types.Integer) and isinstance(valid, types.Boolean):
+        if isinstance(value, types.Integer) and isinstance(
+            valid, types.Boolean
+        ):
             return numba_masked
+
     return typer
 
-# {i8, ... }
-from numba.core.extending import models, register_model
+
 @register_model(MaskedType)
 class MaskedModel(models.StructModel):
     def __init__(self, dmm, fe_type):
-        members = [
-            ('value', types.int64),
-            ('valid', types.bool_)
-        ]
+        members = [("value", types.int64), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
-from numba.core.extending import lower_builtin
-from numba.core import cgutils
 
 @lower_builtin(Masked, types.int64, types.bool_)
 def impl_masked_constructor(context, builder, sig, args):
-    
-    print("beginning:")
-    print(builder.module)
-    
     typ = sig.return_type
-    value, valid = args    
-    
+    value, valid = args
+
     masked = cgutils.create_struct_proxy(typ)(context, builder)
     masked.value = value
     masked.valid = valid
-    print(masked._getvalue())
-    
-    print("end:")
-    print(builder.module)
-    return masked._getvalue() # return a pointer to the struct I created
-
-@njit
-def create_masked(value, valid):
-    Masked(value, valid)
-
-import numpy as np
-create_masked(1, True)
+    return masked._getvalue()  # return a pointer to the struct I created
 
-from numba.core.typing import signature
-
-from numba.core.typing.templates import AbstractTemplate
-from numba.cuda.cudadecl import registry as cuda_registry
-
-import operator
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAdd(AbstractTemplate):
@@ -85,7 +71,6 @@ def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
             return signature(numba_masked, numba_masked, numba_masked)
 
-from numba.cuda.cudaimpl import lower as cuda_lower
 
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
@@ -93,23 +78,28 @@ def masked_scalar_add_impl(context, builder, sig, args):
 
     masked_type_1, masked_type_2 = sig.args
     masked_return_type = sig.return_type
-    
-    
+
     # create LLVM IR structs
-    m1 = cgutils.create_struct_proxy(masked_type_1)(context, builder, value=args[0])
-    m2 = cgutils.create_struct_proxy(masked_type_2)(context, builder, value=args[1])
+    m1 = cgutils.create_struct_proxy(masked_type_1)(
+        context, builder, value=args[0]
+    )
+    m2 = cgutils.create_struct_proxy(masked_type_2)(
+        context, builder, value=args[1]
+    )
     result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
 
     valid = builder.or_(m1.valid, m2.valid)
     result.valid = valid
     with builder.if_then(valid):
         result.value = builder.add(m1.value, m2.value)
-        
+
     return result._getvalue()
 
+
 @cuda.jit(numba_masked(numba_masked, numba_masked), device=True)
 def masked_add_py(m1, m2):
     return m1 + m2
 
+
 def masked_add_py_2(m1, m2):
     return m1 + m2

From a855a6f453c311d3ee5464de37a8d09041ef79a0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 7 Mar 2021 13:43:58 -0800
Subject: [PATCH 004/132] basic bindings, header, placeholder c++ code

---
 cpp/include/cudf/transform.hpp          |  9 +++++++++
 cpp/src/transform/transform.cpp         | 18 ++++++++++++++++++
 python/cudf/cudf/_lib/cpp/transform.pxd |  7 +++++++
 python/cudf/cudf/core/udf.py            |  7 +++++++
 4 files changed, 41 insertions(+)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 9b740d207e1..c2c9e954235 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -53,6 +53,15 @@ std::unique_ptr<column> transform(
   bool is_ptx,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+
+std::unique_ptr<column> masked_binary_op(
+  column_view const& A,
+  column_view const& B,
+  std::string const& binary_udf,
+  data_type output_type,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
  * preserving existing null values and also returns new null_count.
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index f7fc7f832c4..da98d99616e 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -131,4 +131,22 @@ std::unique_ptr<column> transform(column_view const& input,
   return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr);
 }
 
+std::unique_ptr<column> masked_binary_op(column_view const& A, 
+                                         column_view const& B, 
+                                         std::string const& binary_udf, 
+                                         data_type output_type, 
+                                         rmm::mr::device_memory_resource* mr)
+{
+  std::cout << "hello world!" << std::endl;
+
+  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
+
+  std::unique_ptr<column> output = make_fixed_width_column(
+    output_type, A.size(), copy_bitmask(A), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+
+  return output;
+}
+
+
+
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index 5e37336cb94..624c36f9b19 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -38,6 +38,13 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         bool is_ptx
     ) except +
 
+    cdef unique_ptr[column] masked_binary_op(
+        column_view A,
+        column_view B, 
+        string binary_udf,
+        data_type output_type
+    ) except +
+
     cdef pair[unique_ptr[table], unique_ptr[column]] encode(
         table_view input
     ) except +
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 0caffb00768..2c58b6bc951 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -16,6 +16,7 @@
 from numba.cuda.cudadecl import registry as cuda_registry
 from numba.cuda.cudaimpl import lower as cuda_lower
 from numba.extending import types
+import inspect
 
 
 class Masked(object):
@@ -103,3 +104,9 @@ def masked_add_py(m1, m2):
 
 def masked_add_py_2(m1, m2):
     return m1 + m2
+
+def compile_udf(func):
+
+    signature = (numba_masked, numba_masked)
+    ptx, _ = cuda.compile_ptx_for_current_device(func, signature, device=True)
+    return ptx

From 1b2c00c2e1dc50f5d01df22a2e3af4657cfe94b2 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 7 Mar 2021 13:45:40 -0800
Subject: [PATCH 005/132] missed one cython file - bindings work and run

---
 python/cudf/cudf/_lib/transform.pyx | 30 +++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index f061f8de942..93090cc78fc 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -127,6 +127,36 @@ def transform(Column input, op):
 
     return Column.from_unique_ptr(move(c_output))
 
+def masked_binary_op(Column A, Column B, op):
+    cdef column_view A_view = A.view()
+    cdef column_view B_view = B.view()
+
+    cdef string c_str
+    cdef type_id c_tid
+    cdef data_type c_dtype
+
+    if A.dtype != np.dtype('int64') or B.dtype != np.dtype('int64'):
+        raise TypeError('int64 please')
+    
+ 
+    from cudf.core.udf import compile_udf
+    st = compile_udf(op).encode('UTF-8')
+
+    c_tid = <type_id> (
+        <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]
+    )
+    c_dtype = data_type(c_tid)
+
+    with nogil:
+        c_output = move(libcudf_transform.masked_binary_op(
+            A_view,
+            B_view,
+            c_str,
+            c_dtype,
+        ))
+
+    return Column.from_unique_ptr(move(c_output))
+
 
 def table_encode(Table input):
     cdef table_view c_input = input.data_view()

From 7584ad3e8b1303c866c706bcc44fe776e192da3b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 7 Mar 2021 13:53:14 -0800
Subject: [PATCH 006/132] fix bug

---
 python/cudf/cudf/_lib/transform.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 93090cc78fc..1fea3e7bbec 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -140,7 +140,7 @@ def masked_binary_op(Column A, Column B, op):
     
  
     from cudf.core.udf import compile_udf
-    st = compile_udf(op).encode('UTF-8')
+    c_str = compile_udf(op).encode('UTF-8')
 
     c_tid = <type_id> (
         <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]

From 4988b14487979f914228e14b198e9b13bd49d34d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 8 Mar 2021 05:07:00 -0800
Subject: [PATCH 007/132] little more progress

---
 cpp/src/transform/jit/code/kernel.cpp | 21 +++++++++++++++++++++
 cpp/src/transform/transform.cpp       | 12 +++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index 58fdb945de3..29f7cb4a1d7 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -53,6 +53,27 @@ const char* kernel =
     }
   )***";
 
+const char* masked_binary_op_kernel = 
+  R"***(
+    template <typename TypeOut, typename TypeIn1, typename TypeIn2>
+    __global__
+    void kernel(cudf::size_type size,
+                    TypeOut* out_data, TypeIn1* in_data1, TypeIn2 in_data2) {
+        int tid = threadIdx.x;
+        int blkid = blockIdx.x;
+        int blksz = blockDim.x;
+        int gridsz = gridDim.x;
+
+        int start = tid + blkid * blksz;
+        int step = blksz * gridsz;
+
+        for (cudf::size_type i=start; i<size; i+=step) {
+          GENERIC_BINARY_OP(&out_data[i], in_data[i]);  
+        }
+    }
+  )***";
+
+
 }  // namespace code
 }  // namespace jit
 }  // namespace transformation
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index da98d99616e..f0b4bd87501 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -137,13 +137,23 @@ std::unique_ptr<column> masked_binary_op(column_view const& A,
                                          data_type output_type, 
                                          rmm::mr::device_memory_resource* mr)
 {
-  std::cout << "hello world!" << std::endl;
+  std::cout << "ehllo " << std::endl;
+  std::cout << binary_udf << std::endl;
 
   rmm::cuda_stream_view stream = rmm::cuda_stream_default;
 
+
+
   std::unique_ptr<column> output = make_fixed_width_column(
     output_type, A.size(), copy_bitmask(A), cudf::UNKNOWN_NULL_COUNT, stream, mr);
 
+  auto null_mask = cudf::create_null_mask(A.size(), mask_state::ALL_VALID, mr);
+
+  std::unique_ptr<column> output_mask = make_fixed_width_column(
+    cudf::data_type{cudf::type_id::BOOL8}, A.size(), null_mask, cudf::UNKNOWN_NULL_COUNT, stream, mr);
+
+  
+
   return output;
 }
 

From 7a6427cdccb29238ef271ee36fcfb3eefefa1381 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 Mar 2021 18:48:41 -0800
Subject: [PATCH 008/132] an attempt at NA plumbing

---
 python/cudf/cudf/core/udf.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 2c58b6bc951..891406acba8 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -18,6 +18,7 @@
 from numba.extending import types
 import inspect
 
+from cudf.core.scalar import _NAType
 
 class Masked(object):
     def __init__(self, value, valid):
@@ -29,12 +30,21 @@ class MaskedType(types.Type):
     def __init__(self):
         super().__init__(name="Masked")
 
+class NAType(types.Type):
+    def __init__(self):
+        super().__init__(name="NA")
+
 numba_masked = MaskedType()  # name this something more natural - GM
+numba_na = NAType()
+
 
 @typeof_impl.register(Masked)
 def typeof_masked(val, c):
     return numba_masked
 
+@typeof_impl.register(_NAType)
+def typeof_na(val, c):
+    return numba_na
 
 @type_callable(Masked)
 def type_masked(context):
@@ -73,6 +83,12 @@ def generic(self, args, kws):
             return signature(numba_masked, numba_masked, numba_masked)
 
 
+@cuda_registry.register_global(operator.add)
+class MaskedScalarAddNull(AbstractTemplate):
+    def generic(self, args, kws):
+        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
+            return signature(numba_masked, numba_masked, numba_na)
+
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
     # get the types from the signature
@@ -96,6 +112,13 @@ def masked_scalar_add_impl(context, builder, sig, args):
 
     return result._getvalue()
 
+@cuda_lower(operator.add, MaskedType, NAType)
+def masked_scalar_add_na_impl(context, builder, sig, args):
+    return_type = sig.return_type
+    result = cgutils.create_struct_proxy(return_type)(context, builder)
+    result.valid = False
+    return result._getvalue()
+
 
 @cuda.jit(numba_masked(numba_masked, numba_masked), device=True)
 def masked_add_py(m1, m2):
@@ -110,3 +133,8 @@ def compile_udf(func):
     signature = (numba_masked, numba_masked)
     ptx, _ = cuda.compile_ptx_for_current_device(func, signature, device=True)
     return ptx
+
+@cuda.jit(numba_masked(numba_masked), device=True)
+def test_scalar_null_add(masked):
+    result = masked + cudf.NA
+    return masked

From 5e6eb0683370df9d5f3d6b4c047f66186d0efc6c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 11 Mar 2021 18:49:52 -0800
Subject: [PATCH 009/132] a little more plumbing and prototyping

---
 cpp/include/cudf/transform.hpp          |  2 ++
 cpp/src/transform/jit/code/kernel.cpp   | 17 ++++++++++++++++-
 cpp/src/transform/transform.cpp         | 16 ++++++++++++----
 python/cudf/cudf/_lib/cpp/transform.pxd |  4 +++-
 python/cudf/cudf/_lib/transform.pyx     |  8 ++++++++
 5 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index c2c9e954235..ad1d260f943 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -59,6 +59,8 @@ std::unique_ptr<column> masked_binary_op(
   column_view const& B,
   std::string const& binary_udf,
   data_type output_type,
+  column_view const& outcol_view,
+  column_view const& outmsk_view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index 29f7cb4a1d7..4f490956ace 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -31,10 +31,17 @@ const char* kernel_header =
 
     #include <cudf/types.hpp>
     #include <cudf/wrappers/timestamps.hpp>
+
+    struct Masked {
+      int value;
+      bool valid;
+    };
+
   )***";
 
 const char* kernel =
   R"***(
+
     template <typename TypeOut, typename TypeIn>
     __global__
     void kernel(cudf::size_type size,
@@ -47,6 +54,10 @@ const char* kernel =
         int start = tid + blkid * blksz;
         int step = blksz * gridsz;
 
+        Masked m;
+        m.value = 1;
+        m.valid = true;
+
         for (cudf::size_type i=start; i<size; i+=step) {
           GENERIC_UNARY_OP(&out_data[i], in_data[i]);  
         }
@@ -55,6 +66,8 @@ const char* kernel =
 
 const char* masked_binary_op_kernel = 
   R"***(
+    
+
     template <typename TypeOut, typename TypeIn1, typename TypeIn2>
     __global__
     void kernel(cudf::size_type size,
@@ -67,8 +80,10 @@ const char* masked_binary_op_kernel =
         int start = tid + blkid * blksz;
         int step = blksz * gridsz;
 
+        Masked m;
+
         for (cudf::size_type i=start; i<size; i+=step) {
-          GENERIC_BINARY_OP(&out_data[i], in_data[i]);  
+          GENERIC_BINARY_OP(&out_data[i], in_data1[i], in_data2[i]);  
         }
     }
   )***";
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index f0b4bd87501..ca5921cd306 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -135,24 +135,32 @@ std::unique_ptr<column> masked_binary_op(column_view const& A,
                                          column_view const& B, 
                                          std::string const& binary_udf, 
                                          data_type output_type, 
+                                         column_view const& outcol_view,
+                                         column_view const& outmsk_view,
                                          rmm::mr::device_memory_resource* mr)
 {
   std::cout << "ehllo " << std::endl;
   std::cout << binary_udf << std::endl;
 
   rmm::cuda_stream_view stream = rmm::cuda_stream_default;
+zz
 
 
+  std::string parsed_ptx = cudf::jit::parse_single_function_ptx(
+                     binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
+
+
+  std::cout << "successfully parsed PTX!!!" << std::endl;
+  std::cout << "__________________________" << std::endl;
+  std::cout << parsed_ptx << std::endl;
+  std::cout << "__________________________" << std::endl;
+
 
   std::unique_ptr<column> output = make_fixed_width_column(
     output_type, A.size(), copy_bitmask(A), cudf::UNKNOWN_NULL_COUNT, stream, mr);
 
-  auto null_mask = cudf::create_null_mask(A.size(), mask_state::ALL_VALID, mr);
 
-  std::unique_ptr<column> output_mask = make_fixed_width_column(
-    cudf::data_type{cudf::type_id::BOOL8}, A.size(), null_mask, cudf::UNKNOWN_NULL_COUNT, stream, mr);
 
-  
 
   return output;
 }
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index 624c36f9b19..217b3374579 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -42,7 +42,9 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         column_view A,
         column_view B, 
         string binary_udf,
-        data_type output_type
+        data_type output_type,
+        column_view outcol_view,
+        column_view outmask_view,
     ) except +
 
     cdef pair[unique_ptr[table], unique_ptr[column]] encode(
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 1fea3e7bbec..b94d2b33bce 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -147,12 +147,20 @@ def masked_binary_op(Column A, Column B, op):
     )
     c_dtype = data_type(c_tid)
 
+    cdef Column output_column = cudf.core.column.column_empty(len(A), dtype='int64')
+    cdef Column output_mask = cudf.core.column.column_empty(len(A), dtype='bool')
+
+    cdef column_view outcol_view = output_column.view()
+    cdef column_view outmsk_view = output_mask.view()
+
     with nogil:
         c_output = move(libcudf_transform.masked_binary_op(
             A_view,
             B_view,
             c_str,
             c_dtype,
+            outcol_view,
+            outmsk_view
         ))
 
     return Column.from_unique_ptr(move(c_output))

From ea15da63ce003f4428b2c5f026aabd10032a2a38 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 12 Mar 2021 09:12:05 -0800
Subject: [PATCH 010/132] lots of progress

---
 cpp/src/transform/jit/code/code.h     |  1 +
 cpp/src/transform/jit/code/kernel.cpp | 32 +++++++----
 cpp/src/transform/transform.cpp       | 77 ++++++++++++++-------------
 python/cudf/cudf/core/udf.py          | 41 +++++++++++---
 4 files changed, 98 insertions(+), 53 deletions(-)

diff --git a/cpp/src/transform/jit/code/code.h b/cpp/src/transform/jit/code/code.h
index cc3d6a8fe89..b12fac6e2c7 100644
--- a/cpp/src/transform/jit/code/code.h
+++ b/cpp/src/transform/jit/code/code.h
@@ -24,6 +24,7 @@ extern const char* kernel_header;
 extern const char* kernel;
 extern const char* traits;
 extern const char* operation;
+extern const char* masked_binary_op_kernel;
 
 }  // namespace code
 }  // namespace jit
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index 4f490956ace..d0632e8385a 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -28,9 +28,9 @@ const char* kernel_header =
     #include <cuda/std/climits>
     #include <cuda/std/cstddef>
     #include <cuda/std/limits>
-
     #include <cudf/types.hpp>
     #include <cudf/wrappers/timestamps.hpp>
+    #include <cudf/utilities/bit.hpp>
 
     struct Masked {
       int value;
@@ -54,10 +54,6 @@ const char* kernel =
         int start = tid + blkid * blksz;
         int step = blksz * gridsz;
 
-        Masked m;
-        m.value = 1;
-        m.valid = true;
-
         for (cudf::size_type i=start; i<size; i+=step) {
           GENERIC_UNARY_OP(&out_data[i], in_data[i]);  
         }
@@ -70,8 +66,15 @@ const char* masked_binary_op_kernel =
 
     template <typename TypeOut, typename TypeIn1, typename TypeIn2>
     __global__
-    void kernel(cudf::size_type size,
-                    TypeOut* out_data, TypeIn1* in_data1, TypeIn2 in_data2) {
+    void kernel(cudf::size_type size, 
+                cudf::size_type offset,
+                TypeOut* out_data, 
+                cudf::bitmask_type const* out_mask,
+                TypeIn1* in_data1, 
+                cudf::bitmask_type const* in_data1_mask,
+                TypeIn2* in_data2,
+                cudf::bitmask_type const* in_data2_mask
+    ) {
         int tid = threadIdx.x;
         int blkid = blockIdx.x;
         int blksz = blockDim.x;
@@ -80,10 +83,21 @@ const char* masked_binary_op_kernel =
         int start = tid + blkid * blksz;
         int step = blksz * gridsz;
 
-        Masked m;
+        Masked output;
 
         for (cudf::size_type i=start; i<size; i+=step) {
-          GENERIC_BINARY_OP(&out_data[i], in_data1[i], in_data2[i]);  
+          bool mask_1 = cudf::bit_is_set(in_data1_mask, offset + i);
+          bool mask_2 = cudf::bit_is_set(in_data2_mask, offset + i);
+          
+          GENERIC_BINARY_OP(&output, 
+                            in_data1[i], 
+                            mask_1, 
+                            in_data2[i],
+                            mask_2);  
+
+          out_data[i] = output.value;
+          out_mask[i] = output.valid;
+
         }
     }
   )***";
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index ca5921cd306..d5705082dab 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -58,24 +58,12 @@ void unary_operation(mutable_column_view output,
                      rmm::cuda_stream_view stream)
 {
   std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(udf));
-
-  std::cout << "The program's hash is:" << std::endl;
-  std::cout << hash << std::endl;
-
-  std::cout << "the actual udf string is: " << std::endl;
-  std::cout << udf << std::endl;
-
-
-  std::cout << "cuda_source is:" << std::endl;
   std::string cuda_source = code::kernel_header;
-  std::cout << cuda_source << std::endl;
 
   if (is_ptx) {
     cuda_source += cudf::jit::parse_single_function_ptx(
                      udf, "GENERIC_UNARY_OP", cudf::jit::get_type_name(output_type), {0}) +
                    code::kernel;
-    std::cout << "cuda_source after is_ptx condition: " << std::endl;
-    std::cout << cuda_source << std::endl;
   } else {
     cuda_source += cudf::jit::parse_single_function_cuda(udf, "GENERIC_UNARY_OP") + code::kernel;
   }
@@ -93,6 +81,26 @@ void unary_operation(mutable_column_view output,
     .launch(output.size(), cudf::jit::get_data_ptr(output), cudf::jit::get_data_ptr(input));
 }
 
+
+void binary_operation(column_view const& A, 
+                      column_view const& B, 
+                      std::string const& binary_udf, 
+                      data_type output_type, 
+                      column_view const& outcol_view,
+                      column_view const& outmsk_view,
+                      rmm::mr::device_memory_resource* mr)
+{
+  std::string cuda_source = code::kernel_header;
+  cuda_source += cudf::jit::parse_single_function_ptx(
+                     binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
+
+  cuda_source += code::masked_binary_op_kernel;
+
+  std::cout << "*** CUDA_SOURCE ***" << std::endl;
+  std::cout << cuda_source << std::endl;
+
+}
+
 }  // namespace jit
 }  // namespace transformation
 
@@ -119,6 +127,24 @@ std::unique_ptr<column> transform(column_view const& input,
   return output;
 }
 
+std::unique_ptr<column> masked_binary_op_inner(column_view const& A, 
+                                         column_view const& B, 
+                                         std::string const& binary_udf, 
+                                         data_type output_type, 
+                                         column_view const& outcol_view,
+                                         column_view const& outmsk_view,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
+  transformation::jit::binary_operation(A, B, binary_udf, output_type, outcol_view, outmsk_view, mr);
+
+  std::unique_ptr<column> output = make_fixed_width_column(
+    output_type, A.size(), copy_bitmask(A), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+
+
+  return output;
+}
+
 }  // namespace detail
 
 std::unique_ptr<column> transform(column_view const& input,
@@ -139,32 +165,9 @@ std::unique_ptr<column> masked_binary_op(column_view const& A,
                                          column_view const& outmsk_view,
                                          rmm::mr::device_memory_resource* mr)
 {
-  std::cout << "ehllo " << std::endl;
-  std::cout << binary_udf << std::endl;
-
-  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
-zz
-
-
-  std::string parsed_ptx = cudf::jit::parse_single_function_ptx(
-                     binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
-
-
-  std::cout << "successfully parsed PTX!!!" << std::endl;
-  std::cout << "__________________________" << std::endl;
-  std::cout << parsed_ptx << std::endl;
-  std::cout << "__________________________" << std::endl;
-
-
-  std::unique_ptr<column> output = make_fixed_width_column(
-    output_type, A.size(), copy_bitmask(A), cudf::UNKNOWN_NULL_COUNT, stream, mr);
-
-
-
-
-  return output;
+  std::cout << "HERE!!" << std::endl;
+  return detail::masked_binary_op_inner(A, B, binary_udf, output_type, outcol_view, outmsk_view, mr);
 }
 
 
-
 }  // namespace cudf
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 891406acba8..b2f59b5133b 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -1,5 +1,4 @@
 import operator
-
 import numba
 import numpy as np
 from numba import cuda, njit
@@ -14,12 +13,15 @@
 from numba.core.typing import signature
 from numba.core.typing.templates import AbstractTemplate
 from numba.cuda.cudadecl import registry as cuda_registry
-from numba.cuda.cudaimpl import lower as cuda_lower
+from numba.cuda.cudaimpl import lower as cuda_lower, registry as cuda_lowering_registry
 from numba.extending import types
 import inspect
 
+from llvmlite import ir
 from cudf.core.scalar import _NAType
 
+from numba.core.extending import make_attribute_wrapper
+
 class Masked(object):
     def __init__(self, value, valid):
         self.value = value
@@ -37,7 +39,6 @@ def __init__(self):
 numba_masked = MaskedType()  # name this something more natural - GM
 numba_na = NAType()
 
-
 @typeof_impl.register(Masked)
 def typeof_masked(val, c):
     return numba_masked
@@ -56,6 +57,8 @@ def typer(value, valid):
 
     return typer
 
+make_attribute_wrapper(MaskedType, "value", "value")
+make_attribute_wrapper(MaskedType, "valid", "valid")
 
 @register_model(MaskedType)
 class MaskedModel(models.StructModel):
@@ -63,6 +66,13 @@ def __init__(self, dmm, fe_type):
         members = [("value", types.int64), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
+#@register_model(NAType) # check what model NoneType uses
+#class NAModel(models.OpaqueModel):
+#    def __init__(self, dmm, fe_type):
+#        members = []
+#        models.StructModel.__init__(self, dmm, fe_type, members)
+
+register_model(NAType)(models.OpaqueModel)
 
 @lower_builtin(Masked, types.int64, types.bool_)
 def impl_masked_constructor(context, builder, sig, args):
@@ -112,13 +122,21 @@ def masked_scalar_add_impl(context, builder, sig, args):
 
     return result._getvalue()
 
+
 @cuda_lower(operator.add, MaskedType, NAType)
 def masked_scalar_add_na_impl(context, builder, sig, args):
-    return_type = sig.return_type
-    result = cgutils.create_struct_proxy(return_type)(context, builder)
-    result.valid = False
+#    return_type = sig.return_type
+    # use context to get llvm type for a bool
+    breakpoint()
+    result = cgutils.create_struct_proxy(numba_masked)(context, builder)
+    result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
 
+@cuda_lowering_registry.lower_constant(NAType)
+def constant_dummy(context, builder, ty, pyval):
+    # This handles None, etc.
+    return context.get_dummy_value()
+
 
 @cuda.jit(numba_masked(numba_masked, numba_masked), device=True)
 def masked_add_py(m1, m2):
@@ -137,4 +155,13 @@ def compile_udf(func):
 @cuda.jit(numba_masked(numba_masked), device=True)
 def test_scalar_null_add(masked):
     result = masked + cudf.NA
-    return masked
+    return result
+
+@cuda.jit
+def test_test_scalar_null_add(data, masks):
+    m = Masked(1, True)
+    result = test_scalar_null_add(m)
+    print(result.value)
+    print(types.int8(result.valid))
+    data[0] = result.value
+    masks[0] = result.valid

From 961a9dd3db0e2250553f4f6a074d10657db08abd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 15 Mar 2021 17:46:01 -0700
Subject: [PATCH 011/132] trying to plumb to jitify launcher

---
 cpp/src/transform/jit/code/code.h     |  1 +
 cpp/src/transform/jit/code/kernel.cpp | 58 ++++++++++++---------
 cpp/src/transform/transform.cpp       | 73 ++++++++++++++++++++++++++-
 python/cudf/cudf/core/udf.py          |  5 +-
 4 files changed, 109 insertions(+), 28 deletions(-)

diff --git a/cpp/src/transform/jit/code/code.h b/cpp/src/transform/jit/code/code.h
index b12fac6e2c7..c5321ad824a 100644
--- a/cpp/src/transform/jit/code/code.h
+++ b/cpp/src/transform/jit/code/code.h
@@ -25,6 +25,7 @@ extern const char* kernel;
 extern const char* traits;
 extern const char* operation;
 extern const char* masked_binary_op_kernel;
+extern const char* null_kernel;
 
 }  // namespace code
 }  // namespace jit
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index d0632e8385a..d12fa8e158b 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -64,44 +64,52 @@ const char* masked_binary_op_kernel =
   R"***(
     
 
-    template <typename TypeOut, typename TypeIn1, typename TypeIn2>
+    template <typename TypeOut, typename TypeLhs, typename TypeRhs>
     __global__
-    void kernel(cudf::size_type size, 
-                cudf::size_type offset,
+    void test_binop_kernel(cudf::size_type size, 
                 TypeOut* out_data, 
-                cudf::bitmask_type const* out_mask,
-                TypeIn1* in_data1, 
-                cudf::bitmask_type const* in_data1_mask,
-                TypeIn2* in_data2,
-                cudf::bitmask_type const* in_data2_mask
+                TypeLhs* lhs_data,
+                TypeRhs* rhs_data 
+                //bool* output_mask,
+                //cudf::bitmask_type const* lhs_mask, 
+                //cudf::size_type lhs_offset,
+                //cudf::bitmask_type const* rhs_mask, 
+                //cudf::size_type rhs_offset)
     ) {
-        int tid = threadIdx.x;
-        int blkid = blockIdx.x;
-        int blksz = blockDim.x;
-        int gridsz = gridDim.x;
+        //int tid = threadIdx.x;
+        //int blkid = blockIdx.x;
+        //int blksz = blockDim.x;
+        //int gridsz = gridDim.x;
 
-        int start = tid + blkid * blksz;
-        int step = blksz * gridsz;
+        //int start = tid + blkid * blksz;
+        //int step = blksz * gridsz;
 
-        Masked output;
+        //Masked output;
 
-        for (cudf::size_type i=start; i<size; i+=step) {
-          bool mask_1 = cudf::bit_is_set(in_data1_mask, offset + i);
-          bool mask_2 = cudf::bit_is_set(in_data2_mask, offset + i);
+        //for (cudf::size_type i=start; i<size; i+=step) {
+        //bool l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true,
+        //bool r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true,
           
-          GENERIC_BINARY_OP(&output, 
-                            in_data1[i], 
-                            mask_1, 
-                            in_data2[i],
-                            mask_2);  
+        //GENERIC_BINARY_OP(&output, 
+        //                  lhs_data[i], 
+        //                  l_valid, 
+        //                  rhs_data[i],
+        //                  r_valid);  
 
-          out_data[i] = output.value;
-          out_mask[i] = output.valid;
+        //  out_data[i] = output.value;
+        //  out_mask[i] = output.valid;
 
         }
     }
   )***";
 
+const char* null_kernel =
+  R"***(
+    template <typename TypeOut, typename Type2>
+    __global__
+    void null_kernel(TypeOut* out_data, Type2* test) {}
+
+  )***";
 
 }  // namespace code
 }  // namespace jit
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index d5705082dab..c22be708857 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -90,15 +90,86 @@ void binary_operation(column_view const& A,
                       column_view const& outmsk_view,
                       rmm::mr::device_memory_resource* mr)
 {
+
+  std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(binary_udf));
+
   std::string cuda_source = code::kernel_header;
   cuda_source += cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
 
-  cuda_source += code::masked_binary_op_kernel;
+  cuda_source += code::null_kernel;
 
   std::cout << "*** CUDA_SOURCE ***" << std::endl;
   std::cout << cuda_source << std::endl;
 
+  rmm::cuda_stream_view stream;
+
+  // Launch the jitify kernel
+
+  cudf::jit::launcher(hash,
+                      cuda_source,
+                      header_names,
+                      cudf::jit::compiler_flags,
+                      headers_code,
+                      stream)
+    .set_kernel_inst("null_kernel",
+                      {
+                        cudf::jit::get_type_name(outcol_view.type()), cudf::jit::get_type_name(outcol_view.type()),
+                      }
+    );
+  
+  /*
+
+
+  cudf::jit::launcher(hash,
+                      cuda_source,
+                      header_names,
+                      cudf::jit::compiler_flags,
+                      headers_code,
+                      stream)
+    .set_kernel_inst("test_binop_kernel",  // name of the kernel we are launching
+                     {cudf::jit::get_type_name(outcol_view.type()),  // list of template arguments
+                      cudf::jit::get_type_name(A.type()),
+                      cudf::jit::get_type_name(B.type())});
+    //.launch(outcol_view.size(), 
+    //        cudf::jit::get_data_ptr(outcol_view), 
+    //        cudf::jit::get_data_ptr(A),
+    //        cudf::jit::get_data_ptr(B),
+    //        cudf::jit::get_data_ptr(outmsk_view), // ?
+    //        A.null_mask(),
+    //        A.offset(),
+    //        B.null_mask(),
+    //        B.offset()
+    //);
+
+  */
+
+  /*
+    void kernel(cudf::size_type size, 
+                TypeOut* out_data, 
+                TypeLhs* lhs_data,
+                TypeRhs* rhs_data, 
+                cudf::bitmask_type* output_mask,
+                cudf::bitmask_type const* lhs_mask, 
+                cudf::size_type lhs_offset,
+                cudf::bitmask_type const* rhs_mask, 
+                cudf::size_type rhs_offset)
+
+       .set_kernel_inst{cudf::jit::get_type_name(out.type()),  // list of template arguments
+                        cudf::jit::get_type_name(lhs.type()),
+                        cudf::jit::get_type_name(rhs.type()),
+                        get_operator_name(op, OperatorType::Direct)})
+      .launch(out.size(),
+              cudf::jit::get_data_ptr(out),
+              cudf::jit::get_data_ptr(lhs),
+              cudf::jit::get_data_ptr(rhs),
+              out.null_mask(),
+              lhs.null_mask(),
+              rhs.offset(),
+              rhs.null_mask(),
+              rhs.offset());
+   */
+
 }
 
 }  // namespace jit
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index b2f59b5133b..215254ef97a 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -127,7 +127,6 @@ def masked_scalar_add_impl(context, builder, sig, args):
 def masked_scalar_add_na_impl(context, builder, sig, args):
 #    return_type = sig.return_type
     # use context to get llvm type for a bool
-    breakpoint()
     result = cgutils.create_struct_proxy(numba_masked)(context, builder)
     result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
@@ -152,9 +151,11 @@ def compile_udf(func):
     ptx, _ = cuda.compile_ptx_for_current_device(func, signature, device=True)
     return ptx
 
+NA = _NAType()
+
 @cuda.jit(numba_masked(numba_masked), device=True)
 def test_scalar_null_add(masked):
-    result = masked + cudf.NA
+    result = masked + NA
     return result
 
 @cuda.jit

From 5e930942417b967faf87004d4b28661bf977c586 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 16 Mar 2021 08:49:44 -0700
Subject: [PATCH 012/132] progress on jitify template/launch

---
 cpp/src/transform/jit/code/kernel.cpp |  7 +++++--
 cpp/src/transform/transform.cpp       | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index d12fa8e158b..9f7644875fc 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -105,9 +105,12 @@ const char* masked_binary_op_kernel =
 
 const char* null_kernel =
   R"***(
-    template <typename TypeOut, typename Type2>
+    template <typename TypeOut, typename TypeLhs, typename TypeRhs>
     __global__
-    void null_kernel(TypeOut* out_data, Type2* test) {}
+    void null_kernel(cudf::size_type size,
+                     TypeOut* out_data, 
+                     TypeLhs* lhs_data,
+                     TypeRhs* rhs_data) {}
 
   )***";
 
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index c22be708857..b0f28c83201 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -113,11 +113,19 @@ void binary_operation(column_view const& A,
                       headers_code,
                       stream)
     .set_kernel_inst("null_kernel",
-                      {
-                        cudf::jit::get_type_name(outcol_view.type()), cudf::jit::get_type_name(outcol_view.type()),
-                      }
+                    {
+                      cudf::jit::get_type_name(outcol_view.type()), 
+                      cudf::jit::get_type_name(A.type()),
+                      cudf::jit::get_type_name(B.type())
+                    }
+    )
+    .launch(outcol_view.size(),
+      outcol_view.size(),
+      cudf::jit::get_data_ptr(outcol_view),
+      cudf::jit::get_data_ptr(A),
+      cudf::jit::get_data_ptr(B)      
     );
-  
+    std::cout << "KERNEL LAUNCHED!!!" << std::endl;
   /*
 
 

From 03edcebb3a67d9d9c6e15b465459afc319d417e7 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 16 Mar 2021 11:26:37 -0700
Subject: [PATCH 013/132] null kernel launches with all arguments

---
 cpp/src/transform/jit/code/kernel.cpp |  8 +++++++-
 cpp/src/transform/transform.cpp       | 14 +++++++++-----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index 9f7644875fc..703873b2fa5 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -110,7 +110,13 @@ const char* null_kernel =
     void null_kernel(cudf::size_type size,
                      TypeOut* out_data, 
                      TypeLhs* lhs_data,
-                     TypeRhs* rhs_data) {}
+                     TypeRhs* rhs_data,
+                     bool* out_mask,
+                     cudf::bitmask_type const* lhs_mask,
+                     cudf::size_type lhs_offset,
+                     cudf::bitmask_type const* rhs_mask,
+                     cudf::size_type rhs_offset
+                    ) {}
 
   )***";
 
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index b0f28c83201..d222bbc5ef2 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -116,14 +116,18 @@ void binary_operation(column_view const& A,
                     {
                       cudf::jit::get_type_name(outcol_view.type()), 
                       cudf::jit::get_type_name(A.type()),
-                      cudf::jit::get_type_name(B.type())
+                      cudf::jit::get_type_name(B.type()),
                     }
     )
     .launch(outcol_view.size(),
-      outcol_view.size(),
-      cudf::jit::get_data_ptr(outcol_view),
-      cudf::jit::get_data_ptr(A),
-      cudf::jit::get_data_ptr(B)      
+            cudf::jit::get_data_ptr(outcol_view),
+            cudf::jit::get_data_ptr(A),
+            cudf::jit::get_data_ptr(B),
+            cudf::jit::get_data_ptr(outmsk_view),
+            A.null_mask(),
+            A.offset(),
+            B.null_mask(),
+            B.offset()
     );
     std::cout << "KERNEL LAUNCHED!!!" << std::endl;
   /*

From 2b4c36fb965891e89adb7f4c672f9bd077fcba65 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 16 Mar 2021 12:45:15 -0700
Subject: [PATCH 014/132] bit_is_set works

---
 cpp/src/transform/jit/code/kernel.cpp | 21 ++++++++++++++++++++-
 cpp/src/transform/transform.cpp       |  3 ++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index 703873b2fa5..12944915632 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -116,7 +116,26 @@ const char* null_kernel =
                      cudf::size_type lhs_offset,
                      cudf::bitmask_type const* rhs_mask,
                      cudf::size_type rhs_offset
-                    ) {}
+                    ) {
+        int tid = threadIdx.x;
+        int blkid = blockIdx.x;
+        int blksz = blockDim.x;
+        int gridsz = gridDim.x;
+
+        int start = tid + blkid * blksz;
+        int step = blksz * gridsz;
+
+        Masked output;
+
+        bool l_valid;
+        bool r_valid;
+
+
+        for (cudf::size_type i=start; i<size; i+=step) {
+          l_valid = cudf::bit_is_set(lhs_mask, lhs_offset + i);
+        }
+
+    }
 
   )***";
 
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index d222bbc5ef2..6a552d4bb58 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -31,6 +31,7 @@
 
 #include <jit/timestamps.hpp.jit>
 #include <jit/types.hpp.jit>
+#include <jit/bit.hpp.jit>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -39,7 +40,7 @@ namespace transformation {
 //! Jit functions
 namespace jit {
 
-const std::vector<std::string> header_names{cudf_types_hpp, cudf_wrappers_timestamps_hpp};
+const std::vector<std::string> header_names{cudf_types_hpp, cudf_wrappers_timestamps_hpp, cudf_utilities_bit_hpp};
 
 std::istream* headers_code(std::string filename, std::iostream& stream)
 {

From 3f76df5374ba9370a4516784cd6b26b042ab3de3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 16 Mar 2021 19:16:36 -0700
Subject: [PATCH 015/132] successfully passing struct through the ptx function

---
 cpp/src/jit/launcher.cpp              |  2 --
 cpp/src/transform/jit/code/kernel.cpp | 23 ++++++++++++++++++-----
 cpp/src/transform/transform.cpp       |  2 ++
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/cpp/src/jit/launcher.cpp b/cpp/src/jit/launcher.cpp
index a19b6a55503..2ddcac7d5ba 100644
--- a/cpp/src/jit/launcher.cpp
+++ b/cpp/src/jit/launcher.cpp
@@ -35,8 +35,6 @@ launcher::launcher(const std::string& hash,
                    rmm::cuda_stream_view stream)
   : cache_instance{cudf::jit::cudfJitCache::Instance()}, stream(stream)
 {
-  std::cout << "launcher::launcher cuda_source:" << std::endl;
-  std::cout << cuda_source << std::endl;
   program = cache_instance.getProgram(
     hash, cuda_source.c_str(), header_names, compiler_flags, file_callback);
 }
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index 12944915632..feb435e3847 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -33,7 +33,7 @@ const char* kernel_header =
     #include <cudf/utilities/bit.hpp>
 
     struct Masked {
-      int value;
+      int64_t value;
       bool valid;
     };
 
@@ -116,7 +116,7 @@ const char* null_kernel =
                      cudf::size_type lhs_offset,
                      cudf::bitmask_type const* rhs_mask,
                      cudf::size_type rhs_offset
-                    ) {
+    ) {
         int tid = threadIdx.x;
         int blkid = blockIdx.x;
         int blksz = blockDim.x;
@@ -127,12 +127,25 @@ const char* null_kernel =
 
         Masked output;
 
-        bool l_valid;
-        bool r_valid;
+        char l_valid;
+        char r_valid;
+
+        long int l_data;
+        long int r_data;
 
+        int64_t* my_int_addr = &output.value;
 
         for (cudf::size_type i=start; i<size; i+=step) {
-          l_valid = cudf::bit_is_set(lhs_mask, lhs_offset + i);
+          l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true;
+          r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true;
+          l_data = lhs_data[i];
+          r_data = rhs_data[i];
+
+          GENERIC_BINARY_OP(my_int_addr, lhs_data[i], l_valid, rhs_data[i], r_valid);
+
+          out_data[i] = output.value;
+          out_mask[i] = output.valid;
+
         }
 
     }
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 6a552d4bb58..c5786240068 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -94,6 +94,8 @@ void binary_operation(column_view const& A,
 
   std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(binary_udf));
 
+  std::cout << binary_udf << std::endl;
+
   std::string cuda_source = code::kernel_header;
   cuda_source += cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});

From db88f9e8a29aa0ddc3307cd6bd42587b6e050f4b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 16 Mar 2021 19:41:20 -0700
Subject: [PATCH 016/132] pipeline fully runs

---
 python/cudf/cudf/_lib/transform.pyx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index b94d2b33bce..912cd452950 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -163,7 +163,12 @@ def masked_binary_op(Column A, Column B, op):
             outmsk_view
         ))
 
-    return Column.from_unique_ptr(move(c_output))
+    #return Column.from_unique_ptr(move(c_output))
+
+    output_mask_real = bools_to_mask(output_mask)
+    output_column = output_column.set_mask(output_mask_real)
+
+    return output_column
 
 
 def table_encode(Table input):

From 9a67670e696d3878688412f4d80d2ac07ee61a5a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 17 Mar 2021 14:40:57 -0700
Subject: [PATCH 017/132] it lives: AND input validity in masked add; caller supplies output columns

---
 cpp/src/transform/jit/code/kernel.cpp | 4 +---
 python/cudf/cudf/_lib/transform.pyx   | 5 +----
 python/cudf/cudf/core/udf.py          | 2 +-
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index feb435e3847..e604b04744e 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -133,15 +133,13 @@ const char* null_kernel =
         long int l_data;
         long int r_data;
 
-        int64_t* my_int_addr = &output.value;
-
         for (cudf::size_type i=start; i<size; i+=step) {
           l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true;
           r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true;
           l_data = lhs_data[i];
           r_data = rhs_data[i];
 
-          GENERIC_BINARY_OP(my_int_addr, lhs_data[i], l_valid, rhs_data[i], r_valid);
+          GENERIC_BINARY_OP(&output.value, lhs_data[i], l_valid, rhs_data[i], r_valid);
 
           out_data[i] = output.value;
           out_mask[i] = output.valid;
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 912cd452950..235dc3e12bd 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -127,7 +127,7 @@ def transform(Column input, op):
 
     return Column.from_unique_ptr(move(c_output))
 
-def masked_binary_op(Column A, Column B, op):
+def masked_binary_op(Column A, Column B, op, Column output_column, Column output_mask):
     cdef column_view A_view = A.view()
     cdef column_view B_view = B.view()
 
@@ -147,9 +147,6 @@ def masked_binary_op(Column A, Column B, op):
     )
     c_dtype = data_type(c_tid)
 
-    cdef Column output_column = cudf.core.column.column_empty(len(A), dtype='int64')
-    cdef Column output_mask = cudf.core.column.column_empty(len(A), dtype='bool')
-
     cdef column_view outcol_view = output_column.view()
     cdef column_view outmsk_view = output_mask.view()
 
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 215254ef97a..7bb59b6ee6f 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -115,7 +115,7 @@ def masked_scalar_add_impl(context, builder, sig, args):
     )
     result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
 
-    valid = builder.or_(m1.valid, m2.valid)
+    valid = builder.and_(m1.valid, m2.valid)
     result.valid = valid
     with builder.if_then(valid):
         result.value = builder.add(m1.value, m2.value)

From b9da4bff99ae07d5c5ca17b069945b9ca29fb742 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 12 Apr 2021 05:58:33 -0700
Subject: [PATCH 018/132] cleanup and add notebook

---
 cpp/src/transform/jit/code/code.h     |    1 -
 cpp/src/transform/jit/code/kernel.cpp |   45 +-
 cpp/src/transform/transform.cpp       |   55 +-
 notebooks/udf-nulls-demo-final.ipynb  | 1574 +++++++++++++++++++++++++
 4 files changed, 1576 insertions(+), 99 deletions(-)
 create mode 100644 notebooks/udf-nulls-demo-final.ipynb

diff --git a/cpp/src/transform/jit/code/code.h b/cpp/src/transform/jit/code/code.h
index c5321ad824a..b12fac6e2c7 100644
--- a/cpp/src/transform/jit/code/code.h
+++ b/cpp/src/transform/jit/code/code.h
@@ -25,7 +25,6 @@ extern const char* kernel;
 extern const char* traits;
 extern const char* operation;
 extern const char* masked_binary_op_kernel;
-extern const char* null_kernel;
 
 }  // namespace code
 }  // namespace jit
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
index e604b04744e..dc0372baf05 100644
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ b/cpp/src/transform/jit/code/kernel.cpp
@@ -60,50 +60,7 @@ const char* kernel =
     }
   )***";
 
-const char* masked_binary_op_kernel = 
-  R"***(
-    
-
-    template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-    __global__
-    void test_binop_kernel(cudf::size_type size, 
-                TypeOut* out_data, 
-                TypeLhs* lhs_data,
-                TypeRhs* rhs_data 
-                //bool* output_mask,
-                //cudf::bitmask_type const* lhs_mask, 
-                //cudf::size_type lhs_offset,
-                //cudf::bitmask_type const* rhs_mask, 
-                //cudf::size_type rhs_offset)
-    ) {
-        //int tid = threadIdx.x;
-        //int blkid = blockIdx.x;
-        //int blksz = blockDim.x;
-        //int gridsz = gridDim.x;
-
-        //int start = tid + blkid * blksz;
-        //int step = blksz * gridsz;
-
-        //Masked output;
-
-        //for (cudf::size_type i=start; i<size; i+=step) {
-        //bool l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true,
-        //bool r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true,
-          
-        //GENERIC_BINARY_OP(&output, 
-        //                  lhs_data[i], 
-        //                  l_valid, 
-        //                  rhs_data[i],
-        //                  r_valid);  
-
-        //  out_data[i] = output.value;
-        //  out_mask[i] = output.valid;
-
-        }
-    }
-  )***";
-
-const char* null_kernel =
+const char* masked_binary_op_kernel =
   R"***(
     template <typename TypeOut, typename TypeLhs, typename TypeRhs>
     __global__
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index c5786240068..9753fe67e5d 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -102,7 +102,6 @@ void binary_operation(column_view const& A,
 
   cuda_source += code::null_kernel;
 
-  std::cout << "*** CUDA_SOURCE ***" << std::endl;
   std::cout << cuda_source << std::endl;
 
   rmm::cuda_stream_view stream;
@@ -115,7 +114,7 @@ void binary_operation(column_view const& A,
                       cudf::jit::compiler_flags,
                       headers_code,
                       stream)
-    .set_kernel_inst("null_kernel",
+    .set_kernel_inst("masked_binary_op_kernel",
                     {
                       cudf::jit::get_type_name(outcol_view.type()), 
                       cudf::jit::get_type_name(A.type()),
@@ -132,58 +131,6 @@ void binary_operation(column_view const& A,
             B.null_mask(),
             B.offset()
     );
-    std::cout << "KERNEL LAUNCHED!!!" << std::endl;
-  /*
-
-
-  cudf::jit::launcher(hash,
-                      cuda_source,
-                      header_names,
-                      cudf::jit::compiler_flags,
-                      headers_code,
-                      stream)
-    .set_kernel_inst("test_binop_kernel",  // name of the kernel we are launching
-                     {cudf::jit::get_type_name(outcol_view.type()),  // list of template arguments
-                      cudf::jit::get_type_name(A.type()),
-                      cudf::jit::get_type_name(B.type())});
-    //.launch(outcol_view.size(), 
-    //        cudf::jit::get_data_ptr(outcol_view), 
-    //        cudf::jit::get_data_ptr(A),
-    //        cudf::jit::get_data_ptr(B),
-    //        cudf::jit::get_data_ptr(outmsk_view), // ?
-    //        A.null_mask(),
-    //        A.offset(),
-    //        B.null_mask(),
-    //        B.offset()
-    //);
-
-  */
-
-  /*
-    void kernel(cudf::size_type size, 
-                TypeOut* out_data, 
-                TypeLhs* lhs_data,
-                TypeRhs* rhs_data, 
-                cudf::bitmask_type* output_mask,
-                cudf::bitmask_type const* lhs_mask, 
-                cudf::size_type lhs_offset,
-                cudf::bitmask_type const* rhs_mask, 
-                cudf::size_type rhs_offset)
-
-       .set_kernel_inst{cudf::jit::get_type_name(out.type()),  // list of template arguments
-                        cudf::jit::get_type_name(lhs.type()),
-                        cudf::jit::get_type_name(rhs.type()),
-                        get_operator_name(op, OperatorType::Direct)})
-      .launch(out.size(),
-              cudf::jit::get_data_ptr(out),
-              cudf::jit::get_data_ptr(lhs),
-              cudf::jit::get_data_ptr(rhs),
-              out.null_mask(),
-              lhs.null_mask(),
-              rhs.offset(),
-              rhs.null_mask(),
-              rhs.offset());
-   */
 
 }
 
diff --git a/notebooks/udf-nulls-demo-final.ipynb b/notebooks/udf-nulls-demo-final.ipynb
new file mode 100644
index 00000000000..bb72fd04378
--- /dev/null
+++ b/notebooks/udf-nulls-demo-final.ipynb
@@ -0,0 +1,1574 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "brutal-princeton",
+   "metadata": {},
+   "source": [
+    "# NA in cuDF UDFs\n",
+    "cuDF's design decision to store null information in bitmasks is really smart, and makes things very performant and tractable in memory-bound circumstances such as GPU operations. However, this design, when coupled with the natural inefficiencies that arise from any kind of serial iteration over our data, has made `<NA>` support in general user-defined functions hard to solve. This notebook offers an approach based on jitting a UDF's arguments as a special custom Numba type to produce a generic PTX function. This function is then inlined into a general kernel in libcudf and passed the relevant data and masks inside of libcudf.  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "scenic-laugh",
+   "metadata": {},
+   "source": [
+    "#### Problem setup: concrete example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "above-athletics",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import cudf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "declared-correspondence",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      x     y\n",
+       "0     1     1\n",
+       "1  <NA>     2\n",
+       "2     3  <NA>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = cudf.DataFrame({\n",
+    "    'x': [1, None, 3],\n",
+    "    'y': [1, 2, None]\n",
+    "})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "expected-milwaukee",
+   "metadata": {},
+   "source": [
+    "Consider the following UDF on two variables, adapted from https://docs.rapids.ai/api/cudf/stable/guide-to-udfs.html. This API is fairly different from the pandas API in several ways:\n",
+    "- In cuDF, we need to write a loop over arrays in classic Numba syntax\n",
+    "- In cuDF, the function returns into an output column we provide as an argument\n",
+    "- The result is different!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "tired-niger",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "      <th>out</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "      <td>2</td>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      x     y   out\n",
+       "0     1     1     2\n",
+       "1  <NA>     2  <NA>\n",
+       "2     3  <NA>     3"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def pandas_add(x, y):\n",
+    "    if x is not pd.NA and x < 2:\n",
+    "        return x + y\n",
+    "    else:\n",
+    "        return x\n",
+    "    \n",
+    "pandas_df = df.to_pandas(nullable=True)\n",
+    "pandas_df['out'] = pandas_df.apply(lambda row: pandas_add(row['x'], row['y']), axis=1)\n",
+    "pandas_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "employed-offer",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gpu_add(x, y, out):\n",
+    "    for i, (xi, yi) in enumerate(zip(x, y)):\n",
+    "        if xi < 2:\n",
+    "            out[i] = xi + yi\n",
+    "        else:\n",
+    "            out[i] = xi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "animal-gateway",
+   "metadata": {},
+   "source": [
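+    "Problem: the null mask of `y` should only be considered when `x < 2`, because that is the only branch that reads `y`. But `apply_rows` combines the input null masks pessimistically, so row 2 comes out as `<NA>` even though pandas returns 3."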
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "occupied-upgrade",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>x</th>\n",
+       "      <th>y</th>\n",
+       "      <th>out</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "      <td>2</td>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      x     y   out\n",
+       "0     1     1   2.0\n",
+       "1  <NA>     2  <NA>\n",
+       "2     3  <NA>  <NA>"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = df.apply_rows(gpu_add,\n",
+    "              incols=['x', 'y'],\n",
+    "              outcols={'out':np.float64},\n",
+    "              kwargs={})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "western-relief",
+   "metadata": {},
+   "source": [
+    "We also don't support comparing `cudf.NA` in any of our UDFs, in any way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "metallic-macedonia",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def gpu_add_error(x, y, out):\n",
+    "    for i, (xi, yi) in enumerate(zip(x, y)):\n",
+    "        if xi is pd.NA:\n",
+    "            return 5\n",
+    "        else:\n",
+    "            return xi + yi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "scenic-graduate",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypingError",
+     "evalue": "Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>) found for signature:\n \n >>> gpu_add_error <CUDA device function>(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))\n \nThere are 2 candidate implementations:\n\u001b[1m  - Of which 2 did not match due to:\n  Overload in function 'gpu_add_error <CUDA device function>': File: ../../../../../../ipynb/<ipython-input-6-e11feefd7c2c>: Line 1.\n    With argument(s): '(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))':\u001b[0m\n\u001b[1m   Rejected as the implementation raised a specific error:\n     TypingError: Failed in nopython mode pipeline (step: nopython frontend)\n   \u001b[1m\u001b[1mUnknown attribute 'NA' of type Module(<module 'pandas' from '/home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/pandas/__init__.py'>)\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n   \u001b[0m\n   \u001b[0m\u001b[1mDuring: typing of get attribute at <ipython-input-6-e11feefd7c2c> (3)\u001b[0m\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n\u001b[0m\n  raised from /home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typeinfer.py:1071\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>)\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of call at <string> 
(8)\n\u001b[0m\n\u001b[1m\nFile \"<string>\", line 8:\u001b[0m\n\u001b[1m<source missing, REPL/exec in use?>\u001b[0m\n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypingError\u001b[0m                               Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-7-ee244a44092a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m               \u001b[0mincols\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'x'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'y'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m               \u001b[0moutcols\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'out'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m               kwargs={})\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/core/dataframe.py\u001b[0m in \u001b[0;36mapply_rows\u001b[0;34m(self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key)\u001b[0m\n\u001b[1;32m   4776\u001b[0m             \u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4777\u001b[0m             \u001b[0mpessimistic_nulls\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4778\u001b[0;31m             \u001b[0mcache_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_key\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4779\u001b[0m         )\n\u001b[1;32m   4780\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/utils/applyutils.py\u001b[0m in \u001b[0;36mapply_rows\u001b[0;34m(df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key)\u001b[0m\n\u001b[1;32m     83\u001b[0m         \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mincols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpessimistic_nulls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_key\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     84\u001b[0m     )\n\u001b[0;32m---> 85\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mapplyrows\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/utils/applyutils.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, df, **launch_params)\u001b[0m\n\u001b[1;32m    168\u001b[0m         \u001b[0mbound\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    169\u001b[0m         \u001b[0;31m# Launch kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 170\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlaunch_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mlaunch_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    171\u001b[0m         \u001b[0;31m# Prepare pessimistic nullmask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    172\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpessimistic_nulls\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/utils/applyutils.py\u001b[0m in \u001b[0;36mlaunch_kernel\u001b[0;34m(self, df, args)\u001b[0m\n\u001b[1;32m    195\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    196\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mlaunch_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    198\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m    370\u001b[0m             \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    371\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 372\u001b[0;31m             \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspecialize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    373\u001b[0m         \u001b[0mblockdim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compute_thread_per_block\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    374\u001b[0m         \u001b[0mgriddim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mntasks\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mblockdim\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0mblockdim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mspecialize\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m    897\u001b[0m         \u001b[0mtargetoptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'link'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    898\u001b[0m         specialization = Dispatcher(self.py_func, [types.void(*argtypes)],\n\u001b[0;32m--> 899\u001b[0;31m                                     self._bind, targetoptions)\n\u001b[0m\u001b[1;32m    900\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspecializations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margtypes\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspecialization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    901\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mspecialization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, func, sigs, bind, targetoptions)\u001b[0m\n\u001b[1;32m    809\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msigs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    810\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Only one signature supported at present\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 811\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msigs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    812\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_compile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    813\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mcompile\u001b[0;34m(self, sig)\u001b[0m\n\u001b[1;32m    947\u001b[0m             kernel = compile_kernel(self.py_func, argtypes,\n\u001b[1;32m    948\u001b[0m                                     \u001b[0mlink\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 949\u001b[0;31m                                     **self.targetoptions)\n\u001b[0m\u001b[1;32m    950\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinitions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    951\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_bind\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mcompile_kernel\u001b[0;34m(pyfunc, args, link, debug, inline, fastmath, extensions, max_registers, opt)\u001b[0m\n\u001b[1;32m     55\u001b[0m def compile_kernel(pyfunc, args, link, debug=False, inline=False,\n\u001b[1;32m     56\u001b[0m                    fastmath=False, extensions=[], max_registers=None, opt=True):\n\u001b[0;32m---> 57\u001b[0;31m     \u001b[0mcres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompile_cuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpyfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvoid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdebug\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minline\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     58\u001b[0m     \u001b[0mfname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfndesc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mllvm_func_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     59\u001b[0m     lib, kernel = cres.target_context.prepare_cuda_kernel(cres.library, fname,\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mcompile_cuda\u001b[0;34m(pyfunc, return_type, args, debug, inline)\u001b[0m\n\u001b[1;32m     44\u001b[0m                                   \u001b[0mreturn_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_type\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     45\u001b[0m                                   \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m                                   locals={})\n\u001b[0m\u001b[1;32m     47\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     48\u001b[0m     \u001b[0mlibrary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlibrary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36mcompile_extra\u001b[0;34m(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)\u001b[0m\n\u001b[1;32m    600\u001b[0m     pipeline = pipeline_class(typingctx, targetctx, library,\n\u001b[1;32m    601\u001b[0m                               args, return_type, flags, locals)\n\u001b[0;32m--> 602\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile_extra\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    603\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    604\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36mcompile_extra\u001b[0;34m(self, func)\u001b[0m\n\u001b[1;32m    350\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlifted\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    351\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlifted_from\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 352\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compile_bytecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    353\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    354\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mcompile_ir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc_ir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlifted\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlifted_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36m_compile_bytecode\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    412\u001b[0m         \"\"\"\n\u001b[1;32m    413\u001b[0m         \u001b[0;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunc_ir\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 414\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compile_core\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    416\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_compile_ir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36m_compile_core\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    392\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfail_reason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    393\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mis_final_pipeline\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 394\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    395\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    396\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mCompilerError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"All available pipelines exhausted\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36m_compile_core\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    383\u001b[0m             \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    384\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m                 \u001b[0mpm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    386\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    387\u001b[0m                     \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m    337\u001b[0m                     \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpipeline_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpass_desc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    338\u001b[0m                 \u001b[0mpatched_exception\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_patch_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mpatched_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    341\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mdependency_analysis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m    328\u001b[0m                 \u001b[0mpass_inst\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_pass_registry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpass_inst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    329\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpass_inst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCompilerPass\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_runPass\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpass_inst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    331\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    332\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mBaseException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Legacy pass in use\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36m_runPass\u001b[0;34m(self, index, pss, internal_state)\u001b[0m\n\u001b[1;32m    287\u001b[0m             \u001b[0mmutated\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_initialization\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    288\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mSimpleTimer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpass_time\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 289\u001b[0;31m             \u001b[0mmutated\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_pass\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    290\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mSimpleTimer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfinalize_time\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    291\u001b[0m             \u001b[0mmutated\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_finalizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36mcheck\u001b[0;34m(func, compiler_state)\u001b[0m\n\u001b[1;32m    260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    261\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcompiler_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 262\u001b[0;31m             \u001b[0mmangled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompiler_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    263\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmangled\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    264\u001b[0m                 msg = (\"CompilerPass implementations should return True/False. \"\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typed_passes.py\u001b[0m in \u001b[0;36mrun_pass\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m     98\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturn_type\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     99\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlocals\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 100\u001b[0;31m                 raise_errors=self._raise_errors)\n\u001b[0m\u001b[1;32m    101\u001b[0m             \u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypemap\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtypemap\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    102\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_errors\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typed_passes.py\u001b[0m in \u001b[0;36mtype_inference_stage\u001b[0;34m(typingctx, interp, args, return_type, locals, raise_errors)\u001b[0m\n\u001b[1;32m     70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     71\u001b[0m         \u001b[0minfer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_constraint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m         \u001b[0minfer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpropagate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     73\u001b[0m         \u001b[0mtypemap\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrestype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcalltypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minfer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typeinfer.py\u001b[0m in \u001b[0;36mpropagate\u001b[0;34m(self, raise_errors)\u001b[0m\n\u001b[1;32m   1069\u001b[0m                                   if isinstance(e, ForceLiteralArg)]\n\u001b[1;32m   1070\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mforce_lit_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1071\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1072\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1073\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moperator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mor_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_lit_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mTypingError\u001b[0m: Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>) found for signature:\n \n >>> gpu_add_error <CUDA device function>(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))\n \nThere are 2 candidate implementations:\n\u001b[1m  - Of which 2 did not match due to:\n  Overload in function 'gpu_add_error <CUDA device function>': File: ../../../../../../ipynb/<ipython-input-6-e11feefd7c2c>: Line 1.\n    With argument(s): '(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))':\u001b[0m\n\u001b[1m   Rejected as the implementation raised a specific error:\n     TypingError: Failed in nopython mode pipeline (step: nopython frontend)\n   \u001b[1m\u001b[1mUnknown attribute 'NA' of type Module(<module 'pandas' from '/home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/pandas/__init__.py'>)\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n   \u001b[0m\n   \u001b[0m\u001b[1mDuring: typing of get attribute at <ipython-input-6-e11feefd7c2c> (3)\u001b[0m\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n\u001b[0m\n  raised from /home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typeinfer.py:1071\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>)\u001b[0m\n\u001b[0m\u001b[1mDuring: 
typing of call at <string> (8)\n\u001b[0m\n\u001b[1m\nFile \"<string>\", line 8:\u001b[0m\n\u001b[1m<source missing, REPL/exec in use?>\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = df.apply_rows(gpu_add_error,\n",
+    "              incols=['x', 'y'],\n",
+    "              outcols={'out':np.float64},\n",
+    "              kwargs={})\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "material-financing",
+   "metadata": {},
+   "source": [
+    "#### Why\n",
+    "This is because nulls are generally handled \"pessimistically\", which roughly means the value of the output mask bit is just set to be a big `or` between all the input columns' bitmasks. This isn't a problem in pandas, because the UDF is applied by looping through the rows and individually passing each value elementwise through the UDF. When it encounters a null, the value that gets passed is `pd.NA`, which behaves the way it needs to for the function to return the correct value for that row. \n",
+    "#### The two things we want to do then are:\n",
+    "- Make the API feel a little more natural\n",
+    "- Be able to explicitly handle nulls in a dynamic way"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "informal-wisdom",
+   "metadata": {},
+   "source": [
+    "# Detour: The cuDF UnaryOp Compilation Pipeline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "mineral-roman",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "Python Function -> Numba -> PTX Code -> libcudf parser -> inlineable function -> Jitify -> Execution\n",
+    "                                                                                    |                                         \n",
+    "                                                             data pointers ---------^\n",
+    "                                                             headers ---------------^\n",
+    "                                                             extra kernel code -----^\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "overall-discretion",
+   "metadata": {},
+   "source": [
+    "The proposed solution to this problem draws heavily on the existing concepts in cuDF's unaryop machinery. This is a situation where the API feels really natural and is quite compatible with pandas, even though ours is named `applymap` and theirs is named `apply` for some reason."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "expired-finder",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = cudf.Series([1, None, 3])\n",
+    "\n",
+    "def f(x):\n",
+    "    return x + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "attended-mount",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       2\n",
+       "1    <NA>\n",
+       "2       4\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x.applymap(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "guided-slave",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       1\n",
+       "1    <NA>\n",
+       "2       3\n",
+       "dtype: Int64"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x_pd = x.to_pandas(nullable=True)\n",
+    "x_pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "modern-barrier",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       2\n",
+       "1    <NA>\n",
+       "2       4\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x_pd.apply(f)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "general-madagascar",
+   "metadata": {},
+   "source": [
+    "The null handling here is pretty simple - it's always a copy of the original bitmask. But that's not why we're here. Let's pop the hood. From https://github.com/rapidsai/cudf/blob/branch-0.19/python/cudf/cudf/core/column/numerical.py#L721-L726:\n",
+    "\n",
+    "```\n",
+    "def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase:\n",
+    "    if callable(op):\n",
+    "        return libcudf.transform.transform(operand, op)\n",
+    "\n",
+    "    op = libcudf.unary.UnaryOp[op.upper()]\n",
+    "    return libcudf.unary.unary_operation(operand, op)\n",
+    "```\n",
+    "\n",
+    "From here the `transform` cython picks up the callable python function as well as the `Column` to which it is to be applied. Here's some pseudocode for what happens inside it:\n",
+    "\n",
+    "```\n",
+    "def transform(Column input, op):\n",
+    "    signature = get_signature(input)\n",
+    "    compiled_op = cudautils.compile_udf(op, signature)\n",
+    "    c_str = compiled_op[0].encode('UTF-8')\n",
+    "\n",
+    "    c_output = move(\n",
+    "                 libcudf_transform(\n",
+    "                    input,\n",
+    "                    c_str\n",
+    "                 )\n",
+    "\n",
+    "    )\n",
+    "\n",
+    "    return Column.from_unique_ptr(move(c_output))\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "explicit-lebanon",
+   "metadata": {},
+   "source": [
+    "What we have so far then is:\n",
+    "1. Our input column\n",
+    "2. A PTX function compiled by Numba based off the python function, and the type of the argument being passed\n",
+    "\n",
+    "`cudautils.compile_udf` calls out to Numba to transform the pure python function into PTX code through LLVM IR and a series of compilation steps. The pipeline inside Numba goes something like:\n",
+    "\n",
+    "```\n",
+    "Python function -> python bytecode -> type inference -> lowering -> LLVM IR -> PTX code\n",
+    "```\n",
+    "\n",
+    "#### Python bytecode: These are instructions for the python interpreter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "fifteen-device",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  4           0 LOAD_FAST                0 (x)\n",
+      "              2 LOAD_CONST               1 (1)\n",
+      "              4 BINARY_ADD\n",
+      "              6 RETURN_VALUE\n"
+     ]
+    }
+   ],
+   "source": [
+    "import dis\n",
+    "dis.dis(f)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "latter-helping",
+   "metadata": {},
+   "source": [
+    "#### Type inference: Assembly level languages only operate in terms of primitive types. \n",
+    "\n",
+    "Thus to generate LLVM IR, Numba needs to know the types of every variable at every point during the function, from arguments to return values. This is one of the reasons `signature` is a required arg to `compile_udf`. We only get this information at runtime, because the user can pass anything into their UDF. When they apply their UDF to a `Series`, it's only at that point Numba can know that the `x` in `f(x)` is of type `int64` for instance - and only then can it actually complete the type inference portion of the process.\n",
+    "\n",
+    "#### Lowering\n",
+    "Once type inference is complete and Numba knows the types of all the input, output, and intermediate variables, it combines that with the algorithmic information from the python function's bytecode and produces LLVM IR in a process called \"lowering\". LLVM IR is like a platform-independent assembly language. One can compile from LLVM IR to assembly code for any platform, including PTX code for NVIDIA GPUs.\n",
+    "\n",
+    "\n",
+    "#### What does this mean for us?\n",
+    "It means that what we get out of `cudautils.compile_udf` is an actual string containing a PTX function, compiled by Numba for arguments of the type `input.dtype`. It is important to note that this function is a function that operates, like the original function, on a single element. It does NOT contain a kernel. In fact, here's exactly what it is:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "direct-venezuela",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cudf.utils.cudautils import compile_udf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "engaged-coaching",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "//\n",
+      "// Generated by NVIDIA NVVM Compiler\n",
+      "//\n",
+      "// Compiler Build ID: CL-27506705\n",
+      "// Cuda compilation tools, release 10.2, V10.2.89\n",
+      "// Based on LLVM 3.4svn\n",
+      "//\n",
+      "\n",
+      ".version 6.5\n",
+      ".target sm_70\n",
+      ".address_size 64\n",
+      "\n",
+      "\t// .globl\t_ZN8__main__5f$248Ex\n",
+      ".common .global .align 8 .u64 _ZN08NumbaEnv8__main__5f$248Ex;\n",
+      "\n",
+      ".visible .func  (.param .b32 func_retval0) _ZN8__main__5f$248Ex(\n",
+      "\t.param .b64 _ZN8__main__5f$248Ex_param_0,\n",
+      "\t.param .b64 _ZN8__main__5f$248Ex_param_1\n",
+      ")\n",
+      "{\n",
+      "\t.reg .b32 \t%r<2>;\n",
+      "\t.reg .b64 \t%rd<4>;\n",
+      "\n",
+      "\n",
+      "\tld.param.u64 \t%rd1, [_ZN8__main__5f$248Ex_param_0];\n",
+      "\tld.param.u64 \t%rd2, [_ZN8__main__5f$248Ex_param_1];\n",
+      "\tadd.s64 \t%rd3, %rd2, 1;\n",
+      "\tst.u64 \t[%rd1], %rd3;\n",
+      "\tmov.u32 \t%r1, 0;\n",
+      "\tst.param.b32\t[func_retval0+0], %r1;\n",
+      "\tret;\n",
+      "}\n",
+      "\n",
+      "\n",
+      "\u0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "from numba.np import numpy_support\n",
+    "numba_type = numpy_support.from_dtype(np.dtype('int64'))\n",
+    "ptx, _ = compile_udf(f, (numba_type,))\n",
+    "\n",
+    "print(ptx)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "inclusive-endorsement",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "// .globl main\n",
+    "\n",
+    ".visible .func  (.param .b32 return_value) main(\n",
+    "\t.param .b64 param_0,    # TWO input parameters\n",
+    "\t.param .b64 param_1      \n",
+    ")\n",
+    "{\n",
+    "\t.reg .b32 \t%r<2>;      # declare two 32-bit registers, named %r0 and %r1\n",
+    "\t.reg .b64 \t%rd<4>;     # declare 4 64-bit registers named %rd0, %rd1, %rd2, %rd3\n",
+    "\n",
+    "\n",
+    "\tld.param.u64 \t%rd1, [param_0];       # load param_0 into %rd1\n",
+    "\tld.param.u64 \t%rd2, [param_1];       # load param_1 into %rd2\n",
+    "\tadd.s64 \t%rd3, %rd2, 1;             # take the value of %rd2 (i.e. param_1), add 1, place in %rd3\n",
+    "\tst.u64 \t[%rd1], %rd3;                  # store the value of %rd3 into the location pointed to by %rd1\n",
+    "\tmov.u32 \t%r1, 0;                    # move 0 into %r1\n",
+    "\tst.param.b32\t[return_value+0], %r1; # place the value of %r1 into the return value\n",
+    "\tret;                                   # return 0\n",
+    "}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "imperial-quality",
+   "metadata": {},
+   "source": [
+    "# What happens next?\n",
+    "Libcudf takes it from here. Broadly speaking, what happens at this point is libcudf hacks together a string (which consists of several elements) which ends up being handed off to jitify and compiled into a final kernel. Jitify then launches that kernel, taking the pointer to the beginning of the actual data column to be transformed as an argument. The three elements are:\n",
+    "\n",
+    "1. A header\n",
+    "2. An outer \"calling\" kernel that generically calls the PTX function\n",
+    "3. A processed version of the PTX function that inlines it directly into CUDA\n",
+    "\n",
+    "The libcudf parser essentially takes the PTX function as above and turns it into a generically callable inlinable function. The final file that gets passed off to jitify to be compiled looks like this:\n",
+    "\n",
+    "```\n",
+    "\n",
+    "#pragma once\n",
+    "\n",
+    "// Include Jitify's cstddef header first\n",
+    "#include <cstddef>\n",
+    "\n",
+    "#include <cuda/std/climits>\n",
+    "#include <cuda/std/cstddef>\n",
+    "#include <cuda/std/limits>\n",
+    "#include <cudf/types.hpp>\n",
+    "#include <cudf/wrappers/timestamps.hpp>\n",
+    "#include <cudf/utilities/bit.hpp>\n",
+    "\n",
+    "template <typename TypeOut, typename TypeIn>\n",
+    "    __global__\n",
+    "    void kernel(cudf::size_type size,\n",
+    "                    TypeOut* out_data, TypeIn* in_data) {\n",
+    "        int tid = threadIdx.x;\n",
+    "        int blkid = blockIdx.x;\n",
+    "        int blksz = blockDim.x;\n",
+    "        int gridsz = gridDim.x;\n",
+    "\n",
+    "        int start = tid + blkid * blksz;\n",
+    "        int step = blksz * gridsz;\n",
+    "\n",
+    "        for (cudf::size_type i=start; i<size; i+=step) {\n",
+    "          GENERIC_UNARY_OP(&out_data[i], in_data[i]);  \n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "__device__ __inline__ void GENERIC_UNARY_OP (\n",
+    "  int64_t* _ZN8__main__5f_241Ex_param_0, \n",
+    "  long int _ZN8__main__5f_241Ex_param_1\n",
+    "){\n",
+    "\n",
+    " asm volatile (\"{\");  asm volatile (\"  .reg .b32 _r<2>;\");\n",
+    "   /**   .reg .b32 \t%r<2>  */\n",
+    "  asm volatile (\"  .reg .b64 _rd<4>;\");\n",
+    "   /**   .reg .b64 \t%rd<4>  */\n",
+    "  asm volatile (\"  mov.u64 _rd1,  %0;\": : \"l\"(_ZN8__main__5f_241Ex_param_0));\n",
+    "   /**   ld.param.u64 \t%rd1, [_ZN8__main__5f$241Ex_param_0]  */\n",
+    "  asm volatile (\"  mov.u64 _rd2,  %0;\": : \"l\"(_ZN8__main__5f_241Ex_param_1));\n",
+    "   /**   ld.param.u64 \t%rd2, [_ZN8__main__5f$241Ex_param_1]  */\n",
+    "  asm volatile (\"  add.s64 _rd3, _rd2, 1;\");\n",
+    "   /**   add.s64 \t%rd3, %rd2, 1  */\n",
+    "  asm volatile (\"  st.u64 [_rd1], _rd3;\");\n",
+    "   /**   st.u64 \t[%rd1], %rd3  */\n",
+    "  asm volatile (\"  mov.u32 _r1, 0;\");\n",
+    "   /**   mov.u32 \t%r1, 0  */\n",
+    "  asm volatile (\" /** *** SNIP. *** */\");\n",
+    "   /**   st.param.b32\t[func_retval0+0], %r1  */\n",
+    "  asm volatile (\"bra RETTGT;\");\n",
+    " asm volatile (\"RETTGT:}\");\n",
+    "\n",
+    "}\n",
+    "\n",
+    "```\n",
+    "\n",
+    "Apart from returning the data back to the user, that's more or less the process. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "musical-membership",
+   "metadata": {},
+   "source": [
+    "# What does all this have to do with NAs?\n",
+    "The pipeline is based on the idea that anything you could want to do with a single value of `x` arithmetically is expressible as a generic PTX function of `x`, along with some type information, that Numba can generate for you. The rest of the machinery is just meant to deliver the data to this function threadwise. We're going to extend this concept to a function of four variables instead of one: a masked binary operation `x + y` where the four arguments are:\n",
+    "\n",
+    "1. `x`\n",
+    "2. `y`\n",
+    "3. `x.mask`\n",
+    "4. `y.mask`\n",
+    "\n",
+    "\n",
+    "We're going to modify the general kernel that calls `GENERIC_UNARY_OP`, generalizing it to accept these four arguments and call a `GENERIC_BINARY_OP` instead (with two extra arguments - the mask bools).\n",
+    "\n",
+    "# Creating a Numba extension type\n",
+    "\n",
+    "Remember how Numba produces PTX code from a python function and some type information? We're going to create a new Type in Numba that is built around a Struct:\n",
+    "\n",
+    "```\n",
+    "struct Masked {\n",
+    "    int64_t value;\n",
+    "    bool valid;\n",
+    "}\n",
+    "```\n",
+    "And we're going to add an overload of `add` (`+`, `operator.add`) to Numba's registry of function signatures that correctly handles null semantics. Then we're going to JIT the incoming python function and use a `Masked` type for every argument. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "qualified-communications",
+   "metadata": {},
+   "source": [
+    "#### Tell Numba that a `MaskedType` exists, and not much else"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "boring-orleans",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba.core.extending import types\n",
+    "class MaskedType(types.Type):\n",
+    "    # A corresponding MaskedType for numba\n",
+    "    # numba can only generate LLVM IR for things\n",
+    "    # that it recognizes. This is the most basic\n",
+    "    # thing needed for numba to recognize the type,\n",
+    "    # all it really says is \"there's a type, \n",
+    "    # called MaskedType\". name is for __repr__\n",
+    "    def __init__(self):\n",
+    "        super().__init__(name=\"Masked\")\n",
+    "        \n",
+    "numba_masked = MaskedType()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "annual-regard",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba.core.extending import make_attribute_wrapper\n",
+    "\n",
+    "make_attribute_wrapper(MaskedType, \"value\", \"value\")\n",
+    "make_attribute_wrapper(MaskedType, \"valid\", \"valid\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "persistent-palace",
+   "metadata": {},
+   "source": [
+    "#### Tell Numba what this type looks like. In our case, it's a struct. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "opening-specific",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba.core.extending import register_model, models\n",
+    "\n",
+    "@register_model(MaskedType)\n",
+    "class MaskedModel(models.StructModel):\n",
+    "    def __init__(self, dmm, fe_type):\n",
+    "        members = [(\"value\", types.int64), (\"valid\", types.bool_)]\n",
+    "        models.StructModel.__init__(self, dmm, fe_type, members)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "posted-confirmation",
+   "metadata": {},
+   "source": [
+    "#### Register an overload of `operator.add` with Numba's registry of `CUDA` functions. \n",
+    "This is part of the typing phase. When we pass `f(x, y): return x + y` into Numba and say that `x` and `y` are of type `Masked`, it hits the `x + y` statement and goes looking for an overload of `add` with a signature matching those operands. It works by either finding a match at the end or not having one. This piece of code conditionally emits the signature it needs to find, when prompted with two arguments of type `Masked`. One can see how they might dynamically return different types depending on arguments. But this roughly says \"when Numba looks for an overload of `add` that takes two `Masked` as arguments, let it know that there is one, and it will return a `Masked`.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "subjective-marshall",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba.cuda.cudadecl import registry as cuda_registry\n",
+    "import operator\n",
+    "from numba.core.typing.templates import AbstractTemplate\n",
+    "\n",
+    "\n",
+    "@cuda_registry.register_global(operator.add)\n",
+    "class MaskedScalarAdd(AbstractTemplate):\n",
+    "    # abstracttemplate vs concretetemplate\n",
+    "    def generic(self, args, kws):\n",
+    "        if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):\n",
+    "            return signature(numba_masked, numba_masked, numba_masked)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "compliant-strengthening",
+   "metadata": {},
+   "source": [
+    "#### Implement Masked + Masked\n",
+    "So far, Numba knows:\n",
+    "- There's a `MaskedType`. \n",
+    "- There's an overload of `operator.add` that accepts two `MaskedType` and returns a `MaskedType`\n",
+    "\n",
+    "Now it essentially needs an implementation for that overload of `operator.add`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "treated-pastor",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba.cuda.cudaimpl import lower as cuda_lower\n",
+    "\n",
+    "@cuda_lower(operator.add, MaskedType, MaskedType)\n",
+    "def masked_scalar_add_impl(context, builder, sig, args):\n",
+    "    # get the types from the signature\n",
+    "    masked_type_1, masked_type_2 = sig.args\n",
+    "    masked_return_type = sig.return_type\n",
+    "\n",
+    "    # create LLVM IR structs\n",
+    "    m1 = cgutils.create_struct_proxy(masked_type_1)(\n",
+    "        context, builder, value=args[0]\n",
+    "    )\n",
+    "    m2 = cgutils.create_struct_proxy(masked_type_2)(\n",
+    "        context, builder, value=args[1]\n",
+    "    )\n",
+    "    result = cgutils.create_struct_proxy(masked_return_type)(context, builder)\n",
+    "\n",
+    "    valid = builder.and_(m1.valid, m2.valid)\n",
+    "    result.valid = valid\n",
+    "    with builder.if_then(valid):\n",
+    "        result.value = builder.add(m1.value, m2.value)\n",
+    "\n",
+    "    return result._getvalue()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "clear-assessment",
+   "metadata": {},
+   "source": [
+    "# Testing it Out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "formed-soccer",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import cuda\n",
+    "def compile_masked(func):\n",
+    "    signature = (numba_masked, numba_masked)\n",
+    "    ptx, _ = cuda.compile_ptx_for_current_device(func, signature, device=True)\n",
+    "    return ptx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "essential-shade",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def f(x, y):\n",
+    "    return x + y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "understanding-firmware",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#ptx = compile_masked(f)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "wound-communications",
+   "metadata": {},
+   "source": [
+    "```\n",
+    "        // .globl       _ZN8__main__6f$2411E6Masked6Masked                                                                                                                                                                                                                   \n",
+    ".common .global .align 8 .u64 _ZN08NumbaEnv8__main__6f$2411E6Masked6Masked;                                                                                                                                                                                                  \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    ".visible .func  (.param .b32 func_retval0) _ZN8__main__6f$2411E6Masked6Masked(                                                                                                                                                                                               \n",
+    "        .param .b64 _ZN8__main__6f$2411E6Masked6Masked_param_0,                                                                                                                                                                                                              \n",
+    "        .param .b64 _ZN8__main__6f$2411E6Masked6Masked_param_1,                                                                                                                                                                                                              \n",
+    "        .param .b32 _ZN8__main__6f$2411E6Masked6Masked_param_2,                                                                                                                                                                                                              \n",
+    "        .param .b64 _ZN8__main__6f$2411E6Masked6Masked_param_3,                                                                                                                                                                                                              \n",
+    "        .param .b32 _ZN8__main__6f$2411E6Masked6Masked_param_4                                                                                                                                                                                                               \n",
+    ")                                                                                                                                                                                                                                                                            \n",
+    "{                                                                                                                                                                                                                                                                            \n",
+    "        .reg .pred      %p<4>;                                                                                                                                                                                                                                               \n",
+    "        .reg .b16       %rs<4>;                                                                                                                                                                                                                                              \n",
+    "        .reg .b32       %r<2>;                                                                                                                                                                                                                                               \n",
+    "        .reg .b64       %rd<6>;                                                                                                                                                                                                                                              \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    "        ld.param.u64    %rd1, [_ZN8__main__6f$2411E6Masked6Masked_param_0];                                                                                                                                                                                                  \n",
+    "        ld.param.u64    %rd2, [_ZN8__main__6f$2411E6Masked6Masked_param_1];                                                                                                                                                                                                  \n",
+    "        ld.param.u64    %rd3, [_ZN8__main__6f$2411E6Masked6Masked_param_3];                                                                                                                                                                                                  \n",
+    "        ld.param.u8     %rs1, [_ZN8__main__6f$2411E6Masked6Masked_param_2];                                                                                                                                                                                                  \n",
+    "        setp.ne.s16     %p1, %rs1, 0;                                                                                                                                                                                                                                        \n",
+    "        ld.param.u8     %rs2, [_ZN8__main__6f$2411E6Masked6Masked_param_4];                                                                                                                                                                                                  \n",
+    "        setp.ne.s16     %p2, %rs2, 0;                                                                                                                                                                                                                                        \n",
+    "        and.pred        %p3, %p1, %p2;                                                                                                                                                                                                                                       \n",
+    "        add.s64         %rd4, %rd3, %rd2;                                                                                                                                                                                                                                    \n",
+    "        selp.b64        %rd5, %rd4, 0, %p3;                                                                                                                                                                                                                                  \n",
+    "        selp.u16        %rs3, 1, 0, %p3;                                                                                                                                                                                                                                     \n",
+    "        st.u64  [%rd1], %rd5;                                                                                                                                                                                                                                                \n",
+    "        st.u8   [%rd1+8], %rs3;                                                                                                                                                                                                                                              \n",
+    "        mov.u32         %r1, 0;                                                                                                                                                                                                                                              \n",
+    "        st.param.b32    [func_retval0+0], %r1;                                                                                            \n",
+    "        ret;                                                                                                                                                                                                                                                                 \n",
+    "}        \n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "atmospheric-database",
+   "metadata": {},
+   "source": [
+    "#### Then, this is the whole file being passed to jitify:\n",
+    "\n",
+    "```cuda\n",
+    "\n",
+    "    #pragma once\n",
+    "\n",
+    "    // Include Jitify's cstddef header first\n",
+    "    #include <cstddef>\n",
+    "\n",
+    "    #include <cuda/std/climits>\n",
+    "    #include <cuda/std/cstddef>\n",
+    "    #include <cuda/std/limits>\n",
+    "    #include <cudf/types.hpp>\n",
+    "    #include <cudf/wrappers/timestamps.hpp>\n",
+    "    #include <cudf/utilities/bit.hpp>\n",
+    "\n",
+    "    struct Masked {\n",
+    "      int64_t value;\n",
+    "      bool valid;\n",
+    "    };\n",
+    "    \n",
+    "   \n",
+    "\n",
+    "    void null_kernel(cudf::size_type size,\n",
+    "                     TypeOut* out_data, \n",
+    "                     TypeLhs* lhs_data,\n",
+    "                     TypeRhs* rhs_data,\n",
+    "                     bool* out_mask,\n",
+    "                     cudf::bitmask_type const* lhs_mask,\n",
+    "                     cudf::size_type lhs_offset,\n",
+    "                     cudf::bitmask_type const* rhs_mask,\n",
+    "                     cudf::size_type rhs_offset\n",
+    "    ) {\n",
+    "        int tid = threadIdx.x;\n",
+    "        int blkid = blockIdx.x;\n",
+    "        int blksz = blockDim.x;\n",
+    "        int gridsz = gridDim.x;\n",
+    "\n",
+    "        int start = tid + blkid * blksz;\n",
+    "        int step = blksz * gridsz;\n",
+    "\n",
+    "        Masked output;\n",
+    "\n",
+    "        char l_valid;\n",
+    "        char r_valid;\n",
+    "\n",
+    "        long int l_data;\n",
+    "        long int r_data;\n",
+    "\n",
+    "        for (cudf::size_type i=start; i<size; i+=step) {\n",
+    "          l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true;\n",
+    "          r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true;\n",
+    "          l_data = lhs_data[i];\n",
+    "          r_data = rhs_data[i];\n",
+    "\n",
+    "          GENERIC_BINARY_OP(&output.value, lhs_data[i], l_valid, rhs_data[i], r_valid);\n",
+    "\n",
+    "          out_data[i] = output.value;\n",
+    "          out_mask[i] = output.valid;\n",
+    "             \n",
+    "      }   \n",
+    "      \n",
+    "__device__ __inline__ void GENERIC_BINARY_OP(                                                                                                                                                                                                                                \n",
+    "  int64_t* _ZN8__main__6f_2413E6Masked6Masked_param_0,                                                                                                                                                                                                                       \n",
+    "  long int _ZN8__main__6f_2413E6Masked6Masked_param_1,                                                                                                                                                                                                                       \n",
+    "  char _ZN8__main__6f_2413E6Masked6Masked_param_2,                                                                                                                                                                                                                           \n",
+    "  long int _ZN8__main__6f_2413E6Masked6Masked_param_3,                                                                                                                                                                                                                       \n",
+    "  char _ZN8__main__6f_2413E6Masked6Masked_param_4                                                                                                                                                                                                                            \n",
+    "){                                                                                                                                                                                                                                                                           \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    " asm volatile (\"{\");  asm volatile (\"  .reg .pred _p<4>;\");                                                                                                                                                                                                                  \n",
+    "   /**   .reg .pred     %p<4>  */                                                                                                                                                                                                                                            \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    "  asm volatile (\"  .reg .b16 _rs<4>;\");                                                                                                                                                                                                                                      \n",
+    "   /**   .reg .b16      %rs<4>  */                                                                                                                                                                                                                                           \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    "  asm volatile (\"  .reg .b32 _r<2>;\");                                                                                                                                                                                                                                       \n",
+    "   /**   .reg .b32      %r<2>  */                                                                                                                                                                                                                                            \n",
+    "                                                                                                                                                                                                                                                                             \n",
+    "  asm volatile (\"  .reg .b64 _rd<6>;\");                                                                                                                                                                                                                                      \n",
+    "   /**   .reg .b64      %rd<6>  */                                                                                                                                                                                                                                           \n",
+    "\n",
+    "  asm volatile (\"  mov.u64 _rd1,  %0;\": : \"l\"(_ZN8__main__6f_2413E6Masked6Masked_param_0));\n",
+    "   /**   ld.param.u64   %rd1, [_ZN8__main__6f$2413E6Masked6Masked_param_0]  */\n",
+    "\n",
+    "  asm volatile (\"  mov.u64 _rd2,  %0;\": : \"l\"(_ZN8__main__6f_2413E6Masked6Masked_param_1));\n",
+    "   /**   ld.param.u64   %rd2, [_ZN8__main__6f$2413E6Masked6Masked_param_1]  */\n",
+    "\n",
+    "  asm volatile (\"  mov.u64 _rd3,  %0;\": : \"l\"(_ZN8__main__6f_2413E6Masked6Masked_param_3));\n",
+    "   /**   ld.param.u64   %rd3, [_ZN8__main__6f$2413E6Masked6Masked_param_3]  */\n",
+    "\n",
+    "  asm volatile (\"  cvt.u8.u8 _rs1,  %0;\": : \"h\"( static_cast<short>(_ZN8__main__6f_2413E6Masked6Masked_param_2)));\n",
+    "   /**   ld.param.u8    %rs1, [_ZN8__main__6f$2413E6Masked6Masked_param_2]  */\n",
+    "\n",
+    "  asm volatile (\"  setp.ne.s16 _p1, _rs1, 0;\");\n",
+    "   /**   setp.ne.s16    %p1, %rs1, 0  */\n",
+    "\n",
+    "  asm volatile (\"  cvt.u8.u8 _rs2,  %0;\": : \"h\"( static_cast<short>(_ZN8__main__6f_2413E6Masked6Masked_param_4)));\n",
+    "   /**   ld.param.u8    %rs2, [_ZN8__main__6f$2413E6Masked6Masked_param_4]  */\n",
+    "\n",
+    "  asm volatile (\"  setp.ne.s16 _p2, _rs2, 0;\");\n",
+    "   /**   setp.ne.s16    %p2, %rs2, 0  */\n",
+    "\n",
+    "  asm volatile (\"  and.pred _p3, _p1, _p2;\");\n",
+    "   /**   and.pred       %p3, %p1, %p2  */\n",
+    "\n",
+    "  asm volatile (\"  add.s64 _rd4, _rd3, _rd2;\");\n",
+    "   /**   add.s64        %rd4, %rd3, %rd2  */\n",
+    "\n",
+    "  asm volatile (\"  selp.b64 _rd5, _rd4, 0, _p3;\");\n",
+    "   /**   selp.b64       %rd5, %rd4, 0, %p3  */\n",
+    "\n",
+    "  asm volatile (\"  selp.u16 _rs3, 1, 0, _p3;\");\n",
+    "   /**   selp.u16       %rs3, 1, 0, %p3  */\n",
+    "\n",
+    "  asm volatile (\"  st.u64 [_rd1], _rd5;\");\n",
+    "   /**   st.u64         [%rd1], %rd5  */\n",
+    "\n",
+    "  asm volatile (\"  st.u8 [_rd1+8], _rs3;\");\n",
+    "   /**   st.u8  [%rd1+8], %rs3  */\n",
+    "\n",
+    "  asm volatile (\"  mov.u32 _r1, 0;\");\n",
+    "   /**   mov.u32        %r1, 0  */\n",
+    "\n",
+    "  asm volatile (\" /** *** The way we parse the CUDA PTX assumes the function returns the return value through the first function parameter. Thus the `st.param.***` instructions are not processed. *** */\");\n",
+    "   /**   st.param.b32   [func_retval0+0], %r1  */\n",
+    "\n",
+    "  asm volatile (\"bra RETTGT;\");\n",
+    "\n",
+    "\n",
+    " asm volatile (\"RETTGT:}\");}      \n",
+    "          \n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "concrete-pillow",
+   "metadata": {},
+   "source": [
+    "# Test it\n",
+    "Here are some very basic Cython bindings, used only for the purposes of testing this exact functionality:\n",
+    "```\n",
+    "def masked_binary_op(Column A, Column B, op, Column output_column, Column output_mask):\n",
+    "    cdef column_view A_view = A.view()\n",
+    "    cdef column_view B_view = B.view()\n",
+    "\n",
+    "    cdef string c_str\n",
+    "    cdef type_id c_tid\n",
+    "    cdef data_type c_dtype\n",
+    "\n",
+    "    if A.dtype != np.dtype('int64') or B.dtype != np.dtype('int64'):\n",
+    "        raise TypeError('int64 please')\n",
+    "    \n",
+    " \n",
+    "    from cudf.core.udf import compile_udf\n",
+    "    c_str = compile_udf(op).encode('UTF-8')\n",
+    "\n",
+    "    c_tid = <type_id> (\n",
+    "        <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]\n",
+    "    )\n",
+    "    c_dtype = data_type(c_tid)\n",
+    "\n",
+    "    cdef column_view outcol_view = output_column.view()\n",
+    "    cdef column_view outmsk_view = output_mask.view()\n",
+    "\n",
+    "    with nogil:\n",
+    "        c_output = move(libcudf_transform.masked_binary_op(\n",
+    "            A_view,\n",
+    "            B_view,\n",
+    "            c_str,\n",
+    "            c_dtype,\n",
+    "            outcol_view,\n",
+    "            outmsk_view\n",
+    "        ))\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "after-fellow",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cudf._lib.transform import masked_binary_op\n",
+    "\n",
+    "def demo_udf(func, s1, s2):\n",
+    "    col1, col2 = s1._column, s2._column\n",
+    "\n",
+    "    output_column = cudf.core.column.as_column(np.arange(8), dtype='int64')\n",
+    "    output_mask = cudf.core.column.as_column([False] * 8)\n",
+    "\n",
+    "    result_col = masked_binary_op(col1, col2, func, output_column, output_mask)\n",
+    "    return cudf.Series(result_col)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "phantom-square",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       2\n",
+       "1    <NA>\n",
+       "2    <NA>\n",
+       "3    <NA>\n",
+       "4       6\n",
+       "5    <NA>\n",
+       "6      10\n",
+       "7    <NA>\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "def f(x, y):\n",
+    "    return x + y\n",
+    "\n",
+    "s1 = cudf.Series([1, None, 3,    None, 2, 2,    5, None])\n",
+    "s2 = cudf.Series([1, 2,    None, None, 4, None, 5, None])\n",
+    "\n",
+    "demo_udf(f, s1, s2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "southern-stationery",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       2\n",
+       "1    <NA>\n",
+       "2    <NA>\n",
+       "3    <NA>\n",
+       "4       6\n",
+       "5    <NA>\n",
+       "6      10\n",
+       "7    <NA>\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "s1 + s2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "recovered-premiere",
+   "metadata": {},
+   "source": [
+    "# `cudf.NA`\n",
+    "In general, we want the capability to work with `cudf.NA` inside our functions directly. To do this, we're just going to reapply the same machinery to overload what happens when we add a `MaskedType` to `cudf.NA`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "filled-malta",
+   "metadata": {},
+   "source": [
+    "#### Create an NAType"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "chemical-trick",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cudf.core.scalar import _NAType\n",
+    "class NAType(types.Type):\n",
+    "    # \"There is a type called NAType\"\n",
+    "    def __init__(self):\n",
+    "        super().__init__(name=\"NA\")\n",
+    "\n",
+    "numba_na = NAType()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "southern-prague",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "numba.core.datamodel.models.OpaqueModel"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from numba.core.extending import typeof_impl\n",
+    "@typeof_impl.register(_NAType)\n",
+    "def typeof_na(val, c):\n",
+    "    # instances of _NAType will be \n",
+    "    # treated as instances of NAType. \n",
+    "    return numba_na\n",
+    "\n",
+    "\n",
+    "\n",
+    "register_model(NAType)(models.OpaqueModel)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "accessible-monthly",
+   "metadata": {},
+   "source": [
+    "#### `operator.add` typing for Masked <-> NA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "harmful-glossary",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda_registry.register_global(operator.add)\n",
+    "class MaskedScalarAddNull(AbstractTemplate):\n",
+    "    def generic(self, args, kws):\n",
+    "        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):\n",
+    "            return signature(numba_masked, numba_masked, numba_na)\n",
+    "  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "brown-cheese",
+   "metadata": {},
+   "source": [
+    "#### Lowering: AKA what to actually do when this is requested\n",
+    "This says that when `+` is invoked between a `MaskedType` and an `NAType`, we make a new `MaskedType`, set its validity to zero, and return it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "encouraging-reynolds",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba.cuda.cudaimpl import registry as cuda_lowering_registry\n",
+    "\n",
+    "@cuda_lower(operator.add, MaskedType, NAType)\n",
+    "def masked_scalar_add_na_impl(context, builder, sig, args):\n",
+    "#    return_type = sig.return_type\n",
+    "    # use context to get llvm type for a bool\n",
+    "    result = cgutils.create_struct_proxy(numba_masked)(context, builder)\n",
+    "    result.valid = context.get_constant(types.boolean, 0)\n",
+    "    return result._getvalue()\n",
+    "\n",
+    "\n",
+    "@cuda_lowering_registry.lower_constant(NAType)\n",
+    "def constant_dummy(context, builder, ty, pyval):\n",
+    "    # This handles None, etc.\n",
+    "    return context.get_dummy_value()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "productive-rings",
+   "metadata": {},
+   "source": [
+    "# Constants\n",
+    "At this point the pattern is familiar. Register an overload that emits a signature if the operands match a `MaskedType` and a constant. The lowering is logically fairly simple. If the `MaskedType` is null, the answer is null, else the answer is a new `MaskedType` whose `value` is the sum of the input's `value` and the constant."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "short-makeup",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llvmlite import ir\n",
+    "\n",
+    "@cuda_registry.register_global(operator.add)\n",
+    "class MaskedScalarAddConstant(AbstractTemplate):\n",
+    "    def generic(self, args, kws):\n",
+    "        if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):\n",
+    "            return signature(numba_masked, numba_masked, types.int64)\n",
+    "\n",
+    "@cuda_lower(operator.add, MaskedType, types.Integer)\n",
+    "def masked_scalar_add_constant_impl(context, builder, sig, input_values):\n",
+    "    masked_type, const_type = sig.args\n",
+    "\n",
+    "    indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=input_values[0])\n",
+    "    result = cgutils.create_struct_proxy(numba_masked)(context, builder)\n",
+    "    #to_add_const = context.get_constant(const_type, input_values[1])\n",
+    "\n",
+    "    result.valid = context.get_constant(types.boolean, 0)\n",
+    "    with builder.if_then(indata.valid):\n",
+    "        result.value = builder.add(indata.value, input_values[1])\n",
+    "        result.valid = context.get_constant(types.boolean, 1)\n",
+    "\n",
+    "    return result._getvalue()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "entitled-wealth",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "def f(x, y):\n",
+    "    return x + y + cudf.NA\n",
+    "\n",
+    "s1 = cudf.Series([1, None, 3,    None, 2, 2,    5, None])\n",
+    "s2 = cudf.Series([1, 2,    None, None, 4, None, 5, None])\n",
+    "\n",
+    "result = demo_udf(f, s1, s2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "genuine-davis",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    <NA>\n",
+       "1    <NA>\n",
+       "2    <NA>\n",
+       "3    <NA>\n",
+       "4    <NA>\n",
+       "5    <NA>\n",
+       "6    <NA>\n",
+       "7    <NA>\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "polyphonic-second",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def f(x, y):\n",
+    "    return x + y + 1\n",
+    "\n",
+    "s1 = cudf.Series([1, None, 3,    None, 2, 2,    5, None])\n",
+    "s2 = cudf.Series([1, 2,    None, None, 4, None, 5, None])\n",
+    "\n",
+    "result = demo_udf(f, s1, s2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "sporting-campbell",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       3\n",
+       "1    <NA>\n",
+       "2    <NA>\n",
+       "3    <NA>\n",
+       "4       7\n",
+       "5    <NA>\n",
+       "6      11\n",
+       "7    <NA>\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "focal-castle",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "romance-complaint",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "speaking-aquatic",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From cb85d88b7f581a134fd34e51f8c275c804b3029c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 12 Apr 2021 14:15:07 -0700
Subject: [PATCH 019/132] integrate jitify2

---
 .../Modules/JitifyPreprocessKernels.cmake     |  1 +
 cpp/src/transform/jit/binop_kernel.cu         | 82 +++++++++++++++++++
 cpp/src/transform/transform.cpp               | 42 +++++++++-
 3 files changed, 121 insertions(+), 4 deletions(-)
 create mode 100644 cpp/src/transform/jit/binop_kernel.cu

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index eb1ade61440..fceed965c9e 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -56,6 +56,7 @@ endfunction()
 
 jit_preprocess_files(SOURCE_DIRECTORY      ${CUDF_SOURCE_DIR}/src
                      FILES                 binaryop/jit/kernel.cu
+                                           transform/jit/binop_kernel.cu
                                            transform/jit/kernel.cu
                                            rolling/jit/kernel.cu
                      )
diff --git a/cpp/src/transform/jit/binop_kernel.cu b/cpp/src/transform/jit/binop_kernel.cu
new file mode 100644
index 00000000000..75c1e383d96
--- /dev/null
+++ b/cpp/src/transform/jit/binop_kernel.cu
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Include Jitify's cstddef header first
+#include <cstddef>
+
+#include <cuda/std/climits>
+#include <cuda/std/cstddef>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+#include <cudf/wrappers/durations.hpp>
+#include <cudf/wrappers/timestamps.hpp>
+
+#include <transform/jit/operation-udf.hpp>
+
+#include <cudf/types.hpp>
+#include <cudf/wrappers/timestamps.hpp>
+#include <cudf/utilities/bit.hpp>
+
+
+namespace cudf {
+namespace transformation {
+namespace jit {
+
+struct Masked {
+  int64_t value;
+  bool valid;
+};
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+__global__
+void binop_kernel(cudf::size_type size,
+                  TypeOut* out_data, 
+                  TypeLhs* lhs_data,
+                  TypeRhs* rhs_data,
+                  bool* out_mask,
+                  cudf::bitmask_type const* lhs_mask,
+                  cudf::size_type lhs_offset,
+                  cudf::bitmask_type const* rhs_mask,
+                  cudf::size_type rhs_offset
+) {
+    int tid = threadIdx.x;
+    int blkid = blockIdx.x;
+    int blksz = blockDim.x;
+    int gridsz = gridDim.x;
+    int start = tid + blkid * blksz;
+    int step = blksz * gridsz;
+
+    Masked output;
+    char l_valid;
+    char r_valid;
+    long int l_data;
+    long int r_data;
+
+    for (cudf::size_type i=start; i<size; i+=step) {
+      l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true;
+      r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true;
+      l_data = lhs_data[i];
+      r_data = rhs_data[i];
+      GENERIC_BINARY_OP(&output.value, lhs_data[i], l_valid, rhs_data[i], r_valid);
+      out_data[i] = output.value;
+      out_mask[i] = output.valid;
+    }
+}
+
+}  // namespace jit
+}  // namespace transformation
+}  // namespace cudf
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 8390137f817..ef26fe4ec3e 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
+#include <jit_preprocessed_files/transform/jit/binop_kernel.cu.jit.hpp>
 
 #include <jit/cache.hpp>
 #include <jit/parser.hpp>
@@ -28,10 +29,6 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <jit/timestamps.hpp.jit>
-#include <jit/types.hpp.jit>
-#include <jit/bit.hpp.jit>
-
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
@@ -67,7 +64,41 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
+void binary_operation(column_view const& A, 
+                      column_view const& B, 
+                      std::string const& binary_udf, 
+                      data_type output_type, 
+                      column_view const& outcol_view,
+                      column_view const& outmsk_view,
+                      rmm::mr::device_memory_resource* mr)
+{
+  std::string kernel_name =
+  jitify2::reflection::Template("cudf::transformation::jit::binop_kernel")  //
+    .instantiate(cudf::jit::get_type_name(outcol_view.type()),  // list of template arguments
+                 cudf::jit::get_type_name(A.type()),
+                 cudf::jit::get_type_name(B.type()));
+
+  std::string cuda_source = cudf::jit::parse_single_function_ptx(
+                     binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
 
+  rmm::cuda_stream_view stream;
+
+  cudf::jit::get_program_cache(*transform_jit_binop_kernel_cu_jit)
+    .get_kernel(
+      kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                   //
+    ->launch(outcol_view.size(),
+            cudf::jit::get_data_ptr(outcol_view),
+            cudf::jit::get_data_ptr(A),
+            cudf::jit::get_data_ptr(B),
+            cudf::jit::get_data_ptr(outmsk_view),
+            A.null_mask(),
+            A.offset(),
+            B.null_mask(),
+            B.offset()
+    );
+}
+/*
 void binary_operation(column_view const& A, 
                       column_view const& B, 
                       std::string const& binary_udf, 
@@ -77,6 +108,8 @@ void binary_operation(column_view const& A,
                       rmm::mr::device_memory_resource* mr)
 {
 
+  std::string kernel_name
+
   std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(binary_udf));
 
   std::cout << binary_udf << std::endl;
@@ -118,6 +151,7 @@ void binary_operation(column_view const& A,
     );
 
 }
+*/
 
 }  // namespace jit
 }  // namespace transformation

From ad067eb7d946014fafdaed55020bb6f20be42792 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 12 Apr 2021 14:16:02 -0700
Subject: [PATCH 020/132] minor cleanup

---
 python/cudf/cudf/core/udf.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 7bb59b6ee6f..564b2cd2cf9 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -66,12 +66,6 @@ def __init__(self, dmm, fe_type):
         members = [("value", types.int64), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
-#@register_model(NAType) # check what model NoneType uses
-#class NAModel(models.OpaqueModel):
-#    def __init__(self, dmm, fe_type):
-#        members = []
-#        models.StructModel.__init__(self, dmm, fe_type, members)
-
 register_model(NAType)(models.OpaqueModel)
 
 @lower_builtin(Masked, types.int64, types.bool_)

From 237af25dd7784c21827825d826ab454c9bcd656f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 22 Apr 2021 10:57:17 -0700
Subject: [PATCH 021/132] pushing forward with ND transform

---
 .../Modules/JitifyPreprocessKernels.cmake     |   1 +
 cpp/include/cudf/transform.hpp                |   7 ++
 .../transform/jit/baked_udf_requirements.cu   |   2 +
 cpp/src/transform/transform.cpp               | 112 +++++++++--------
 python/cudf/cudf/_lib/cpp/transform.pxd       |   8 ++
 python/cudf/cudf/_lib/transform.pyx           |  32 +++++
 python/cudf/cudf/core/udf.py                  | 114 +++++++++++++++---
 7 files changed, 211 insertions(+), 65 deletions(-)
 create mode 100644 cpp/src/transform/jit/baked_udf_requirements.cu

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index fceed965c9e..c481301e335 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -59,6 +59,7 @@ jit_preprocess_files(SOURCE_DIRECTORY      ${CUDF_SOURCE_DIR}/src
                                            transform/jit/binop_kernel.cu
                                            transform/jit/kernel.cu
                                            rolling/jit/kernel.cu
+                                           transform/jit/baked_udf_requirements.cu
                      )
 
 add_custom_target(jitify_preprocess_run DEPENDS ${JIT_PREPROCESSED_FILES})
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 5ba5ac26d86..8ed9b72c97d 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -63,6 +63,13 @@ std::unique_ptr<column> masked_binary_op(
   column_view const& outmsk_view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+std::unique_ptr<column> generalized_masked_op(
+  table_view const& data_view,
+  std::string const& udf,
+  data_type output_type,
+  column_view const& outcol_view,
+  column_view const& outmsk_view,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
diff --git a/cpp/src/transform/jit/baked_udf_requirements.cu b/cpp/src/transform/jit/baked_udf_requirements.cu
new file mode 100644
index 00000000000..d31f1d9f899
--- /dev/null
+++ b/cpp/src/transform/jit/baked_udf_requirements.cu
@@ -0,0 +1,2 @@
+#include <cudf/types.hpp>
+#include <transform/jit/operation-udf.hpp>
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index ef26fe4ec3e..f1ea7956c0d 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -16,6 +16,8 @@
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
 #include <jit_preprocessed_files/transform/jit/binop_kernel.cu.jit.hpp>
+#include <jit_preprocessed_files/transform/jit/baked_udf_requirements.cu.jit.hpp>
+
 
 #include <jit/cache.hpp>
 #include <jit/parser.hpp>
@@ -28,6 +30,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
+#include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -98,60 +101,43 @@ void binary_operation(column_view const& A,
             B.offset()
     );
 }
-/*
-void binary_operation(column_view const& A, 
-                      column_view const& B, 
-                      std::string const& binary_udf, 
-                      data_type output_type, 
-                      column_view const& outcol_view,
-                      column_view const& outmsk_view,
-                      rmm::mr::device_memory_resource* mr)
-{
-
-  std::string kernel_name
 
-  std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(binary_udf));
-
-  std::cout << binary_udf << std::endl;
-
-  std::string cuda_source = code::kernel_header;
-  cuda_source += cudf::jit::parse_single_function_ptx(
-                     binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
+void generalized_operation(table_view const& data_view,
+                           std::string const& udf,
+                           data_type output_type,
+                           column_view const& outcol_view,
+                           column_view const& outmsk_view,
+                           rmm::mr::device_memory_resource* mr)
+{
+  rmm::cuda_stream_view stream;
+  //std::string cuda_source = cudf::jit::parse_single_function_ptx(
+  //                   udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
+  /*
+  size_t num_cols = data_view.num_columns();
+  std::vector<std::string> input_types(num_cols);
+  std::vector<void*> args(num_cols);
 
-  cuda_source += code::null_kernel;
 
-  std::cout << cuda_source << std::endl;
+  column_view this_view;
+  for (size_t i = 0; i < num_cols; i++) {
+    this_view = data_view.column(i);
+    input_types[i] = cudf::jit::get_type_name(this_view.type());
+  }
+  */
 
-  rmm::cuda_stream_view stream;
+  std::string kernel_name =
+    jitify2::reflection::Template("genop_kernel")  //
+      .instantiate(cudf::jit::get_type_name(outcol_view.type()));
 
-  // Launch the jitify kernel
-
-  cudf::jit::launcher(hash,
-                      cuda_source,
-                      header_names,
-                      cudf::jit::compiler_flags,
-                      headers_code,
-                      stream)
-    .set_kernel_inst("masked_binary_op_kernel",
-                    {
-                      cudf::jit::get_type_name(outcol_view.type()), 
-                      cudf::jit::get_type_name(A.type()),
-                      cudf::jit::get_type_name(B.type()),
-                    }
-    )
-    .launch(outcol_view.size(),
-            cudf::jit::get_data_ptr(outcol_view),
-            cudf::jit::get_data_ptr(A),
-            cudf::jit::get_data_ptr(B),
-            cudf::jit::get_data_ptr(outmsk_view),
-            A.null_mask(),
-            A.offset(),
-            B.null_mask(),
-            B.offset()
-    );
+  cudf::jit::get_program_cache(*transform_jit_baked_udf_requirements_cu_jit)
+    .get_kernel(
+      kernel_name, {}, {{"transform/jit/operation-udf.hpp", udf}}, {"-arch=sm_."})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                   //
+    ->launch(outcol_view.size(),
+             static_cast<cudf::size_type>(7),                                                                 //
+             cudf::jit::get_data_ptr(outcol_view));
 
 }
-*/
 
 }  // namespace jit
 }  // namespace transformation
@@ -197,6 +183,26 @@ std::unique_ptr<column> masked_binary_op_inner(column_view const& A,
   return output;
 }
 
+std::unique_ptr<column> generalized_masked_op_inner(
+  table_view const& data_view,
+  std::string const& udf,
+  data_type output_type,
+  column_view const& outcol_view,
+  column_view const& outmsk_view,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
+
+  transformation::jit::generalized_operation(data_view, udf, output_type, outcol_view, outmsk_view, mr);
+
+  std::unique_ptr<column> output;
+
+  return output;
+
+}
+
+
+
 }  // namespace detail
 
 std::unique_ptr<column> transform(column_view const& input,
@@ -217,9 +223,19 @@ std::unique_ptr<column> masked_binary_op(column_view const& A,
                                          column_view const& outmsk_view,
                                          rmm::mr::device_memory_resource* mr)
 {
-  std::cout << "HERE!!" << std::endl;
   return detail::masked_binary_op_inner(A, B, binary_udf, output_type, outcol_view, outmsk_view, mr);
 }
 
+std::unique_ptr<column> generalized_masked_op(
+  table_view const& data_view,
+  std::string const& udf,
+  data_type output_type,
+  column_view const& outcol_view,
+  column_view const& outmsk_view,
+  rmm::mr::device_memory_resource* mr)
+{
+  return detail::generalized_masked_op_inner(data_view, udf, output_type, outcol_view, outmsk_view, mr);
+}
+
 
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index 217b3374579..00d0306dbca 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -47,6 +47,14 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         column_view outmask_view,
     ) except +
 
+    cdef unique_ptr[column] generalized_masked_op(
+        table_view data_view,
+        string udf,
+        data_type output_type,
+        column_view outcol_view,
+        column_view outmask_view
+    ) except +
+
     cdef pair[unique_ptr[table], unique_ptr[column]] encode(
         table_view input
     ) except +
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 235dc3e12bd..1b90a5d3a3f 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -127,6 +127,38 @@ def transform(Column input, op):
 
     return Column.from_unique_ptr(move(c_output))
 
+def generalized_masked_op(Table data, op, Column output_column, Column output_mask):
+    cdef string c_str
+    cdef type_id c_tid
+    cdef data_type c_dtype
+
+    cdef table_view data_view = data.view()
+
+    from cudf.core.udf import compile_udf, demo_kernel
+    #c_str = compile_udf(op).encode('UTF-8')
+    c_str = demo_kernel.encode('UTF-8')
+    c_tid = <type_id> (
+        <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]
+    )
+    c_dtype = data_type(c_tid)
+
+    cdef column_view outcol_view = output_column.view()
+    cdef column_view outmsk_view = output_mask.view()
+
+    with nogil:
+        c_output = move(libcudf_transform.generalized_masked_op(
+            data_view,
+            c_str,
+            c_dtype,
+            outcol_view,
+            outmsk_view
+        ))
+
+    output_mask_real = bools_to_mask(output_mask)
+    output_column = output_column.set_mask(output_mask_real)
+
+    return output_column
+
 def masked_binary_op(Column A, Column B, op, Column output_column, Column output_mask):
     cdef column_view A_view = A.view()
     cdef column_view B_view = B.view()
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 564b2cd2cf9..1931ce7e0d0 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -10,7 +10,8 @@
     type_callable,
     typeof_impl,
 )
-from numba.core.typing import signature
+from numba.core.typing import signature as nb_signature
+from inspect import signature as py_signature
 from numba.core.typing.templates import AbstractTemplate
 from numba.cuda.cudadecl import registry as cuda_registry
 from numba.cuda.cudaimpl import lower as cuda_lower, registry as cuda_lowering_registry
@@ -84,14 +85,14 @@ class MaskedScalarAdd(AbstractTemplate):
     # abstracttemplate vs concretetemplate
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
-            return signature(numba_masked, numba_masked, numba_masked)
+            return nb_signature(numba_masked, numba_masked, numba_masked)
 
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            return signature(numba_masked, numba_masked, numba_na)
+            return nb_signature(numba_masked, numba_masked, numba_na)
 
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
@@ -140,23 +141,102 @@ def masked_add_py_2(m1, m2):
     return m1 + m2
 
 def compile_udf(func):
+    n_params = len(py_signature(func).parameters)
+    to_compiler_sig = tuple(numba_masked for arg in range(n_params))
 
-    signature = (numba_masked, numba_masked)
-    ptx, _ = cuda.compile_ptx_for_current_device(func, signature, device=True)
-    return ptx
+    # Get the inlineable PTX function
+    ptx, _ = cuda.compile_ptx_for_current_device(func, to_compiler_sig, device=True)
+
+    # get the kernel that calls the inlineable function
+    kernel = make_kernel(n_params)
+    return kernel, ptx
 
 NA = _NAType()
 
-@cuda.jit(numba_masked(numba_masked), device=True)
-def test_scalar_null_add(masked):
-    result = masked + NA
+def make_kernel(n_params):
+    '''
+    Build the CUDA source (as a string) for a templated `genop_kernel`
+    that passes `n_params` (data, mask, offset) column triples to GENERIC_OP.
+    '''
+    
+    indent = ' '*18
+    
+    # Template header: TypeOut plus one Type<i> per input column
+    result = ''
+        
+    templates = 'template <typename TypeOut, '
+    for i in range(n_params):
+        templates += f"typename Type{i}, "
+    
+    templates = templates[:-2] + ">"  # drop the trailing ", " (valid for any n_params, including 0)
+    result += templates
+    
+    # Hack together the function signature
+    sig = '\n__global__\nvoid genop_kernel(cudf::size_type size,\n'
+    sig += indent + "TypeOut* out_data,\n"
+    sig += indent + 'bool* out_mask,\n'
+    for i in range(n_params):
+        sig += indent + f"Type{i}* data_{i},\n"
+        sig += indent + f"cudf::bitmask_type const* mask_{i},\n"
+        sig += indent + f"cudf::size_type offset_{i},\n"
+    sig = sig[:-2] + ') {'
+    
+    result += sig
+    result += '\n'
+    
+    # standard thread block
+    result += '\n'
+    result += '\tint tid = threadIdx.x;\n'
+    result += '\tint blkid = blockIdx.x;\n'
+    result += '\tint blksz = blockDim.x;\n'
+    result += '\tint gridsz = gridDim.x;\n'
+    result += '\tint start = tid + blkid * blksz;\n'
+    result += '\tint step = blksz * gridsz;\n'
+    result += '\n'
+    
+    result += '\tMasked output;\n'
+    
+    for i in range(n_params):
+        result += f"\tchar valid_{i};\n"
+
+    # main loop
+    result += "\tfor (cudf::size_type i=start; i<size; i+=step) {\n"
+    # a null mask pointer means "no null mask": every row of that column is valid
+    for i in range(n_params):
+        result += f"\t\tvalid_{i} = mask_{i} ? cudf::bit_is_set(mask_{i}, offset_{i} + i) : true;\n"
+        
+    # genop signature
+    genop_sig = "\t\tGENERIC_OP(&output.value, "
+    for i in range(n_params):
+        genop_sig += f"data_{i}[i], valid_{i}, "
+    
+    genop_sig = genop_sig[:-2] + ');\n'
+    
+    result += genop_sig
+    
+    # set the output
+    result += "\t\tout_data[i] = output.value;\n"
+    result += "\t\tout_mask[i] = output.valid;\n"
+    
+    result += "\t}\n"
+    result += "}"
+    
     return result
 
-@cuda.jit
-def test_test_scalar_null_add(data, masks):
-    m = Masked(1, True)
-    result = test_scalar_null_add(m)
-    print(result.value)
-    print(types.int8(result.valid))
-    data[0] = result.value
-    masks[0] = result.valid
+
+demo_kernel = ''' 
+template <typename TypeOut>
+__global__
+void genop_kernel(cudf::size_type size, cudf::size_type value, TypeOut* out_data) {
+
+	int tid = threadIdx.x;
+	int blkid = blockIdx.x;
+	int blksz = blockDim.x;
+	int gridsz = gridDim.x;
+	int start = tid + blkid * blksz;
+	int step = blksz * gridsz;
+
+	for (cudf::size_type i=start; i<size; i+=step) {
+        out_data[i] = value;
+	}
+}'''

From 8e11c7e1dd3d54261becd53d9585a59ed60fb0ef Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 27 Apr 2021 14:07:24 -0700
Subject: [PATCH 022/132] variadic kernel up and running

---
 cpp/src/transform/jit/binop_kernel.cu | 58 ++++++++++++++++++++++++++-
 cpp/src/transform/transform.cpp       | 45 ++++++++++++---------
 python/cudf/cudf/_lib/transform.pyx   |  2 +-
 3 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/cpp/src/transform/jit/binop_kernel.cu b/cpp/src/transform/jit/binop_kernel.cu
index 75c1e383d96..c81cd21dac9 100644
--- a/cpp/src/transform/jit/binop_kernel.cu
+++ b/cpp/src/transform/jit/binop_kernel.cu
@@ -31,6 +31,8 @@
 #include <cudf/wrappers/timestamps.hpp>
 #include <cudf/utilities/bit.hpp>
 
+#include <tuple>
+#include <cuda/std/tuple>
 
 namespace cudf {
 namespace transformation {
@@ -40,7 +42,7 @@ struct Masked {
   int64_t value;
   bool valid;
 };
-
+/*
 template <typename TypeOut, typename TypeLhs, typename TypeRhs>
 __global__
 void binop_kernel(cudf::size_type size,
@@ -76,6 +78,60 @@ void binop_kernel(cudf::size_type size,
       out_mask[i] = output.valid;
     }
 }
+*/
+template <typename TypeIn, typename MaskType, typename OffsetType>  // base case: one (data, mask, offset) triple
+__device__ auto make_args(cudf::size_type id,
+                          TypeIn in_ptr,
+                          MaskType in_mask,
+                          OffsetType in_offset) 
+{
+    bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;  // null mask pointer => row treated as valid
+    return cuda::std::make_tuple(in_ptr[id], valid);  // (element at row `id`, its validity flag)
+}
+// Recursive case: emits (element, validity) for the first triple, then appends the result for the remaining args.
+template <typename InType, typename MaskType, typename OffsetType, typename ... Arguments>
+__device__ auto make_args(cudf::size_type id, 
+                          InType in_ptr, 
+                          MaskType in_mask,     // in practice, always cudf::bitmask_type const* 
+                          OffsetType in_offset,  // in practice, always cudf::size_type
+                          Arguments ... args) {
+    // A null mask pointer is treated as "row is valid"; otherwise test bit (in_offset + id).
+    bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
+    return cuda::std::tuple_cat(
+        cuda::std::make_tuple(in_ptr[id], valid),
+        make_args(id, args...)
+    );
+}
+
+// Calls GENERIC_OP once per row, storing the result value in out_data[i] and its validity flag in out_mask[i].
+template <typename TypeOut, typename ... Arguments>
+__global__
+void generic_udf_kernel(cudf::size_type size, 
+                        TypeOut* out_data, 
+                        bool* out_mask, 
+                        Arguments ... args)
+{   
+    // Grid-stride indexing: `start` is this thread's first row, `step` is the total number of launched threads.
+    int tid = threadIdx.x;
+    int blkid = blockIdx.x;
+    int blksz = blockDim.x;
+    int gridsz = gridDim.x;
+    int start = tid + blkid * blksz;
+    int step = blksz * gridsz;
+    // NOTE(review): GENERIC_OP receives only &output.value, yet output.valid is read after the call; presumably it fills the whole Masked - confirm.
+    Masked output;
+    for (cudf::size_type i=start; i<size; i+=step) {
+      auto func_args = cuda::std::tuple_cat(
+          cuda::std::make_tuple(&output.value),
+          make_args(i, args...) // args are (data ptr, mask ptr, offset) triples; make_args turns each into (element, validity)
+      );
+      cuda::std::apply(GENERIC_OP, func_args);
+      out_data[i] = output.value;
+      out_mask[i] = output.valid;
+    }
+
+}
+
 
 }  // namespace jit
 }  // namespace transformation
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index f1ea7956c0d..61c9bdd7fbb 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -75,31 +75,36 @@ void binary_operation(column_view const& A,
                       column_view const& outmsk_view,
                       rmm::mr::device_memory_resource* mr)
 {
-  std::string kernel_name =
-  jitify2::reflection::Template("cudf::transformation::jit::binop_kernel")  //
-    .instantiate(cudf::jit::get_type_name(outcol_view.type()),  // list of template arguments
-                 cudf::jit::get_type_name(A.type()),
-                 cudf::jit::get_type_name(B.type()));
-
-  std::string cuda_source = cudf::jit::parse_single_function_ptx(
-                     binary_udf, "GENERIC_BINARY_OP", cudf::jit::get_type_name(output_type), {0});
 
-  rmm::cuda_stream_view stream;
 
+ std::string generic_kernel_name = 
+ jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
+    .instantiate(cudf::jit::get_type_name(outcol_view.type()),
+                 "int64_t*",
+                 "uint32_t*",
+                 "int64_t",
+                 "int64_t*",
+                 "uint32_t*",
+                 "int64_t");
+
+  std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
+                     binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
+                     
+  rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_binop_kernel_cu_jit)
     .get_kernel(
-      kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                   //
+      generic_kernel_name, {}, {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, {"-arch=sm_."})  //
+    ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())                                   //
     ->launch(outcol_view.size(),
-            cudf::jit::get_data_ptr(outcol_view),
-            cudf::jit::get_data_ptr(A),
-            cudf::jit::get_data_ptr(B),
-            cudf::jit::get_data_ptr(outmsk_view),
-            A.null_mask(),
-            A.offset(),
-            B.null_mask(),
-            B.offset()
-    );
+             cudf::jit::get_data_ptr(outcol_view),
+             cudf::jit::get_data_ptr(outmsk_view), 
+             cudf::jit::get_data_ptr(A),
+             A.null_mask(), // cudf::bitmask_type * 
+             A.offset(),
+             cudf::jit::get_data_ptr(B),
+             B.null_mask(),
+             B.offset());
+
 }
 
 void generalized_operation(table_view const& data_view,
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 1b90a5d3a3f..68d8c9f6228 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -172,7 +172,7 @@ def masked_binary_op(Column A, Column B, op, Column output_column, Column output
     
  
     from cudf.core.udf import compile_udf
-    c_str = compile_udf(op).encode('UTF-8')
+    c_str = compile_udf(op)[1].encode('UTF-8')
 
     c_tid = <type_id> (
         <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]

From 591627c4333315352f432c898bd71a9f5b87f0f0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 28 Apr 2021 06:52:59 -0700
Subject: [PATCH 023/132] big plays

---
 cpp/include/cudf/transform.hpp          |  3 +--
 cpp/src/transform/transform.cpp         | 32 ++++++++++++-------------
 python/cudf/cudf/_lib/cpp/transform.pxd |  3 +--
 python/cudf/cudf/_lib/transform.pyx     | 19 ++++-----------
 python/cudf/cudf/core/dataframe.py      |  6 +++++
 python/cudf/cudf/core/frame.py          | 19 ++++++++++++++-
 6 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 8ed9b72c97d..7e5c1797d11 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -55,8 +55,7 @@ std::unique_ptr<column> transform(
 
 
 std::unique_ptr<column> masked_binary_op(
-  column_view const& A,
-  column_view const& B,
+  table_view data_view,
   std::string const& binary_udf,
   data_type output_type,
   column_view const& outcol_view,
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 61c9bdd7fbb..b5c9b76ea6c 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -67,8 +67,7 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
-void binary_operation(column_view const& A, 
-                      column_view const& B, 
+void binary_operation(table_view data_view,
                       std::string const& binary_udf, 
                       data_type output_type, 
                       column_view const& outcol_view,
@@ -76,9 +75,11 @@ void binary_operation(column_view const& A,
                       rmm::mr::device_memory_resource* mr)
 {
 
+  column_view A = data_view.column(0);
+  column_view B = data_view.column(1);
 
- std::string generic_kernel_name = 
- jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
+  std::string generic_kernel_name = 
+  jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
     .instantiate(cudf::jit::get_type_name(outcol_view.type()),
                  "int64_t*",
                  "uint32_t*",
@@ -170,19 +171,17 @@ std::unique_ptr<column> transform(column_view const& input,
   return output;
 }
 
-std::unique_ptr<column> masked_binary_op_inner(column_view const& A, 
-                                         column_view const& B, 
-                                         std::string const& binary_udf, 
-                                         data_type output_type, 
-                                         column_view const& outcol_view,
-                                         column_view const& outmsk_view,
-                                         rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> masked_binary_op_inner(table_view data_view, 
+                                               std::string const& binary_udf, 
+                                               data_type output_type, 
+                                               column_view const& outcol_view,
+                                               column_view const& outmsk_view,
+                                               rmm::mr::device_memory_resource* mr)
 {
   rmm::cuda_stream_view stream = rmm::cuda_stream_default;
-  transformation::jit::binary_operation(A, B, binary_udf, output_type, outcol_view, outmsk_view, mr);
+  transformation::jit::binary_operation(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
 
-  std::unique_ptr<column> output = make_fixed_width_column(
-    output_type, A.size(), copy_bitmask(A), cudf::UNKNOWN_NULL_COUNT, stream, mr);
+  std::unique_ptr<column> output;
 
 
   return output;
@@ -220,15 +219,14 @@ std::unique_ptr<column> transform(column_view const& input,
   return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<column> masked_binary_op(column_view const& A, 
-                                         column_view const& B, 
+std::unique_ptr<column> masked_binary_op(table_view data_view,
                                          std::string const& binary_udf, 
                                          data_type output_type, 
                                          column_view const& outcol_view,
                                          column_view const& outmsk_view,
                                          rmm::mr::device_memory_resource* mr)
 {
-  return detail::masked_binary_op_inner(A, B, binary_udf, output_type, outcol_view, outmsk_view, mr);
+  return detail::masked_binary_op_inner(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
 }
 
 std::unique_ptr<column> generalized_masked_op(
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index 00d0306dbca..d7efc5a2d8f 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -39,8 +39,7 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
     ) except +
 
     cdef unique_ptr[column] masked_binary_op(
-        column_view A,
-        column_view B, 
+        table_view data_view,
         string binary_udf,
         data_type output_type,
         column_view outcol_view,
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 68d8c9f6228..027ff65a13c 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -159,21 +159,12 @@ def generalized_masked_op(Table data, op, Column output_column, Column output_ma
 
     return output_column
 
-def masked_binary_op(Column A, Column B, op, Column output_column, Column output_mask):
-    cdef column_view A_view = A.view()
-    cdef column_view B_view = B.view()
-
-    cdef string c_str
+def masked_udf(Table incols, op, Column output_column, Column output_mask):
+    cdef table_view data_view = incols.data_view()
+    cdef string c_str = op.encode("UTF-8")
     cdef type_id c_tid
     cdef data_type c_dtype
 
-    if A.dtype != np.dtype('int64') or B.dtype != np.dtype('int64'):
-        raise TypeError('int64 please')
-    
- 
-    from cudf.core.udf import compile_udf
-    c_str = compile_udf(op)[1].encode('UTF-8')
-
     c_tid = <type_id> (
         <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]
     )
@@ -184,15 +175,13 @@ def masked_binary_op(Column A, Column B, op, Column output_column, Column output
 
     with nogil:
         c_output = move(libcudf_transform.masked_binary_op(
-            A_view,
-            B_view,
+            data_view,
             c_str,
             c_dtype,
             outcol_view,
             outmsk_view
         ))
 
-    #return Column.from_unique_ptr(move(c_output))
 
     output_mask_real = bools_to_mask(output_mask)
     output_column = output_column.set_mask(output_mask_real)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d5393a724ec..1559a276029 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -35,6 +35,7 @@
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
 from cudf.core.series import Series
+from cudf.core.udf import compile_udf
 from cudf.core.window import Rolling
 from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
 from cudf.utils.docutils import copy_docstring
@@ -4714,6 +4715,11 @@ def query(self, expr, local_dict=None):
             boolmask = queryutils.query_execute(self, expr, callenv)
             return self._apply_boolean_mask(boolmask)
 
+    def apply(self, func):
+        breakpoint()
+        return super()._apply(func)
+
+
     @applyutils.doc_apply()
     def apply_rows(
         self,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index dcf5044ed2f..87380175f3d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1418,6 +1418,23 @@ def _quantiles(
         result._copy_type_metadata(self)
         return result
 
+    def _apply(self, func):
+        from cudf.core.udf import compile_udf
+
+        if not all(np.dtype('int64') == dtype for dtype in self.dtypes):
+            raise TypeError("Currently only int64 is supported")
+
+        kernel, ptx = compile_udf(func)
+
+        output_column = cudf.core.column.column_empty(row_count=len(self), dtype='int64')
+        output_mask = cudf.core.column.column_empty(row_count=len(self), dtype='bool')
+
+        breakpoint()
+        result = cudf._lib.transform.masked_udf(self, ptx, output_column, output_mask)
+        breakpoint()
+        return result
+
+
     def rank(
         self,
         axis=0,
@@ -1456,7 +1473,7 @@ def rank(
         pct : bool, default False
             Whether or not to display the returned rankings in percentile
             form.
-
+f
         Returns
         -------
         same type as caller

From c07e18707ef42af5b4e242a37bc6905ef0f3205c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 28 Apr 2021 07:59:27 -0700
Subject: [PATCH 024/132] general logic for building template instantiation
 arguments

---
 cpp/src/transform/transform.cpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index b5c9b76ea6c..3bef0b29063 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -75,18 +75,26 @@ void binary_operation(table_view data_view,
                       rmm::mr::device_memory_resource* mr)
 {
 
+  std::vector<std::string> template_types(
+    // A ptr, mask ptr, and offset for each column
+    // plus one for the type of the output column
+    (data_view.num_columns() * 3) + 1
+  );
+  template_types[0] = cudf::jit::get_type_name(outcol_view.type());
+  for (int i = 0; i < data_view.num_columns(); i++) {
+    int offset = (i * 3) + 1;
+    template_types[offset] = cudf::jit::get_type_name(data_view.column(i).type()) + "*";
+    template_types[offset + 1] = "uint32_t*"; 
+    template_types[offset + 2] = "int64_t";
+  }
+
+
   column_view A = data_view.column(0);
   column_view B = data_view.column(1);
 
   std::string generic_kernel_name = 
   jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
-    .instantiate(cudf::jit::get_type_name(outcol_view.type()),
-                 "int64_t*",
-                 "uint32_t*",
-                 "int64_t",
-                 "int64_t*",
-                 "uint32_t*",
-                 "int64_t");
+    .instantiate(template_types);
 
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});

From d21b858952bda5ad24bcaca99545361f63953f94 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 29 Apr 2021 08:03:28 -0700
Subject: [PATCH 025/132] cleanup

---
 python/cudf/cudf/core/frame.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 87380175f3d..e0d2746f22c 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1429,9 +1429,7 @@ def _apply(self, func):
         output_column = cudf.core.column.column_empty(row_count=len(self), dtype='int64')
         output_mask = cudf.core.column.column_empty(row_count=len(self), dtype='bool')
 
-        breakpoint()
         result = cudf._lib.transform.masked_udf(self, ptx, output_column, output_mask)
-        breakpoint()
         return result
 
 

From 6806968d0f29789451700c6c1cd8307fd58d1c79 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 29 Apr 2021 08:04:24 -0700
Subject: [PATCH 026/132] attempting to use vector overload in jitify

---
 cpp/src/transform/transform.cpp | 67 ++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 10 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 3bef0b29063..c8cde8dd8d6 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -67,6 +67,38 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
+std::vector<void*> make_launch_args(table_view data, column_view outcol_view, column_view outmsk_view) {
+  int n_cols = data.num_columns();
+  std::vector<void*> results((n_cols * 3) + 3);
+
+  int64_t size = outcol_view.size();
+  void* outcol_ptr = (void*)cudf::jit::get_data_ptr(outcol_view);
+  void* outmsk_ptr = (void*)cudf::jit::get_data_ptr(outmsk_view);
+
+  results[0] = (void*)&size;
+  results[1] = (void*)&outcol_ptr;
+  results[2] = (void*)&outmsk_ptr;
+
+  for (int i = 0; i < n_cols; i++) {
+    int offset = 3 + (i * n_cols);
+    column_view col = data.column(i);
+
+    void* data_ptr = (void*)cudf::jit::get_data_ptr(col);
+    results[offset] = (void*)&data_ptr;
+
+    cudf::bitmask_type const* mask_ptr = col.null_mask();
+    results[offset + 1] = (void*)&mask_ptr;
+    
+    int64_t col_offset = col.offset();
+    results[offset + 2] = (void*)&col_offset;
+  }
+  return results;
+}
+
+auto make_launch_args_variadic(table_view data) {
+  
+}
+
 void binary_operation(table_view data_view,
                       std::string const& binary_udf, 
                       data_type output_type, 
@@ -88,6 +120,7 @@ void binary_operation(table_view data_view,
     template_types[offset + 2] = "int64_t";
   }
 
+  auto launch_args = make_launch_args(data_view, outcol_view, outmsk_view);
 
   column_view A = data_view.column(0);
   column_view B = data_view.column(1);
@@ -98,21 +131,35 @@ void binary_operation(table_view data_view,
 
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
-                     
+
+  std::vector<void*> func_args(9);
+
+  cudf::size_type arg_size = outcol_view.size();
+  const void* arg_outcol_view = cudf::jit::get_data_ptr(outcol_view);
+  const void* arg_outmsk_view = cudf::jit::get_data_ptr(outcol_view);
+  const void* arg_A = cudf::jit::get_data_ptr(A);
+  cudf::bitmask_type const* arg_A_mask = A.null_mask();
+  int64_t arg_A_offset = A.offset();
+  const void* arg_B = cudf::jit::get_data_ptr(B);
+  cudf::bitmask_type const* arg_B_mask = B.null_mask();
+  int64_t arg_B_offset = B.offset();
+
+  func_args[0] = &arg_size;
+  func_args[1] = &arg_outcol_view;
+  func_args[2] = &arg_outmsk_view;
+  func_args[3] = &arg_A;
+  func_args[4] = &arg_A_mask;
+  func_args[5] = &arg_A_offset;
+  func_args[6] = &arg_B;
+  func_args[7] = &arg_B_mask;
+  func_args[8] = &arg_B_offset;
+     
   rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_binop_kernel_cu_jit)
     .get_kernel(
       generic_kernel_name, {}, {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, {"-arch=sm_."})  //
     ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())                                   //
-    ->launch(outcol_view.size(),
-             cudf::jit::get_data_ptr(outcol_view),
-             cudf::jit::get_data_ptr(outmsk_view), 
-             cudf::jit::get_data_ptr(A),
-             A.null_mask(), // cudf::bitmask_type * 
-             A.offset(),
-             cudf::jit::get_data_ptr(B),
-             B.null_mask(),
-             B.offset());
+    ->launch(func_args);
 
 }
 

From cef8b71c3c1dc865a5159f17ba79b6261c74a066 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 May 2021 06:21:33 -0700
Subject: [PATCH 027/132] loop kernel runs finally

---
 cpp/src/transform/jit/binop_kernel.cu |  1 -
 cpp/src/transform/transform.cpp       | 99 ++++++++++++---------------
 2 files changed, 42 insertions(+), 58 deletions(-)

diff --git a/cpp/src/transform/jit/binop_kernel.cu b/cpp/src/transform/jit/binop_kernel.cu
index c81cd21dac9..1b3ea42fc22 100644
--- a/cpp/src/transform/jit/binop_kernel.cu
+++ b/cpp/src/transform/jit/binop_kernel.cu
@@ -103,7 +103,6 @@ __device__ auto make_args(cudf::size_type id,
     );
 }
 
-
 template <typename TypeOut, typename ... Arguments>
 __global__
 void generic_udf_kernel(cudf::size_type size, 
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index c8cde8dd8d6..dffae64c59e 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -67,38 +67,6 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
-std::vector<void*> make_launch_args(table_view data, column_view outcol_view, column_view outmsk_view) {
-  int n_cols = data.num_columns();
-  std::vector<void*> results((n_cols * 3) + 3);
-
-  int64_t size = outcol_view.size();
-  void* outcol_ptr = (void*)cudf::jit::get_data_ptr(outcol_view);
-  void* outmsk_ptr = (void*)cudf::jit::get_data_ptr(outmsk_view);
-
-  results[0] = (void*)&size;
-  results[1] = (void*)&outcol_ptr;
-  results[2] = (void*)&outmsk_ptr;
-
-  for (int i = 0; i < n_cols; i++) {
-    int offset = 3 + (i * n_cols);
-    column_view col = data.column(i);
-
-    void* data_ptr = (void*)cudf::jit::get_data_ptr(col);
-    results[offset] = (void*)&data_ptr;
-
-    cudf::bitmask_type const* mask_ptr = col.null_mask();
-    results[offset + 1] = (void*)&mask_ptr;
-    
-    int64_t col_offset = col.offset();
-    results[offset + 2] = (void*)&col_offset;
-  }
-  return results;
-}
-
-auto make_launch_args_variadic(table_view data) {
-  
-}
-
 void binary_operation(table_view data_view,
                       std::string const& binary_udf, 
                       data_type output_type, 
@@ -120,46 +88,63 @@ void binary_operation(table_view data_view,
     template_types[offset + 2] = "int64_t";
   }
 
-  auto launch_args = make_launch_args(data_view, outcol_view, outmsk_view);
 
   column_view A = data_view.column(0);
   column_view B = data_view.column(1);
 
+
+
   std::string generic_kernel_name = 
   jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
-    .instantiate(template_types);
+    .instantiate(cudf::jit::get_type_name(outcol_view.type()),
+                 "int64_t*",
+                 "uint32_t*",
+                 "int64_t",
+                 "int64_t*",
+                 "uint32_t*",
+                 "int64_t");
 
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
 
-  std::vector<void*> func_args(9);
-
-  cudf::size_type arg_size = outcol_view.size();
-  const void* arg_outcol_view = cudf::jit::get_data_ptr(outcol_view);
-  const void* arg_outmsk_view = cudf::jit::get_data_ptr(outcol_view);
-  const void* arg_A = cudf::jit::get_data_ptr(A);
-  cudf::bitmask_type const* arg_A_mask = A.null_mask();
-  int64_t arg_A_offset = A.offset();
-  const void* arg_B = cudf::jit::get_data_ptr(B);
-  cudf::bitmask_type const* arg_B_mask = B.null_mask();
-  int64_t arg_B_offset = B.offset();
-
-  func_args[0] = &arg_size;
-  func_args[1] = &arg_outcol_view;
-  func_args[2] = &arg_outmsk_view;
-  func_args[3] = &arg_A;
-  func_args[4] = &arg_A_mask;
-  func_args[5] = &arg_A_offset;
-  func_args[6] = &arg_B;
-  func_args[7] = &arg_B_mask;
-  func_args[8] = &arg_B_offset;
-     
+  int n_cols = data_view.num_columns();
+  std::vector<void*> results((n_cols * 3) + 3);
+
+  cudf::size_type size = outcol_view.size();
+  const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view);
+  const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view);
+
+  results[0] = &size;
+  results[1] = &outcol_ptr;
+  results[2] = &outmsk_ptr;
+  column_view col;
+
+  std::vector<const void*> data_ptrs(n_cols);
+  std::vector<cudf::bitmask_type const*> mask_ptrs(n_cols);
+  std::vector<int64_t> offsets(n_cols);
+
+  for (int i = 0; i < n_cols; i++) {
+    col = data_view.column(i);
+    data_ptrs[i] = cudf::jit::get_data_ptr(col);
+    mask_ptrs[i] = col.null_mask();
+    offsets[i] = col.offset();
+  }
+
+  int idx = 3;
+  for (int i = 0; i < n_cols; i++) {
+    results[idx] = &data_ptrs[i];
+    results[idx + 1] = &mask_ptrs[i];
+    results[idx + 2] = &offsets[i];
+    idx += 3;
+  }
+  
+
   rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_binop_kernel_cu_jit)
     .get_kernel(
       generic_kernel_name, {}, {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, {"-arch=sm_."})  //
     ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())                                   //
-    ->launch(func_args);
+    ->launch(results.data());
 
 }
 

From 19b88c54ef297e661429a8421d589d185c27d67d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 May 2021 06:24:28 -0700
Subject: [PATCH 028/132] full pipeline works for a dynamic number of columns

---
 cpp/src/transform/transform.cpp | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index dffae64c59e..cdd422a407d 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -88,21 +88,9 @@ void binary_operation(table_view data_view,
     template_types[offset + 2] = "int64_t";
   }
 
-
-  column_view A = data_view.column(0);
-  column_view B = data_view.column(1);
-
-
-
   std::string generic_kernel_name = 
   jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
-    .instantiate(cudf::jit::get_type_name(outcol_view.type()),
-                 "int64_t*",
-                 "uint32_t*",
-                 "int64_t",
-                 "int64_t*",
-                 "uint32_t*",
-                 "int64_t");
+    .instantiate(template_types);
 
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});

From 4845f27c2b6dc44561153ca04dc206297db52982 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 May 2021 14:24:36 -0700
Subject: [PATCH 029/132] code cleanup

---
 cpp/src/transform/transform.cpp         | 67 -------------------------
 python/cudf/cudf/_lib/cpp/transform.pxd |  8 ---
 python/cudf/cudf/_lib/transform.pyx     | 32 ------------
 3 files changed, 107 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index cdd422a407d..50abd347c42 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -136,43 +136,6 @@ void binary_operation(table_view data_view,
 
 }
 
-void generalized_operation(table_view const& data_view,
-                           std::string const& udf,
-                           data_type output_type,
-                           column_view const& outcol_view,
-                           column_view const& outmsk_view,
-                           rmm::mr::device_memory_resource* mr)
-{
-  rmm::cuda_stream_view stream;
-  //std::string cuda_source = cudf::jit::parse_single_function_ptx(
-  //                   udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
-  /*
-  size_t num_cols = data_view.num_columns();
-  std::vector<std::string> input_types(num_cols);
-  std::vector<void*> args(num_cols);
-
-
-  column_view this_view;
-  for (size_t i = 0; i < num_cols; i++) {
-    this_view = data_view.column(i);
-    input_types[i] = cudf::jit::get_type_name(this_view.type());
-  }
-  */
-
-  std::string kernel_name =
-    jitify2::reflection::Template("genop_kernel")  //
-      .instantiate(cudf::jit::get_type_name(outcol_view.type()));
-
-  cudf::jit::get_program_cache(*transform_jit_baked_udf_requirements_cu_jit)
-    .get_kernel(
-      kernel_name, {}, {{"transform/jit/operation-udf.hpp", udf}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                   //
-    ->launch(outcol_view.size(),
-             static_cast<cudf::size_type>(7),                                                                 //
-             cudf::jit::get_data_ptr(outcol_view));
-
-}
-
 }  // namespace jit
 }  // namespace transformation
 
@@ -215,24 +178,6 @@ std::unique_ptr<column> masked_binary_op_inner(table_view data_view,
   return output;
 }
 
-std::unique_ptr<column> generalized_masked_op_inner(
-  table_view const& data_view,
-  std::string const& udf,
-  data_type output_type,
-  column_view const& outcol_view,
-  column_view const& outmsk_view,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-{
-  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
-
-  transformation::jit::generalized_operation(data_view, udf, output_type, outcol_view, outmsk_view, mr);
-
-  std::unique_ptr<column> output;
-
-  return output;
-
-}
-
 
 
 }  // namespace detail
@@ -257,16 +202,4 @@ std::unique_ptr<column> masked_binary_op(table_view data_view,
   return detail::masked_binary_op_inner(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
 }
 
-std::unique_ptr<column> generalized_masked_op(
-  table_view const& data_view,
-  std::string const& udf,
-  data_type output_type,
-  column_view const& outcol_view,
-  column_view const& outmsk_view,
-  rmm::mr::device_memory_resource* mr)
-{
-  return detail::generalized_masked_op_inner(data_view, udf, output_type, outcol_view, outmsk_view, mr);
-}
-
-
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index d7efc5a2d8f..d222c461bba 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -46,14 +46,6 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         column_view outmask_view,
     ) except +
 
-    cdef unique_ptr[column] generalized_masked_op(
-        table_view data_view,
-        string udf,
-        data_type output_type,
-        column_view outcol_view,
-        column_view outmask_view
-    ) except +
-
     cdef pair[unique_ptr[table], unique_ptr[column]] encode(
         table_view input
     ) except +
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 027ff65a13c..2c61be00131 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -127,38 +127,6 @@ def transform(Column input, op):
 
     return Column.from_unique_ptr(move(c_output))
 
-def generalized_masked_op(Table data, op, Column output_column, Column output_mask):
-    cdef string c_str
-    cdef type_id c_tid
-    cdef data_type c_dtype
-
-    cdef table_view data_view = data.view()
-
-    from cudf.core.udf import compile_udf, demo_kernel
-    #c_str = compile_udf(op).encode('UTF-8')
-    c_str = demo_kernel.encode('UTF-8')
-    c_tid = <type_id> (
-        <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]
-    )
-    c_dtype = data_type(c_tid)
-
-    cdef column_view outcol_view = output_column.view()
-    cdef column_view outmsk_view = output_mask.view()
-
-    with nogil:
-        c_output = move(libcudf_transform.generalized_masked_op(
-            data_view,
-            c_str,
-            c_dtype,
-            outcol_view,
-            outmsk_view
-        ))
-
-    output_mask_real = bools_to_mask(output_mask)
-    output_column = output_column.set_mask(output_mask_real)
-
-    return output_column
-
 def masked_udf(Table incols, op, Column output_column, Column output_mask):
     cdef table_view data_view = incols.data_view()
     cdef string c_str = op.encode("UTF-8")

From c796dc4360753925a19f97c76c3b571abf481343 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 May 2021 14:37:20 -0700
Subject: [PATCH 030/132] more code cleanup/renaming

---
 cpp/cmake/Modules/JitifyPreprocessKernels.cmake            | 3 +--
 cpp/include/cudf/transform.hpp                             | 7 -------
 cpp/src/transform/jit/baked_udf_requirements.cu            | 2 --
 .../jit/{binop_kernel.cu => masked_udf_kernel.cu}          | 0
 cpp/src/transform/transform.cpp                            | 4 ++--
 5 files changed, 3 insertions(+), 13 deletions(-)
 delete mode 100644 cpp/src/transform/jit/baked_udf_requirements.cu
 rename cpp/src/transform/jit/{binop_kernel.cu => masked_udf_kernel.cu} (100%)

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index c481301e335..7e2ec5254d3 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -56,10 +56,9 @@ endfunction()
 
 jit_preprocess_files(SOURCE_DIRECTORY      ${CUDF_SOURCE_DIR}/src
                      FILES                 binaryop/jit/kernel.cu
-                                           transform/jit/binop_kernel.cu
+                                           transform/jit/masked_udf_kernel.cu
                                            transform/jit/kernel.cu
                                            rolling/jit/kernel.cu
-                                           transform/jit/baked_udf_requirements.cu
                      )
 
 add_custom_target(jitify_preprocess_run DEPENDS ${JIT_PREPROCESSED_FILES})
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 7e5c1797d11..eb0f9251ce9 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -62,13 +62,6 @@ std::unique_ptr<column> masked_binary_op(
   column_view const& outmsk_view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<column> generalized_masked_op(
-  table_view const& data_view,
-  std::string const& udf,
-  data_type output_type,
-  column_view const& outcol_view,
-  column_view const& outmsk_view,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
diff --git a/cpp/src/transform/jit/baked_udf_requirements.cu b/cpp/src/transform/jit/baked_udf_requirements.cu
deleted file mode 100644
index d31f1d9f899..00000000000
--- a/cpp/src/transform/jit/baked_udf_requirements.cu
+++ /dev/null
@@ -1,2 +0,0 @@
-#include <cudf/types.hpp>
-#include <transform/jit/operation-udf.hpp>
diff --git a/cpp/src/transform/jit/binop_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
similarity index 100%
rename from cpp/src/transform/jit/binop_kernel.cu
rename to cpp/src/transform/jit/masked_udf_kernel.cu
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 50abd347c42..493233df0d6 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -15,7 +15,7 @@
  */
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
-#include <jit_preprocessed_files/transform/jit/binop_kernel.cu.jit.hpp>
+#include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>
 #include <jit_preprocessed_files/transform/jit/baked_udf_requirements.cu.jit.hpp>
 
 
@@ -128,7 +128,7 @@ void binary_operation(table_view data_view,
   
 
   rmm::cuda_stream_view generic_stream;
-  cudf::jit::get_program_cache(*transform_jit_binop_kernel_cu_jit)
+  cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
     .get_kernel(
       generic_kernel_name, {}, {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, {"-arch=sm_."})  //
     ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())                                   //

From 4f0ab9bdb00c3f4a3f903d45eb5aafff2a1a8205 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 3 May 2021 14:48:14 -0700
Subject: [PATCH 031/132] even more renaming and cleanup

---
 cpp/include/cudf/transform.hpp             |  2 +-
 cpp/src/transform/jit/masked_udf_kernel.cu | 36 ----------------------
 cpp/src/transform/transform.cpp            | 30 +++++++++---------
 python/cudf/cudf/_lib/cpp/transform.pxd    |  2 +-
 python/cudf/cudf/_lib/transform.pyx        |  2 +-
 5 files changed, 18 insertions(+), 54 deletions(-)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index eb0f9251ce9..7af849357ec 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -54,7 +54,7 @@ std::unique_ptr<column> transform(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 
-std::unique_ptr<column> masked_binary_op(
+std::unique_ptr<column> generalized_masked_op(
   table_view data_view,
   std::string const& binary_udf,
   data_type output_type,
diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index 1b3ea42fc22..b3bcb0d53fc 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -42,43 +42,7 @@ struct Masked {
   int64_t value;
   bool valid;
 };
-/*
-template <typename TypeOut, typename TypeLhs, typename TypeRhs>
-__global__
-void binop_kernel(cudf::size_type size,
-                  TypeOut* out_data, 
-                  TypeLhs* lhs_data,
-                  TypeRhs* rhs_data,
-                  bool* out_mask,
-                  cudf::bitmask_type const* lhs_mask,
-                  cudf::size_type lhs_offset,
-                  cudf::bitmask_type const* rhs_mask,
-                  cudf::size_type rhs_offset
-) {
-    int tid = threadIdx.x;
-    int blkid = blockIdx.x;
-    int blksz = blockDim.x;
-    int gridsz = gridDim.x;
-    int start = tid + blkid * blksz;
-    int step = blksz * gridsz;
 
-    Masked output;
-    char l_valid;
-    char r_valid;
-    long int l_data;
-    long int r_data;
-
-    for (cudf::size_type i=start; i<size; i+=step) {
-      l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true;
-      r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true;
-      l_data = lhs_data[i];
-      r_data = rhs_data[i];
-      GENERIC_BINARY_OP(&output.value, lhs_data[i], l_valid, rhs_data[i], r_valid);
-      out_data[i] = output.value;
-      out_mask[i] = output.valid;
-    }
-}
-*/
 template <typename TypeIn, typename MaskType, typename OffsetType>
 __device__ auto make_args(cudf::size_type id,
                           TypeIn in_ptr,
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 493233df0d6..1d8f76d9e2c 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -67,12 +67,12 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
-void binary_operation(table_view data_view,
-                      std::string const& binary_udf, 
-                      data_type output_type, 
-                      column_view const& outcol_view,
-                      column_view const& outmsk_view,
-                      rmm::mr::device_memory_resource* mr)
+void generalized_operation(table_view data_view,
+                           std::string const& binary_udf, 
+                           data_type output_type, 
+                           column_view const& outcol_view,
+                           column_view const& outmsk_view,
+                           rmm::mr::device_memory_resource* mr)
 {
 
   std::vector<std::string> template_types(
@@ -162,7 +162,7 @@ std::unique_ptr<column> transform(column_view const& input,
   return output;
 }
 
-std::unique_ptr<column> masked_binary_op_inner(table_view data_view, 
+std::unique_ptr<column> generalized_masked_op(table_view data_view, 
                                                std::string const& binary_udf, 
                                                data_type output_type, 
                                                column_view const& outcol_view,
@@ -170,7 +170,7 @@ std::unique_ptr<column> masked_binary_op_inner(table_view data_view,
                                                rmm::mr::device_memory_resource* mr)
 {
   rmm::cuda_stream_view stream = rmm::cuda_stream_default;
-  transformation::jit::binary_operation(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
+  transformation::jit::generalized_operation(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
 
   std::unique_ptr<column> output;
 
@@ -192,14 +192,14 @@ std::unique_ptr<column> transform(column_view const& input,
   return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<column> masked_binary_op(table_view data_view,
-                                         std::string const& binary_udf, 
-                                         data_type output_type, 
-                                         column_view const& outcol_view,
-                                         column_view const& outmsk_view,
-                                         rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> generalized_masked_op(table_view data_view,
+                                              std::string const& binary_udf, 
+                                              data_type output_type, 
+                                              column_view const& outcol_view,
+                                              column_view const& outmsk_view,
+                                              rmm::mr::device_memory_resource* mr)
 {
-  return detail::masked_binary_op_inner(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
+  return detail::generalized_masked_op(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
 }
 
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index d222c461bba..82d45811cb3 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -38,7 +38,7 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
         bool is_ptx
     ) except +
 
-    cdef unique_ptr[column] masked_binary_op(
+    cdef unique_ptr[column] generalized_masked_op(
         table_view data_view,
         string binary_udf,
         data_type output_type,
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 2c61be00131..e1b028282f4 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -142,7 +142,7 @@ def masked_udf(Table incols, op, Column output_column, Column output_mask):
     cdef column_view outmsk_view = output_mask.view()
 
     with nogil:
-        c_output = move(libcudf_transform.masked_binary_op(
+        c_output = move(libcudf_transform.generalized_masked_op(
             data_view,
             c_str,
             c_dtype,

From 3389198d389efb44d5a03368fe758fda3124222a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 4 May 2021 05:49:34 -0700
Subject: [PATCH 032/132] remove old code

---
 python/cudf/cudf/core/udf.py | 88 ------------------------------------
 1 file changed, 88 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 1931ce7e0d0..ddc89d9e2f2 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -152,91 +152,3 @@ def compile_udf(func):
     return kernel, ptx
 
 NA = _NAType()
-
-def make_kernel(n_params):
-    '''
-    create a string containing the right templated kernel
-    for `func`
-    '''
-    
-    indent = ' '*18
-    
-    # Hack together the template string
-    result = ''
-        
-    templates = 'template <typename TypeOut, '
-    for i in range(n_params):
-        templates += f"typename Type{i}, "
-    
-    templates = templates[:-3] + f"{i}>"
-    result += templates
-    
-    # Hack together the function signature
-    sig = '\n__global__\nvoid genop_kernel(cudf::size_type size,\n'
-    sig += indent + "TypeOut* out_data,\n"
-    sig += indent + 'bool* out_mask,\n'
-    for i in range(n_params):
-        sig += indent + f"Type{i}* data_{i},\n"
-        sig += indent + f"cudf::bitmask_type const* mask_{i},\n"
-        sig += indent + f"cudf::size_type offset_{i},\n"
-    sig = sig[:-2] + ') {'
-    
-    result += sig
-    result += '\n'
-    
-    # standard thread block
-    result += '\n'
-    result += '\tint tid = threadIdx.x;\n'
-    result += '\tint blkid = blockIdx.x;\n'
-    result += '\tint blksz = blockDim.x;\n'
-    result += '\tint gridsz = gridDim.x;\n'
-    result += '\tint start = tid + blkid * blksz;\n'
-    result += '\tint step = blksz * gridsz;\n'
-    result += '\n'
-    
-    result += '\tMasked output;\n'
-    
-    for i in range(n_params):
-        result += f"\tchar valid_{i};\n"
-
-    # main loop
-    result += "\tfor (cudf::size_type i=start; i<size; i+=step) {\n"
-    
-    for i in range(n_params):
-        result += f"\t\tvalid_{i} = cudf::bit_is_set(mask_{i}, offset_{i} + i) : true;\n"
-        
-    # genop signature
-    genop_sig = "\t\tGENERIC_OP(&output.value, "
-    for i in range(n_params):
-        genop_sig += f"data_{i}[i], valid_{i}, "
-    
-    genop_sig = genop_sig[:-2] + ');\n'
-    
-    result += genop_sig
-    
-    # set the output
-    result += "\t\tout_data[i] = output.value;\n"
-    result += "\t\tout_mask[i] = output.valid;\n"
-    
-    result += "\t}\n"
-    result += "}"
-    
-    return result
-
-
-demo_kernel = ''' 
-template <typename TypeOut>
-__global__
-void genop_kernel(cudf::size_type size, cudf::size_type value, TypeOut* out_data) {
-
-	int tid = threadIdx.x;
-	int blkid = blockIdx.x;
-	int blksz = blockDim.x;
-	int gridsz = gridDim.x;
-	int start = tid + blkid * blksz;
-	int step = blksz * gridsz;
-
-	for (cudf::size_type i=start; i<size; i+=step) {
-        out_data[i] = value;
-	}
-}'''

From 9425d4bc0509a0ba55d890edaa443f4a2a500e92 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 4 May 2021 11:32:23 -0700
Subject: [PATCH 033/132] remove leftover breakpoint() and stale make_kernel call

---
 python/cudf/cudf/core/dataframe.py | 1 -
 python/cudf/cudf/core/udf.py       | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1559a276029..a0d92caa829 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4716,7 +4716,6 @@ def query(self, expr, local_dict=None):
             return self._apply_boolean_mask(boolmask)
 
     def apply(self, func):
-        breakpoint()
         return super()._apply(func)
 
 
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index ddc89d9e2f2..36a47455887 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -148,7 +148,7 @@ def compile_udf(func):
     ptx, _ = cuda.compile_ptx_for_current_device(func, to_compiler_sig, device=True)
 
     # get the kernel that calls the inlineable function
-    kernel = make_kernel(n_params)
+    kernel = None
     return kernel, ptx
 
 NA = _NAType()

From f7845e518405a8d68c2cabf1adbb23a381a00eb3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 4 May 2021 13:56:27 -0700
Subject: [PATCH 034/132] add a decorator to mimic the pandas api better

---
 python/cudf/cudf/core/dataframe.py | 3 ++-
 python/cudf/cudf/core/udf.py       | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a0d92caa829..69602d8499c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4716,7 +4716,8 @@ def query(self, expr, local_dict=None):
             return self._apply_boolean_mask(boolmask)
 
     def apply(self, func):
-        return super()._apply(func)
+        return func(self)
+        #return super()._apply(func)
 
 
     @applyutils.doc_apply()
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 36a47455887..012cf94d65d 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -152,3 +152,10 @@ def compile_udf(func):
     return kernel, ptx
 
 NA = _NAType()
+
+def nulludf(func):
+    def wrapper(*args):
+        from cudf import DataFrame
+        to_udf_table = DataFrame({idx: arg for idx, arg in zip(range(len(args)), args)})
+        return to_udf_table._apply(func)
+    return wrapper

From 9e89ebde14c7510c66ab065adc0bb65b92b1b059 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 4 May 2021 14:11:46 -0700
Subject: [PATCH 035/132] starting to write tests

---
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/cudf/cudf/tests/test_udf_masked_ops.py | 74 +++++++++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/tests/test_udf_masked_ops.py

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 69602d8499c..2907d541403 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4715,7 +4715,7 @@ def query(self, expr, local_dict=None):
             boolmask = queryutils.query_execute(self, expr, callenv)
             return self._apply_boolean_mask(boolmask)
 
-    def apply(self, func):
+    def apply(self, func, axis=1):
         return func(self)
         #return super()._apply(func)
 
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
new file mode 100644
index 00000000000..b5b39aced20
--- /dev/null
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -0,0 +1,74 @@
+import cudf
+from cudf.core.udf import nulludf
+from cudf.tests.utils import assert_eq
+import pandas as pd
+
+
+def test_apply_basic():
+    def func_pdf(x, y):
+        return x + y
+
+    @nulludf
+    def func_gdf(x, y):
+        return x + y
+
+
+    gdf = cudf.DataFrame({
+        'a':[1,2,3],
+        'b':[4,5,6]
+    })
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+
+    assert_eq(expect, obtain)
+
+def test_apply_null():
+    def func_pdf(x, y):
+        return x + y
+
+    @nulludf
+    def func_gdf(x, y):
+        return x + y
+
+
+    gdf = cudf.DataFrame({
+        'a':[1,None,3, None],
+        'b':[4,5,None, None]
+    })
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+
+    assert_eq(expect, obtain)
+
+def test_apply_NA_conditional():
+    def func_pdf(x, y):
+        if x is pd.NA:
+            return y
+        else:
+            return x + y
+
+    @nulludf
+    def func_gdf(x, y):
+        if x is cudf.NA:
+            return y
+        else:
+            return x + y
+
+
+    gdf = cudf.DataFrame({
+        'a':[1,None,3, None],
+        'b':[4,5,None, None]
+    })
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+
+    assert_eq(expect, obtain)

From e19c8ba96e0b0252f38818ee1633db0c0f6d140d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 5 May 2021 12:47:58 -0700
Subject: [PATCH 036/132] add typing and lowering for Masked + integer constant

---
 python/cudf/cudf/core/udf.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 012cf94d65d..a3114461d22 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -30,7 +30,7 @@ def __init__(self, value, valid):
 
 
 class MaskedType(types.Type):
-    def __init__(self):
+    def __init__(self): # add `value`
         super().__init__(name="Masked")
 
 class NAType(types.Type):
@@ -64,11 +64,15 @@ def typer(value, valid):
 @register_model(MaskedType)
 class MaskedModel(models.StructModel):
     def __init__(self, dmm, fe_type):
+        # fe_type is a Maskedtype instance
+        # will have a .value attr
+        # value -> fe_type.value 
         members = [("value", types.int64), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
 register_model(NAType)(models.OpaqueModel)
 
+# types.int64 instance, change to typeclass (types.Integer, types.Number, etc)
 @lower_builtin(Masked, types.int64, types.bool_)
 def impl_masked_constructor(context, builder, sig, args):
     typ = sig.return_type
@@ -85,6 +89,8 @@ class MaskedScalarAdd(AbstractTemplate):
     # abstracttemplate vs concretetemplate
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
+            # result type: f(args[0], args[1], op) where f is numba's typing for self.key
+            # self.key -> operator being used
             return nb_signature(numba_masked, numba_masked, numba_masked)
 
 
@@ -113,6 +119,7 @@ def masked_scalar_add_impl(context, builder, sig, args):
     valid = builder.and_(m1.valid, m2.valid)
     result.valid = valid
     with builder.if_then(valid):
+        # result.value = numba_op(m1.value, m2.value)
         result.value = builder.add(m1.value, m2.value)
 
     return result._getvalue()
@@ -131,6 +138,28 @@ def constant_dummy(context, builder, ty, pyval):
     # This handles None, etc.
     return context.get_dummy_value()
 
+@cuda_registry.register_global(operator.add)
+class MaskedScalarAddConstant(AbstractTemplate):
+    def generic(self, args, kws):
+        if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):
+            return nb_signature(numba_masked, numba_masked, types.int64)
+
+@cuda_lower(operator.add, MaskedType, types.Integer)
+def masked_scalar_add_constant_impl(context, builder, sig, input_values):
+    masked_type, const_type = sig.args
+
+    indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=input_values[0])
+    result = cgutils.create_struct_proxy(numba_masked)(context, builder)
+    #to_add_const = context.get_constant(const_type, input_values[1])
+
+    result.valid = context.get_constant(types.boolean, 0)
+    with builder.if_then(indata.valid):
+        result.value = builder.add(indata.value, input_values[1])
+        result.valid = context.get_constant(types.boolean, 1)
+
+    return result._getvalue()
+
+
 
 @cuda.jit(numba_masked(numba_masked, numba_masked), device=True)
 def masked_add_py(m1, m2):

From 9880081495c9fa520f02a968f4f5c123044230db Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 5 May 2021 12:48:21 -0700
Subject: [PATCH 037/132] add failing tests for literal return

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index b5b39aced20..a8475a1849f 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -72,3 +72,40 @@ def func_gdf(x, y):
     obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
 
     assert_eq(expect, obtain)
+
+
+def test_apply_return_literal():
+    # 1. Casting rule literal -> Masked
+    #  -> a) make it so numba knows that we can even promote literals to Masked ()
+    #  -> b) implement custom lowering to specify how this actually happens (python only)
+
+
+    # 2. Custom unfication code
+
+
+    # numba/core/type
+    def func_pdf(x, y):
+        if x is pd.NA:
+            return 5
+        else:
+            return x + y
+
+    @nulludf
+    def func_gdf(x, y):
+        if x is cudf.NA:
+            return 5 # Masked(5, True)
+        else:
+            return x + y
+
+
+    gdf = cudf.DataFrame({
+        'a':[1,None,3, None],
+        'b':[4,5,None, None]
+    })
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+
+    assert_eq(expect, obtain)

From 3e6a28098eaf906d34c014651773e4c3e1a36e43 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 7 May 2021 13:40:33 -0700
Subject: [PATCH 038/132] add NA and add constant tests

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index a8475a1849f..fc0de1a4292 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -43,7 +43,49 @@ def func_gdf(x, y):
 
     expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
     obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+    assert_eq(expect, obtain)
+
+
+def test_apply_add_null():
+    def func_pdf(x, y):
+        return x + y + pd.NA
+
+    @nulludf
+    def func_gdf(x, y):
+        return x + y + cudf.NA
+
+
+    gdf = cudf.DataFrame({
+        'a':[1,None,3, None],
+        'b':[4,5,None, None]
+    })
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+    # TODO: dtype mismatch here
+    assert_eq(expect, obtain, check_dtype=False)
+
 
+def test_apply_add_constant():
+    def func_pdf(x, y):
+        return x + y + 1
+
+    @nulludf
+    def func_gdf(x, y):
+        return x + y + 1
+
+
+    gdf = cudf.DataFrame({
+        'a':[1,None,3, None],
+        'b':[4,5,None, None]
+    })
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
     assert_eq(expect, obtain)
 
 def test_apply_NA_conditional():

From 3028dbad10980ff039f2450ae7363c97e49e5d62 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 7 May 2021 13:40:47 -0700
Subject: [PATCH 039/132] MaskedType is parameterized

---
 python/cudf/cudf/core/udf.py | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index a3114461d22..b185e25bdf9 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -30,19 +30,23 @@ def __init__(self, value, valid):
 
 
 class MaskedType(types.Type):
-    def __init__(self): # add `value`
+    def __init__(self, value): # add `value`
+        self.value = value
         super().__init__(name="Masked")
+    
+    def __repr__(self):
+        return f"MaskedType({self.value})"
 
 class NAType(types.Type):
     def __init__(self):
         super().__init__(name="NA")
 
-numba_masked = MaskedType()  # name this something more natural - GM
+numba_masked = MaskedType(types.int64)  # name this something more natural - GM
 numba_na = NAType()
 
 @typeof_impl.register(Masked)
 def typeof_masked(val, c):
-    return numba_masked
+    return Masked(val.value)
 
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):
@@ -54,7 +58,7 @@ def typer(value, valid):
         if isinstance(value, types.Integer) and isinstance(
             valid, types.Boolean
         ):
-            return numba_masked
+            return  Masked(value)
 
     return typer
 
@@ -91,14 +95,14 @@ def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
             # result type: f(args[0], args[1], op) where f is numba's typing for self.key
             # self.key -> operator being used
-            return nb_signature(numba_masked, numba_masked, numba_masked)
+            return nb_signature(numba_masked, MaskedType(args[0].value), MaskedType(args[1].value))
 
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            return nb_signature(numba_masked, numba_masked, numba_na)
+            return nb_signature(MaskedType(args[0].value), MaskedType(args[0].value), numba_na)
 
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
@@ -127,9 +131,9 @@ def masked_scalar_add_impl(context, builder, sig, args):
 
 @cuda_lower(operator.add, MaskedType, NAType)
 def masked_scalar_add_na_impl(context, builder, sig, args):
-#    return_type = sig.return_type
+    return_type = sig.return_type
     # use context to get llvm type for a bool
-    result = cgutils.create_struct_proxy(numba_masked)(context, builder)
+    result = cgutils.create_struct_proxy(MaskedType(return_type.value))(context, builder)
     result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
 
@@ -142,14 +146,15 @@ def constant_dummy(context, builder, ty, pyval):
 class MaskedScalarAddConstant(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):
-            return nb_signature(numba_masked, numba_masked, types.int64)
+            # TODO - need to get the result type of args[0].value and args[1]
+            return nb_signature(MaskedType(args[0].value), MaskedType(args[0].value), types.int64)
 
 @cuda_lower(operator.add, MaskedType, types.Integer)
 def masked_scalar_add_constant_impl(context, builder, sig, input_values):
     masked_type, const_type = sig.args
 
-    indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=input_values[0])
-    result = cgutils.create_struct_proxy(numba_masked)(context, builder)
+    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value))(context, builder, value=input_values[0])
+    result = cgutils.create_struct_proxy(MaskedType(masked_type.value))(context, builder)
     #to_add_const = context.get_constant(const_type, input_values[1])
 
     result.valid = context.get_constant(types.boolean, 0)
@@ -160,15 +165,6 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
     return result._getvalue()
 
 
-
-@cuda.jit(numba_masked(numba_masked, numba_masked), device=True)
-def masked_add_py(m1, m2):
-    return m1 + m2
-
-
-def masked_add_py_2(m1, m2):
-    return m1 + m2
-
 def compile_udf(func):
     n_params = len(py_signature(func).parameters)
     to_compiler_sig = tuple(numba_masked for arg in range(n_params))

From ecd8527815b2c12d959ea2e6e0ade937d11e4604 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 7 May 2021 14:03:39 -0700
Subject: [PATCH 040/132] forward progress on mixed typing

---
 python/cudf/cudf/core/udf.py | 38 ++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index b185e25bdf9..6c8d7837d93 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -30,12 +30,14 @@ def __init__(self, value, valid):
 
 
 class MaskedType(types.Type):
-    def __init__(self, value): # add `value`
-        self.value = value
+    def __init__(self, value):
+        # MaskedType in numba shall be parameterized
+        # with a value type
+        self.value_type = value
         super().__init__(name="Masked")
     
     def __repr__(self):
-        return f"MaskedType({self.value})"
+        return f"MaskedType({self.value_type})"
 
 class NAType(types.Type):
     def __init__(self):
@@ -68,10 +70,7 @@ def typer(value, valid):
 @register_model(MaskedType)
 class MaskedModel(models.StructModel):
     def __init__(self, dmm, fe_type):
-        # fe_type is a Maskedtype instance
-        # will have a .value attr
-        # value -> fe_type.value 
-        members = [("value", types.int64), ("valid", types.bool_)]
+        members = [("value", fe_type.value_type), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
 register_model(NAType)(models.OpaqueModel)
@@ -93,16 +92,25 @@ class MaskedScalarAdd(AbstractTemplate):
     # abstracttemplate vs concretetemplate
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
-            # result type: f(args[0], args[1], op) where f is numba's typing for self.key
-            # self.key -> operator being used
-            return nb_signature(numba_masked, MaskedType(args[0].value), MaskedType(args[1].value))
+            # In the case of op(Masked, Masked), the return type is a Masked
+            # such that Masked.value 
+            return_type = self.context.resolve_function_type(
+                self.key,
+                (
+                    args[0].value_type,
+                    args[1].value_type
+                ),
+                kws
+            ).return_type
+            return nb_signature(MaskedType(return_type), MaskedType(args[0].value_type), MaskedType(args[1].value_type))
 
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            return nb_signature(MaskedType(args[0].value), MaskedType(args[0].value), numba_na)
+            # in the case
+            return nb_signature(MaskedType(args[0].value_type), MaskedType(args[0].value_type), numba_na)
 
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
@@ -133,7 +141,7 @@ def masked_scalar_add_impl(context, builder, sig, args):
 def masked_scalar_add_na_impl(context, builder, sig, args):
     return_type = sig.return_type
     # use context to get llvm type for a bool
-    result = cgutils.create_struct_proxy(MaskedType(return_type.value))(context, builder)
+    result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(context, builder)
     result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
 
@@ -147,14 +155,14 @@ class MaskedScalarAddConstant(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):
             # TODO - need to get the result type of args[0].value and args[1]
-            return nb_signature(MaskedType(args[0].value), MaskedType(args[0].value), types.int64)
+            return nb_signature(MaskedType(args[0].value_type), MaskedType(args[0].value_type), types.int64)
 
 @cuda_lower(operator.add, MaskedType, types.Integer)
 def masked_scalar_add_constant_impl(context, builder, sig, input_values):
     masked_type, const_type = sig.args
 
-    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value))(context, builder, value=input_values[0])
-    result = cgutils.create_struct_proxy(MaskedType(masked_type.value))(context, builder)
+    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(context, builder, value=input_values[0])
+    result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(context, builder)
     #to_add_const = context.get_constant(const_type, input_values[1])
 
     result.valid = context.get_constant(types.boolean, 0)

From 5791413dda756e8c82380c72ca653a57bd1ff492 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 7 May 2021 14:07:39 -0700
Subject: [PATCH 041/132] generalize MaskedScalarAddConstant

---
 python/cudf/cudf/core/udf.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 6c8d7837d93..7261c36cf27 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -109,8 +109,10 @@ def generic(self, args, kws):
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            # in the case
-            return nb_signature(MaskedType(args[0].value_type), MaskedType(args[0].value_type), numba_na)
+            # In the case of op(Masked, NA), the result has the same 
+            # dtype as the original regardless of what it is
+            return_type = args[0].value_type
+            return nb_signature(MaskedType(return_type), MaskedType(args[0].value_type), numba_na)
 
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
@@ -154,8 +156,17 @@ def constant_dummy(context, builder, ty, pyval):
 class MaskedScalarAddConstant(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):
-            # TODO - need to get the result type of args[0].value and args[1]
-            return nb_signature(MaskedType(args[0].value_type), MaskedType(args[0].value_type), types.int64)
+            # In the case of op(Masked, constant), we resolve the type between
+            # the Masked value_type and the constant's type directly
+            return_type = self.context.resolve_function_type(
+                self.key,
+                (
+                    args[0].value_type,
+                    args[1]
+                ),
+                kws
+            ).return_type
+            return nb_signature(MaskedType(return_type), MaskedType(args[0].value_type), args[1])
 
 @cuda_lower(operator.add, MaskedType, types.Integer)
 def masked_scalar_add_constant_impl(context, builder, sig, input_values):

From 77c8ee4971fb00ddeede28415df55e3a8c57b3ec Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 7 May 2021 14:25:03 -0700
Subject: [PATCH 042/132] write a signature for any incoming types

---
 python/cudf/cudf/core/frame.py |  5 +----
 python/cudf/cudf/core/udf.py   | 10 +++++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index e0d2746f22c..c419021e651 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1421,10 +1421,7 @@ def _quantiles(
     def _apply(self, func):
         from cudf.core.udf import compile_udf
 
-        if not all(np.dtype('int64') == dtype for dtype in self.dtypes):
-            raise TypeError("Currently only int64 is supported")
-
-        kernel, ptx = compile_udf(func)
+        kernel, ptx = compile_udf(func, self.dtypes)
 
         output_column = cudf.core.column.column_empty(row_count=len(self), dtype='int64')
         output_mask = cudf.core.column.column_empty(row_count=len(self), dtype='bool')
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 7261c36cf27..92233854249 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -18,6 +18,8 @@
 from numba.extending import types
 import inspect
 
+from numba.np import numpy_support
+
 from llvmlite import ir
 from cudf.core.scalar import _NAType
 
@@ -76,6 +78,7 @@ def __init__(self, dmm, fe_type):
 register_model(NAType)(models.OpaqueModel)
 
 # types.int64 instance, change to typeclass (types.Integer, types.Number, etc)
+'''
 @lower_builtin(Masked, types.int64, types.bool_)
 def impl_masked_constructor(context, builder, sig, args):
     typ = sig.return_type
@@ -85,7 +88,7 @@ def impl_masked_constructor(context, builder, sig, args):
     masked.value = value
     masked.valid = valid
     return masked._getvalue()  # return a pointer to the struct I created
-
+'''
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAdd(AbstractTemplate):
@@ -184,10 +187,11 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
     return result._getvalue()
 
 
-def compile_udf(func):
+def compile_udf(func, dtypes):
     n_params = len(py_signature(func).parameters)
-    to_compiler_sig = tuple(numba_masked for arg in range(n_params))
+    
 
+    to_compiler_sig = tuple(MaskedType(arg) for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes))
     # Get the inlineable PTX function
     ptx, _ = cuda.compile_ptx_for_current_device(func, to_compiler_sig, device=True)
 

From 85f1fbac525888f4acb397c92f515e6f724836d8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 7 May 2021 14:25:49 -0700
Subject: [PATCH 043/132] reformat code

---
 python/cudf/cudf/core/udf.py | 99 +++++++++++++++++++++++++-----------
 1 file changed, 68 insertions(+), 31 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 92233854249..cb71b905447 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -14,7 +14,10 @@
 from inspect import signature as py_signature
 from numba.core.typing.templates import AbstractTemplate
 from numba.cuda.cudadecl import registry as cuda_registry
-from numba.cuda.cudaimpl import lower as cuda_lower, registry as cuda_lowering_registry
+from numba.cuda.cudaimpl import (
+    lower as cuda_lower,
+    registry as cuda_lowering_registry,
+)
 from numba.extending import types
 import inspect
 
@@ -25,6 +28,7 @@
 
 from numba.core.extending import make_attribute_wrapper
 
+
 class Masked(object):
     def __init__(self, value, valid):
         self.value = value
@@ -37,48 +41,56 @@ def __init__(self, value):
         # with a value type
         self.value_type = value
         super().__init__(name="Masked")
-    
+
     def __repr__(self):
         return f"MaskedType({self.value_type})"
 
+
 class NAType(types.Type):
     def __init__(self):
         super().__init__(name="NA")
 
+
 numba_masked = MaskedType(types.int64)  # name this something more natural - GM
 numba_na = NAType()
 
+
 @typeof_impl.register(Masked)
 def typeof_masked(val, c):
     return Masked(val.value)
 
+
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):
     return numba_na
 
+
 @type_callable(Masked)
 def type_masked(context):
     def typer(value, valid):
         if isinstance(value, types.Integer) and isinstance(
             valid, types.Boolean
         ):
-            return  Masked(value)
+            return Masked(value)
 
     return typer
 
+
 make_attribute_wrapper(MaskedType, "value", "value")
 make_attribute_wrapper(MaskedType, "valid", "valid")
 
+
 @register_model(MaskedType)
 class MaskedModel(models.StructModel):
     def __init__(self, dmm, fe_type):
         members = [("value", fe_type.value_type), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
+
 register_model(NAType)(models.OpaqueModel)
 
 # types.int64 instance, change to typeclass (types.Integer, types.Number, etc)
-'''
+"""
 @lower_builtin(Masked, types.int64, types.bool_)
 def impl_masked_constructor(context, builder, sig, args):
     typ = sig.return_type
@@ -88,7 +100,8 @@ def impl_masked_constructor(context, builder, sig, args):
     masked.value = value
     masked.valid = valid
     return masked._getvalue()  # return a pointer to the struct I created
-'''
+"""
+
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAdd(AbstractTemplate):
@@ -96,26 +109,30 @@ class MaskedScalarAdd(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
             # In the case of op(Masked, Masked), the return type is a Masked
-            # such that Masked.value 
+            # such that Masked.value
             return_type = self.context.resolve_function_type(
-                self.key,
-                (
-                    args[0].value_type,
-                    args[1].value_type
-                ),
-                kws
+                self.key, (args[0].value_type, args[1].value_type), kws
             ).return_type
-            return nb_signature(MaskedType(return_type), MaskedType(args[0].value_type), MaskedType(args[1].value_type))
+            return nb_signature(
+                MaskedType(return_type),
+                MaskedType(args[0].value_type),
+                MaskedType(args[1].value_type),
+            )
 
 
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            # In the case of op(Masked, NA), the result has the same 
+            # In the case of op(Masked, NA), the result has the same
             # dtype as the original regardless of what it is
             return_type = args[0].value_type
-            return nb_signature(MaskedType(return_type), MaskedType(args[0].value_type), numba_na)
+            return nb_signature(
+                MaskedType(return_type),
+                MaskedType(args[0].value_type),
+                numba_na,
+            )
+
 
 @cuda_lower(operator.add, MaskedType, MaskedType)
 def masked_scalar_add_impl(context, builder, sig, args):
@@ -146,38 +163,48 @@ def masked_scalar_add_impl(context, builder, sig, args):
 def masked_scalar_add_na_impl(context, builder, sig, args):
     return_type = sig.return_type
     # use context to get llvm type for a bool
-    result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(context, builder)
+    result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
+        context, builder
+    )
     result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
 
+
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_dummy(context, builder, ty, pyval):
     # This handles None, etc.
     return context.get_dummy_value()
 
+
 @cuda_registry.register_global(operator.add)
 class MaskedScalarAddConstant(AbstractTemplate):
     def generic(self, args, kws):
-        if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):
+        if isinstance(args[0], MaskedType) and isinstance(
+            args[1], types.Integer
+        ):
             # In the case of op(Masked, constant), we resolve the type between
             # the Masked value_type and the constant's type directly
             return_type = self.context.resolve_function_type(
-                self.key,
-                (
-                    args[0].value_type,
-                    args[1]
-                ),
-                kws
+                self.key, (args[0].value_type, args[1]), kws
             ).return_type
-            return nb_signature(MaskedType(return_type), MaskedType(args[0].value_type), args[1])
+            return nb_signature(
+                MaskedType(return_type),
+                MaskedType(args[0].value_type),
+                args[1],
+            )
+
 
 @cuda_lower(operator.add, MaskedType, types.Integer)
 def masked_scalar_add_constant_impl(context, builder, sig, input_values):
     masked_type, const_type = sig.args
 
-    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(context, builder, value=input_values[0])
-    result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(context, builder)
-    #to_add_const = context.get_constant(const_type, input_values[1])
+    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+        context, builder, value=input_values[0]
+    )
+    result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+        context, builder
+    )
+    # to_add_const = context.get_constant(const_type, input_values[1])
 
     result.valid = context.get_constant(types.boolean, 0)
     with builder.if_then(indata.valid):
@@ -189,21 +216,31 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
 
 def compile_udf(func, dtypes):
     n_params = len(py_signature(func).parameters)
-    
 
-    to_compiler_sig = tuple(MaskedType(arg) for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes))
+    to_compiler_sig = tuple(
+        MaskedType(arg)
+        for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)
+    )
     # Get the inlineable PTX function
-    ptx, _ = cuda.compile_ptx_for_current_device(func, to_compiler_sig, device=True)
+    ptx, _ = cuda.compile_ptx_for_current_device(
+        func, to_compiler_sig, device=True
+    )
 
     # get the kernel that calls the inlineable function
     kernel = None
     return kernel, ptx
 
+
 NA = _NAType()
 
+
 def nulludf(func):
     def wrapper(*args):
         from cudf import DataFrame
-        to_udf_table = DataFrame({idx: arg for idx, arg in zip(range(len(args)), args)})
+
+        to_udf_table = DataFrame(
+            {idx: arg for idx, arg in zip(range(len(args)), args)}
+        )
         return to_udf_table._apply(func)
+
     return wrapper

From 22c220c8c64eb46e6652d6ae0ed6e9c7391aa4a9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sat, 8 May 2021 14:24:12 -0700
Subject: [PATCH 044/132] need a separate __hash__ for different MaskedType

---
 python/cudf/cudf/core/udf.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index cb71b905447..519ef43b31b 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -39,19 +39,20 @@ class MaskedType(types.Type):
     def __init__(self, value):
         # MaskedType in numba shall be parameterized
         # with a value type
-        self.value_type = value
         super().__init__(name="Masked")
+        self.value_type = value
 
     def __repr__(self):
         return f"MaskedType({self.value_type})"
 
+    def __hash__(self):
+        return self.__repr__().__hash__()
 
 class NAType(types.Type):
     def __init__(self):
         super().__init__(name="NA")
 
 
-numba_masked = MaskedType(types.int64)  # name this something more natural - GM
 numba_na = NAType()
 
 
@@ -216,7 +217,7 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
 
 def compile_udf(func, dtypes):
     n_params = len(py_signature(func).parameters)
-
+    breakpoint()
     to_compiler_sig = tuple(
         MaskedType(arg)
         for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)

From 1ba3338a273cefcdd721c4621836d428bfa6389f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 10 May 2021 11:21:06 -0700
Subject: [PATCH 045/132] first sign that mixed typing works end to end

---
 cpp/src/transform/jit/masked_udf_kernel.cu |  5 +++--
 python/cudf/cudf/_lib/transform.pyx        |  2 +-
 python/cudf/cudf/core/frame.py             |  4 ++--
 python/cudf/cudf/core/udf.py               | 16 ++++++++++------
 4 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index b3bcb0d53fc..d384ffc61b1 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -38,8 +38,9 @@ namespace cudf {
 namespace transformation {
 namespace jit {
 
+template <typename T>
 struct Masked {
-  int64_t value;
+  T value;
   bool valid;
 };
 
@@ -82,7 +83,7 @@ void generic_udf_kernel(cudf::size_type size,
     int start = tid + blkid * blksz;
     int step = blksz * gridsz;
 
-    Masked output;
+    Masked<TypeOut> output;
     for (cudf::size_type i=start; i<size; i+=step) {
       auto func_args = cuda::std::tuple_cat(
           cuda::std::make_tuple(&output.value),
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index e1b028282f4..821136bbc1d 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -134,7 +134,7 @@ def masked_udf(Table incols, op, Column output_column, Column output_mask):
     cdef data_type c_dtype
 
     c_tid = <type_id> (
-        <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]
+        <underlying_type_t_type_id> np_to_cudf_types[output_column.dtype]
     )
     c_dtype = data_type(c_tid)
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index c419021e651..b8151e804e9 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1421,9 +1421,9 @@ def _quantiles(
     def _apply(self, func):
         from cudf.core.udf import compile_udf
 
-        kernel, ptx = compile_udf(func, self.dtypes)
+        output_dtype, ptx = compile_udf(func, self.dtypes)
 
-        output_column = cudf.core.column.column_empty(row_count=len(self), dtype='int64')
+        output_column = cudf.core.column.column_empty(row_count=len(self), dtype=output_dtype)
         output_mask = cudf.core.column.column_empty(row_count=len(self), dtype='bool')
 
         result = cudf._lib.transform.masked_udf(self, ptx, output_column, output_mask)
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 519ef43b31b..82415645f24 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -155,7 +155,11 @@ def masked_scalar_add_impl(context, builder, sig, args):
     result.valid = valid
     with builder.if_then(valid):
         # result.value = numba_op(m1.value, m2.value)
-        result.value = builder.add(m1.value, m2.value)
+        result.value = context.compile_internal(
+            builder, 
+            lambda x, y: x + y, 
+            nb_signature(masked_return_type.value_type, masked_type_1.value_type, masked_type_2.value_type), (m1.value, m2.value)
+        )
 
     return result._getvalue()
 
@@ -217,19 +221,19 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
 
 def compile_udf(func, dtypes):
     n_params = len(py_signature(func).parameters)
-    breakpoint()
     to_compiler_sig = tuple(
         MaskedType(arg)
         for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)
     )
     # Get the inlineable PTX function
-    ptx, _ = cuda.compile_ptx_for_current_device(
+    ptx, numba_output_type = cuda.compile_ptx_for_current_device(
         func, to_compiler_sig, device=True
     )
+    numpy_output_type = numpy_support.as_dtype(
+        numba_output_type.value_type
+    )
 
-    # get the kernel that calls the inlineable function
-    kernel = None
-    return kernel, ptx
+    return numpy_output_type, ptx
 
 
 NA = _NAType()

From be062287af6e7978287ddd0a6b2f0b7e132c2aa1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 10 May 2021 12:02:38 -0700
Subject: [PATCH 046/132] add tests for columns of mixed data type

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 32 +++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index fc0de1a4292..252ddd24088 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -1,8 +1,9 @@
 import cudf
 from cudf.core.udf import nulludf
-from cudf.tests.utils import assert_eq
+from cudf.tests.utils import assert_eq, NUMERIC_TYPES
 import pandas as pd
-
+import itertools
+import pytest
 
 def test_apply_basic():
     def func_pdf(x, y):
@@ -116,6 +117,33 @@ def func_gdf(x, y):
     assert_eq(expect, obtain)
 
 
+@pytest.mark.parametrize('dtype_a', list(NUMERIC_TYPES))
+@pytest.mark.parametrize('dtype_b', list(NUMERIC_TYPES))
+def test_apply_mixed_dtypes(dtype_a, dtype_b):
+    def func_pdf(x, y):
+        return x + y
+    
+    @nulludf
+    def func_gdf(x, y):
+        return x + y
+
+    gdf = cudf.DataFrame({
+        'a':[1.5,None,3, None],
+        'b':[4,5,None, None]
+    })
+    gdf['a'] = gdf['a'].astype(dtype_a)
+    gdf['b'] = gdf['b'].astype(dtype_b)
+
+    pdf = gdf.to_pandas()
+
+    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
+    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
+
+    # currently, cases where one side is float32 fail, pandas doing some
+    # weird casting here and getting float64 always
+    assert_eq(expect, obtain)
+
+
 def test_apply_return_literal():
     # 1. Casting rule literal -> Masked
     #  -> a) make it so numba knows that we can even promote literals to Masked ()

From 029203b5fb34ab0070cc5e2f8b0ff0f5c4ba67b6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 10 May 2021 12:25:39 -0700
Subject: [PATCH 047/132] incorporate Graham's custom unification of
 extension types with literals

---
 python/cudf/cudf/core/udf.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 82415645f24..5726df553c8 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -13,7 +13,9 @@
 from numba.core.typing import signature as nb_signature
 from inspect import signature as py_signature
 from numba.core.typing.templates import AbstractTemplate
-from numba.cuda.cudadecl import registry as cuda_registry
+from numba.cuda.cudadecl import registry as cuda_decl_registry
+from numba.cuda.cudaimpl import registry as cuda_impl_registry
+
 from numba.cuda.cudaimpl import (
     lower as cuda_lower,
     registry as cuda_lowering_registry,
@@ -48,6 +50,14 @@ def __repr__(self):
     def __hash__(self):
         return self.__repr__().__hash__()
 
+    def unify(self, context, other):
+        breakpoint()
+        unified = context.unify_pairs(self.value_type, other)
+        if unified is None:
+            return None
+
+        return MaskedType(unified)
+
 class NAType(types.Type):
     def __init__(self):
         super().__init__(name="NA")
@@ -104,7 +114,7 @@ def impl_masked_constructor(context, builder, sig, args):
 """
 
 
-@cuda_registry.register_global(operator.add)
+@cuda_decl_registry.register_global(operator.add)
 class MaskedScalarAdd(AbstractTemplate):
     # abstracttemplate vs concretetemplate
     def generic(self, args, kws):
@@ -121,7 +131,7 @@ def generic(self, args, kws):
             )
 
 
-@cuda_registry.register_global(operator.add)
+@cuda_decl_registry.register_global(operator.add)
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
@@ -181,7 +191,7 @@ def constant_dummy(context, builder, ty, pyval):
     return context.get_dummy_value()
 
 
-@cuda_registry.register_global(operator.add)
+@cuda_decl_registry.register_global(operator.add)
 class MaskedScalarAddConstant(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(
@@ -218,6 +228,15 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
 
     return result._getvalue()
 
+# To handle the unification, we need to support casting from any type to an
+# extension type. The cast implementation takes the value passed in and returns
+# an extension struct wrapping that value.
+@cuda_impl_registry.lower_cast(types.Any, MaskedType)
+def cast_primitive_to_extension(context, builder, fromty, toty, val):
+    casted = context.cast(builder, val, fromty, toty.value_type)
+    ext = cgutils.create_struct_proxy(toty)(context, builder)
+    ext.value = casted
+    return ext._getvalue()
 
 def compile_udf(func, dtypes):
     n_params = len(py_signature(func).parameters)

From f024bf76ea2895b159c4fedce5b5d35879d25635 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 10 May 2021 13:09:53 -0700
Subject: [PATCH 048/132] unify MaskedType and NAType and add a huge comment

---
 python/cudf/cudf/core/udf.py | 50 ++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 5726df553c8..29f06744593 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -51,7 +51,53 @@ def __hash__(self):
         return self.__repr__().__hash__()
 
     def unify(self, context, other):
-        breakpoint()
+        '''
+        Logic for sorting out what to do when the UDF conditionally
+        returns a `MaskedType`, an `NAType`, or a literal based off 
+        the data at runtime.
+
+        In this framework, every input column is treated as having
+        type `MaskedType`. Operations like `x + y` are understood 
+        as translating to:
+
+        `Masked(value=x, valid=True) + Masked(value=y, valid=True)`
+
+        This means if the user writes a function such as 
+        def f(x, y):
+            return x + y
+            
+        numba sees this function as:
+        f(x: MaskedType, y: MaskedType) -> MaskedType
+        
+        However if the user writes something like:
+        def f(x, y):
+            if x > 5:
+                return 42
+            else:
+                return x + y
+        
+        numba now sees this as
+        f(x: MaskedType, y: MaskedType) -> MaskedType OR literal 
+
+        In general, when numba sees a function that could return
+        more than a single type, it invokes unification to find a
+        common type that can hold all possible results, eg given 
+        a function that could return an int8 or an int16, numba 
+        will resolve the type to int16. However it does not know
+        how to unify MaskedType with primitive types. We need to 
+        actually define the common type between MaskedType and
+        literals, as well as between a Masked and an NA. For full
+        generality, we'd need to unify NA and literal as well. 
+        '''
+        
+        # If we have Masked and NA, the output should be a 
+        # MaskedType with the original type as its value_type
+        if isinstance(other, NAType):
+            return MaskedType(self.value_type)
+
+        # if we have MaskedType and Literal, the output should be
+        # determined from the MaskedType.value_type (which is a 
+        # primitive type) and other
         unified = context.unify_pairs(self.value_type, other)
         if unified is None:
             return None
@@ -232,7 +278,7 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
 # extension type. The cast implementation takes the value passed in and returns
 # an extension struct wrapping that value.
 @cuda_impl_registry.lower_cast(types.Any, MaskedType)
-def cast_primitive_to_extension(context, builder, fromty, toty, val):
+def cast_primitive_to_masked(context, builder, fromty, toty, val):
     casted = context.cast(builder, val, fromty, toty.value_type)
     ext = cgutils.create_struct_proxy(toty)(context, builder)
     ext.value = casted

From 2ef4520bb46b1a54588377ae6936aa433f6fe1e3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 10 May 2021 14:17:13 -0700
Subject: [PATCH 049/132] Questionable unification of Masked with Literal

---
 python/cudf/cudf/core/udf.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 29f06744593..1bd6e9bef12 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -108,6 +108,13 @@ class NAType(types.Type):
     def __init__(self):
         super().__init__(name="NA")
 
+    def unify(self, context, other):
+        '''
+        Masked  <-> NA works from above
+        Literal <-> NA -> Masked
+        '''
+        if isinstance(other, types.abstract.Literal):
+            return MaskedType(other.literal_type)
 
 numba_na = NAType()
 
@@ -284,6 +291,13 @@ def cast_primitive_to_masked(context, builder, fromty, toty, val):
     ext.value = casted
     return ext._getvalue()
 
+@cuda_impl_registry.lower_cast(NAType, MaskedType)
+def cast_na_to_masked(context, builder, fromty, toty, val):
+    result = cgutils.create_struct_proxy(toty)(context, builder)
+    result.valid = context.get_constant(types.boolean, 0)
+
+    return result._getvalue()
+
 def compile_udf(func, dtypes):
     n_params = len(py_signature(func).parameters)
     to_compiler_sig = tuple(

From 90d5127c83e28050c4268ca33b4da3ecfb89e11c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 11 May 2021 07:02:52 -0700
Subject: [PATCH 050/132] removed unused code

---
 python/cudf/cudf/core/udf.py | 47 ++----------------------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
index 1bd6e9bef12..9f2cd0b412c 100644
--- a/python/cudf/cudf/core/udf.py
+++ b/python/cudf/cudf/core/udf.py
@@ -30,13 +30,6 @@
 
 from numba.core.extending import make_attribute_wrapper
 
-
-class Masked(object):
-    def __init__(self, value, valid):
-        self.value = value
-        self.valid = valid
-
-
 class MaskedType(types.Type):
     def __init__(self, value):
         # MaskedType in numba shall be parameterized
@@ -116,32 +109,9 @@ def unify(self, context, other):
         if isinstance(other, types.abstract.Literal):
             return MaskedType(other.literal_type)
 
-numba_na = NAType()
-
-
-@typeof_impl.register(Masked)
-def typeof_masked(val, c):
-    return Masked(val.value)
-
-
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):
-    return numba_na
-
-
-@type_callable(Masked)
-def type_masked(context):
-    def typer(value, valid):
-        if isinstance(value, types.Integer) and isinstance(
-            valid, types.Boolean
-        ):
-            return Masked(value)
-
-    return typer
-
-
-make_attribute_wrapper(MaskedType, "value", "value")
-make_attribute_wrapper(MaskedType, "valid", "valid")
+    return NAType()
 
 
 @register_model(MaskedType)
@@ -153,19 +123,6 @@ def __init__(self, dmm, fe_type):
 
 register_model(NAType)(models.OpaqueModel)
 
-# types.int64 instance, change to typeclass (types.Integer, types.Number, etc)
-"""
-@lower_builtin(Masked, types.int64, types.bool_)
-def impl_masked_constructor(context, builder, sig, args):
-    typ = sig.return_type
-    value, valid = args
-
-    masked = cgutils.create_struct_proxy(typ)(context, builder)
-    masked.value = value
-    masked.valid = valid
-    return masked._getvalue()  # return a pointer to the struct I created
-"""
-
 
 @cuda_decl_registry.register_global(operator.add)
 class MaskedScalarAdd(AbstractTemplate):
@@ -194,7 +151,7 @@ def generic(self, args, kws):
             return nb_signature(
                 MaskedType(return_type),
                 MaskedType(args[0].value_type),
-                numba_na,
+                NAType(),
             )
 
 

From 0953bd17a50a75560f871f7122a40e4c40ee2a1f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 11 May 2021 08:42:58 -0700
Subject: [PATCH 051/132] move alot of code around and refactor, add comments

---
 python/cudf/cudf/core/dataframe.py            |   1 -
 python/cudf/cudf/core/frame.py                |   4 +-
 python/cudf/cudf/core/udf.py                  | 287 ------------------
 python/cudf/cudf/core/udf/__init__.py         |   1 +
 python/cudf/cudf/core/udf/lowering.py         | 114 +++++++
 python/cudf/cudf/core/udf/pipeline.py         |  49 +++
 python/cudf/cudf/core/udf/typing.py           | 186 ++++++++++++
 python/cudf/cudf/tests/test_udf_masked_ops.py |   2 +-
 8 files changed, 352 insertions(+), 292 deletions(-)
 delete mode 100644 python/cudf/cudf/core/udf.py
 create mode 100644 python/cudf/cudf/core/udf/__init__.py
 create mode 100644 python/cudf/cudf/core/udf/lowering.py
 create mode 100644 python/cudf/cudf/core/udf/pipeline.py
 create mode 100644 python/cudf/cudf/core/udf/typing.py

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2907d541403..91a97ee39e2 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -35,7 +35,6 @@
 from cudf.core.index import Index, RangeIndex, as_index
 from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
 from cudf.core.series import Series
-from cudf.core.udf import compile_udf
 from cudf.core.window import Rolling
 from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
 from cudf.utils.docutils import copy_docstring
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index b8151e804e9..fe632af6f27 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1419,9 +1419,7 @@ def _quantiles(
         return result
 
     def _apply(self, func):
-        from cudf.core.udf import compile_udf
-
-        output_dtype, ptx = compile_udf(func, self.dtypes)
+        output_dtype, ptx = cudf.core.udf.pipeline.compile_udf(func, self.dtypes)
 
         output_column = cudf.core.column.column_empty(row_count=len(self), dtype=output_dtype)
         output_mask = cudf.core.column.column_empty(row_count=len(self), dtype='bool')
diff --git a/python/cudf/cudf/core/udf.py b/python/cudf/cudf/core/udf.py
deleted file mode 100644
index 9f2cd0b412c..00000000000
--- a/python/cudf/cudf/core/udf.py
+++ /dev/null
@@ -1,287 +0,0 @@
-import operator
-import numba
-import numpy as np
-from numba import cuda, njit
-from numba.core import cgutils
-from numba.core.extending import (
-    lower_builtin,
-    models,
-    register_model,
-    type_callable,
-    typeof_impl,
-)
-from numba.core.typing import signature as nb_signature
-from inspect import signature as py_signature
-from numba.core.typing.templates import AbstractTemplate
-from numba.cuda.cudadecl import registry as cuda_decl_registry
-from numba.cuda.cudaimpl import registry as cuda_impl_registry
-
-from numba.cuda.cudaimpl import (
-    lower as cuda_lower,
-    registry as cuda_lowering_registry,
-)
-from numba.extending import types
-import inspect
-
-from numba.np import numpy_support
-
-from llvmlite import ir
-from cudf.core.scalar import _NAType
-
-from numba.core.extending import make_attribute_wrapper
-
-class MaskedType(types.Type):
-    def __init__(self, value):
-        # MaskedType in numba shall be parameterized
-        # with a value type
-        super().__init__(name="Masked")
-        self.value_type = value
-
-    def __repr__(self):
-        return f"MaskedType({self.value_type})"
-
-    def __hash__(self):
-        return self.__repr__().__hash__()
-
-    def unify(self, context, other):
-        '''
-        Logic for sorting out what to do when the UDF conditionally
-        returns a `MaskedType`, an `NAType`, or a literal based off 
-        the data at runtime.
-
-        In this framework, every input column is treated as having
-        type `MaskedType`. Operations like `x + y` are understood 
-        as translating to:
-
-        `Masked(value=x, valid=True) + Masked(value=y, valid=True)`
-
-        This means if the user writes a function such as 
-        def f(x, y):
-            return x + y
-            
-        numba sees this function as:
-        f(x: MaskedType, y: MaskedType) -> MaskedType
-        
-        However if the user writes something like:
-        def f(x, y):
-            if x > 5:
-                return 42
-            else:
-                return x + y
-        
-        numba now sees this as
-        f(x: MaskedType, y: MaskedType) -> MaskedType OR literal 
-
-        In general, when numba sees a function that could return
-        more than a single type, it invokes unification to find a
-        common type that can hold all possible results, eg given 
-        a function that could return an int8 or an int16, numba 
-        will resolve the type to int16. However it does not know
-        how to unify MaskedType with primitive types. We need to 
-        actually define the common type between MaskedType and
-        literals, as well as between a Masked and an NA. For full
-        generality, we'd need to unify NA and literal as well. 
-        '''
-        
-        # If we have Masked and NA, the output should be a 
-        # MaskedType with the original type as its value_type
-        if isinstance(other, NAType):
-            return MaskedType(self.value_type)
-
-        # if we have MaskedType and Literal, the output should be
-        # determined from the MaskedType.value_type (which is a 
-        # primitive type) and other
-        unified = context.unify_pairs(self.value_type, other)
-        if unified is None:
-            return None
-
-        return MaskedType(unified)
-
-class NAType(types.Type):
-    def __init__(self):
-        super().__init__(name="NA")
-
-    def unify(self, context, other):
-        '''
-        Masked  <-> NA works from above
-        Literal <-> NA -> Masked
-        '''
-        if isinstance(other, types.abstract.Literal):
-            return MaskedType(other.literal_type)
-
-@typeof_impl.register(_NAType)
-def typeof_na(val, c):
-    return NAType()
-
-
-@register_model(MaskedType)
-class MaskedModel(models.StructModel):
-    def __init__(self, dmm, fe_type):
-        members = [("value", fe_type.value_type), ("valid", types.bool_)]
-        models.StructModel.__init__(self, dmm, fe_type, members)
-
-
-register_model(NAType)(models.OpaqueModel)
-
-
-@cuda_decl_registry.register_global(operator.add)
-class MaskedScalarAdd(AbstractTemplate):
-    # abstracttemplate vs concretetemplate
-    def generic(self, args, kws):
-        if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
-            # In the case of op(Masked, Masked), the return type is a Masked
-            # such that Masked.value
-            return_type = self.context.resolve_function_type(
-                self.key, (args[0].value_type, args[1].value_type), kws
-            ).return_type
-            return nb_signature(
-                MaskedType(return_type),
-                MaskedType(args[0].value_type),
-                MaskedType(args[1].value_type),
-            )
-
-
-@cuda_decl_registry.register_global(operator.add)
-class MaskedScalarAddNull(AbstractTemplate):
-    def generic(self, args, kws):
-        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            # In the case of op(Masked, NA), the result has the same
-            # dtype as the original regardless of what it is
-            return_type = args[0].value_type
-            return nb_signature(
-                MaskedType(return_type),
-                MaskedType(args[0].value_type),
-                NAType(),
-            )
-
-
-@cuda_lower(operator.add, MaskedType, MaskedType)
-def masked_scalar_add_impl(context, builder, sig, args):
-    # get the types from the signature
-
-    masked_type_1, masked_type_2 = sig.args
-    masked_return_type = sig.return_type
-
-    # create LLVM IR structs
-    m1 = cgutils.create_struct_proxy(masked_type_1)(
-        context, builder, value=args[0]
-    )
-    m2 = cgutils.create_struct_proxy(masked_type_2)(
-        context, builder, value=args[1]
-    )
-    result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
-
-    valid = builder.and_(m1.valid, m2.valid)
-    result.valid = valid
-    with builder.if_then(valid):
-        # result.value = numba_op(m1.value, m2.value)
-        result.value = context.compile_internal(
-            builder, 
-            lambda x, y: x + y, 
-            nb_signature(masked_return_type.value_type, masked_type_1.value_type, masked_type_2.value_type), (m1.value, m2.value)
-        )
-
-    return result._getvalue()
-
-
-@cuda_lower(operator.add, MaskedType, NAType)
-def masked_scalar_add_na_impl(context, builder, sig, args):
-    return_type = sig.return_type
-    # use context to get llvm type for a bool
-    result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
-        context, builder
-    )
-    result.valid = context.get_constant(types.boolean, 0)
-    return result._getvalue()
-
-
-@cuda_lowering_registry.lower_constant(NAType)
-def constant_dummy(context, builder, ty, pyval):
-    # This handles None, etc.
-    return context.get_dummy_value()
-
-
-@cuda_decl_registry.register_global(operator.add)
-class MaskedScalarAddConstant(AbstractTemplate):
-    def generic(self, args, kws):
-        if isinstance(args[0], MaskedType) and isinstance(
-            args[1], types.Integer
-        ):
-            # In the case of op(Masked, constant), we resolve the type between
-            # the Masked value_type and the constant's type directly
-            return_type = self.context.resolve_function_type(
-                self.key, (args[0].value_type, args[1]), kws
-            ).return_type
-            return nb_signature(
-                MaskedType(return_type),
-                MaskedType(args[0].value_type),
-                args[1],
-            )
-
-
-@cuda_lower(operator.add, MaskedType, types.Integer)
-def masked_scalar_add_constant_impl(context, builder, sig, input_values):
-    masked_type, const_type = sig.args
-
-    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
-        context, builder, value=input_values[0]
-    )
-    result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
-        context, builder
-    )
-    # to_add_const = context.get_constant(const_type, input_values[1])
-
-    result.valid = context.get_constant(types.boolean, 0)
-    with builder.if_then(indata.valid):
-        result.value = builder.add(indata.value, input_values[1])
-        result.valid = context.get_constant(types.boolean, 1)
-
-    return result._getvalue()
-
-# To handle the unification, we need to support casting from any type to an
-# extension type. The cast implementation takes the value passed in and returns
-# an extension struct wrapping that value.
-@cuda_impl_registry.lower_cast(types.Any, MaskedType)
-def cast_primitive_to_masked(context, builder, fromty, toty, val):
-    casted = context.cast(builder, val, fromty, toty.value_type)
-    ext = cgutils.create_struct_proxy(toty)(context, builder)
-    ext.value = casted
-    return ext._getvalue()
-
-@cuda_impl_registry.lower_cast(NAType, MaskedType)
-def cast_na_to_masked(context, builder, fromty, toty, val):
-    result = cgutils.create_struct_proxy(toty)(context, builder)
-    result.valid = context.get_constant(types.boolean, 0)
-
-    return result._getvalue()
-
-def compile_udf(func, dtypes):
-    n_params = len(py_signature(func).parameters)
-    to_compiler_sig = tuple(
-        MaskedType(arg)
-        for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)
-    )
-    # Get the inlineable PTX function
-    ptx, numba_output_type = cuda.compile_ptx_for_current_device(
-        func, to_compiler_sig, device=True
-    )
-    numpy_output_type = numpy_support.as_dtype(
-        numba_output_type.value_type
-    )
-
-    return numpy_output_type, ptx
-
-
-NA = _NAType()
-
-
-def nulludf(func):
-    def wrapper(*args):
-        from cudf import DataFrame
-
-        to_udf_table = DataFrame(
-            {idx: arg for idx, arg in zip(range(len(args)), args)}
-        )
-        return to_udf_table._apply(func)
-
-    return wrapper
diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py
new file mode 100644
index 00000000000..4608cae3228
--- /dev/null
+++ b/python/cudf/cudf/core/udf/__init__.py
@@ -0,0 +1 @@
+from . import typing, lowering
diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
new file mode 100644
index 00000000000..7bff26993d9
--- /dev/null
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -0,0 +1,114 @@
+from numba.cuda.cudaimpl import (
+    lower as cuda_lower,
+    registry as cuda_lowering_registry,
+)
+from numba.core.typing import signature as nb_signature
+from cudf.core.udf.typing import MaskedType, NAType
+from numba.core import cgutils
+from numba.cuda.cudaimpl import registry as cuda_impl_registry
+import operator
+from numba.extending import types
+
+@cuda_lowering_registry.lower_constant(NAType)
+def constant_dummy(context, builder, ty, pyval):
+    # This handles None, etc.
+    return context.get_dummy_value()
+
+# In the typing phase, we declared that a `MaskedType` can be 
+# added to another `MaskedType` and specified what kind of 
+# `MaskedType` would result. Now we have to actually fill in 
+# the implementation details of how to do that. This is where
+# we can involve both validities in constructing the answer
+@cuda_lower(operator.add, MaskedType, MaskedType)
+def masked_scalar_add_impl(context, builder, sig, args):
+    '''
+    Implement `MaskedType` + `MaskedType`
+    '''
+
+    masked_type_1, masked_type_2 = sig.args # MaskedType(...), MaskedType(...)
+    masked_return_type = sig.return_type # MaskedType(...)
+
+    # Let there be two actual LLVM structs backing the two inputs
+    # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html
+    m1 = cgutils.create_struct_proxy(masked_type_1)(
+        context, builder, value=args[0]
+    )
+    m2 = cgutils.create_struct_proxy(masked_type_2)(
+        context, builder, value=args[1]
+    )
+
+    # we will return an output struct
+    result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
+
+    # compute output validity
+    valid = builder.and_(m1.valid, m2.valid)
+    result.valid = valid
+    with builder.if_then(valid):
+        # Let numba handle generating the extra LLVM needed to perform
+        # operations on mixed types, by compiling the final core op between
+        # the two primitive values as a separate function and calling it
+        result.value = context.compile_internal(
+            builder, 
+            lambda x, y: x + y, 
+            nb_signature(
+                masked_return_type.value_type, 
+                masked_type_1.value_type, 
+                masked_type_2.value_type
+            ),
+            (m1.value, m2.value)
+        )
+    return result._getvalue()
+
+@cuda_lower(operator.add, MaskedType, NAType)
+def masked_scalar_add_na_impl(context, builder, sig, args):
+    '''
+    Implement `MaskedType` + `NAType`
+    The answer to this is known up front so no actual addition
+    needs to take place
+    '''
+
+    return_type = sig.return_type # MaskedType(...)
+    result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
+        context, builder
+    )
+
+    # Invalidate the struct and leave `value` uninitialized
+    result.valid = context.get_constant(types.boolean, 0)
+    return result._getvalue()
+
+@cuda_lower(operator.add, MaskedType, types.Integer)
+def masked_scalar_add_constant_impl(context, builder, sig, input_values):
+    '''
+    Implement `MaskedType` + constant
+    '''
+    masked_type, const_type = sig.args
+
+    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+        context, builder, value=input_values[0]
+    )
+    result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+        context, builder
+    )
+    result.valid = context.get_constant(types.boolean, 0)
+    with builder.if_then(indata.valid):
+        result.value = builder.add(indata.value, input_values[1])
+        result.valid = context.get_constant(types.boolean, 1)
+
+    return result._getvalue()
+
+# To handle the unification, we need to support casting from any type to an
+# extension type. The cast implementation takes the value passed in and returns
+# an extension struct wrapping that value.
+@cuda_impl_registry.lower_cast(types.Any, MaskedType)
+def cast_primitive_to_masked(context, builder, fromty, toty, val):
+    casted = context.cast(builder, val, fromty, toty.value_type)
+    ext = cgutils.create_struct_proxy(toty)(context, builder)
+    ext.value = casted
+    return ext._getvalue()
+
+@cuda_impl_registry.lower_cast(NAType, MaskedType)
+def cast_na_to_masked(context, builder, fromty, toty, val):
+    result = cgutils.create_struct_proxy(toty)(context, builder)
+    result.valid = context.get_constant(types.boolean, 0)
+
+    return result._getvalue()
diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py
new file mode 100644
index 00000000000..f85c1ffdd28
--- /dev/null
+++ b/python/cudf/cudf/core/udf/pipeline.py
@@ -0,0 +1,49 @@
+from inspect import signature as py_signature
+from cudf.core.udf.typing import MaskedType
+from numba.np import numpy_support
+from numba import cuda
+
+def compile_udf(func, dtypes):
+    '''
+    Generate an inlineable PTX function that will be injected into
+    a variadic kernel inside libcudf
+
+    assume all input types are `MaskedType(input_col.dtype)` and then
+    compile the requested PTX function as a function over those types
+    '''
+    to_compiler_sig = tuple(
+        MaskedType(arg)
+        for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)
+    )
+    # Get the inlineable PTX function
+    ptx, numba_output_type = cuda.compile_ptx_for_current_device(
+        func, to_compiler_sig, device=True
+    )
+    numpy_output_type = numpy_support.as_dtype(
+        numba_output_type.value_type
+    )
+
+    return numpy_output_type, ptx
+
+def nulludf(func):
+    '''
+    Mimic pandas API:
+
+    def f(x, y):
+        return x + y
+    df.apply(lambda row: f(row['x'], row['y']))
+
+    in this scheme, `row` is actually the whole dataframe
+    `DataFrame` sends `self` in as `row` and subsequently
+    we end up calling `f` on the resulting columns since
+    the dataframe is dict-like
+    '''
+    def wrapper(*args):
+        from cudf import DataFrame
+        # This probably creates copies but is fine for now
+        to_udf_table = DataFrame(
+            {idx: arg for idx, arg in zip(range(len(args)), args)}
+        )
+        # Frame._apply
+        return to_udf_table._apply(func)
+    return wrapper
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
new file mode 100644
index 00000000000..28aace52da9
--- /dev/null
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -0,0 +1,186 @@
+from numba import types
+from cudf.core.scalar import _NAType
+from numba.core.extending import typeof_impl, register_model, models
+from numba.cuda.cudadecl import registry as cuda_decl_registry
+from numba.core.typing.templates import AbstractTemplate
+from numba.core.typing import signature as nb_signature
+
+import operator
+
+class MaskedType(types.Type):
+    '''
+    A numba type consisting of a value of some primitive type
+    and a validity boolean, over which we can define math ops
+    '''
+    def __init__(self, value):
+        # MaskedType in numba shall be parameterized
+        # with a value type
+        super().__init__(name="Masked")
+        self.value_type = value
+
+    def __repr__(self):
+        return f"MaskedType({self.value_type})"
+
+    def __hash__(self):
+        '''
+        Needed so that numba caches type instances with different
+        `value_type` separately.  
+        '''
+        return self.__repr__().__hash__()
+
+    def unify(self, context, other):
+        '''
+        Logic for sorting out what to do when the UDF conditionally
+        returns a `MaskedType`, an `NAType`, or a literal based off 
+        the data at runtime.
+
+        In this framework, every input column is treated as having
+        type `MaskedType`. Operations like `x + y` are understood 
+        as translating to:
+
+        `Masked(value=x, valid=True) + Masked(value=y, valid=True)`
+
+        This means if the user writes a function such as 
+        def f(x, y):
+            return x + y
+            
+        numba sees this function as:
+        f(x: MaskedType, y: MaskedType) -> MaskedType
+        
+        However if the user writes something like:
+        def f(x, y):
+            if x > 5:
+                return 42
+            else:
+                return x + y
+        
+        numba now sees this as
+        f(x: MaskedType, y: MaskedType) -> MaskedType OR literal 
+        '''
+        
+        # If we have Masked and NA, the output should be a 
+        # MaskedType with the original type as its value_type
+        if isinstance(other, NAType):
+            return MaskedType(self.value_type)
+
+        # if we have MaskedType and Literal, the output should be
+        # determined from the MaskedType.value_type (which is a 
+        # primitive type) and other
+        unified = context.unify_pairs(self.value_type, other)
+        if unified is None:
+            return None
+
+        return MaskedType(unified)
+
+# Tell numba how `MaskedType` is constructed on the backend in terms
+# of primitive things that exist at the LLVM level
+@register_model(MaskedType)
+class MaskedModel(models.StructModel):
+    def __init__(self, dmm, fe_type):
+        # This struct has two members, a value and a validity
+        # let the type of the `value` field be the same as the 
+        # `value_type` and let `valid` be a boolean 
+        members = [("value", fe_type.value_type), ("valid", types.bool_)]
+        models.StructModel.__init__(self, dmm, fe_type, members)
+
+
+class NAType(types.Type):
+    '''
+    A type for handling ops against nulls
+    Exists so we can:
+    1. Teach numba that all occurrences of `cudf.NA` are
+       to be read as instances of this type instead
+    2. Define ops like `if x is cudf.NA` where `x` is of 
+       type `Masked` to mean `if x.valid is False`
+    '''
+    def __init__(self):
+        super().__init__(name="NA")
+
+    def unify(self, context, other):
+        '''
+        Masked  <-> NA works from above
+        Literal <-> NA -> Masked
+        '''
+        if isinstance(other, types.abstract.Literal):
+            return MaskedType(other.literal_type)
+
+
+@typeof_impl.register(_NAType)
+def typeof_na(val, c):
+    '''
+    Tie instances of _NAType (cudf.NA) to our NAType.
+    Effectively make it so numba sees `cudf.NA` as an
+    instance of this NAType -> handle it accordingly.
+    '''
+    return NAType()
+
+register_model(NAType)(models.OpaqueModel)
+
+
+# Ultimately, we want numba to produce PTX code that specifies how to add
+# two singular `Masked` structs together, which is defined as producing a
+# new `Masked` with the right validity and if valid, the correct value. 
+# This happens in two phases:
+#   1. Specify that `Masked` + `Masked` exists and what it should return
+#   2. Implement how to actually do (1) at the LLVM level
+# The following code accomplishes (1) - it is really just a way of specifying
+# that the `+` operation has a CUDA overload that accepts two `Masked` that
+# are parameterized with `value_type` and what flavor of `Masked` to return.
+@cuda_decl_registry.register_global(operator.add)
+class MaskedScalarAdd(AbstractTemplate):
+    def generic(self, args, kws):
+        '''
+        Typing for `Masked` + `Masked`
+        Numba expects a valid numba type to be returned if typing is successful
+        else `None` signifies the error state (this is common across numba)
+        '''
+        if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
+            # In the case of op(Masked, Masked), the return type is a Masked
+            # such that Masked.value is the primitive type that would have 
+            # been resolved if we were just adding the `value_type`s. 
+            return_type = self.context.resolve_function_type(
+                self.key, (args[0].value_type, args[1].value_type), kws
+            ).return_type
+            return nb_signature(
+                MaskedType(return_type),
+                MaskedType(args[0].value_type),
+                MaskedType(args[1].value_type),
+            )
+
+@cuda_decl_registry.register_global(operator.add)
+class MaskedScalarAddNull(AbstractTemplate):
+    def generic(self, args, kws):
+        '''
+        Typing for `Masked` + `NA`
+        Handles situations like `x + cudf.NA`
+        '''
+        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
+            # In the case of op(Masked, NA), the result has the same
+            # dtype as the original regardless of what it is
+            return_type = args[0].value_type
+            return nb_signature(
+                MaskedType(return_type),
+                MaskedType(args[0].value_type),
+                NAType(),
+            )
+
+@cuda_decl_registry.register_global(operator.add)
+class MaskedScalarAddConstant(AbstractTemplate):
+    def generic(self, args, kws):
+        '''
+        Typing for `Masked` + a constant literal
+        handles situations like `x + 1`
+        '''
+        if isinstance(args[0], MaskedType) and isinstance(
+            args[1], types.Integer
+        ):
+            # In the case of op(Masked, constant), we resolve the type between
+            # the Masked value_type and the constant's type directly
+            return_type = self.context.resolve_function_type(
+                self.key, (args[0].value_type, args[1]), kws
+            ).return_type
+            return nb_signature(
+                MaskedType(return_type),
+                MaskedType(args[0].value_type),
+                args[1],
+            )
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 252ddd24088..529f810994f 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -1,5 +1,5 @@
 import cudf
-from cudf.core.udf import nulludf
+from cudf.core.udf.pipeline import nulludf
 from cudf.tests.utils import assert_eq, NUMERIC_TYPES
 import pandas as pd
 import itertools

From 6287404e6cbcc7b2f6a24efd3d6a1c4679372c7e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 12 May 2021 11:56:27 -0700
Subject: [PATCH 052/132] remove erroneous header

---
 cpp/src/transform/transform.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 1d8f76d9e2c..a6eb9407f03 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -16,7 +16,6 @@
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
 #include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>
-#include <jit_preprocessed_files/transform/jit/baked_udf_requirements.cu.jit.hpp>
 
 
 #include <jit/cache.hpp>

From aa38be229e36df95dc547da1ea36e44e5987dc1b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 13 May 2021 06:28:45 -0700
Subject: [PATCH 053/132] typing and lowering for Masked is NA, currently not
 working

---
 python/cudf/cudf/core/udf/lowering.py | 18 ++++++++++++++++++
 python/cudf/cudf/core/udf/typing.py   | 13 +++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 7bff26993d9..35839164812 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -96,6 +96,24 @@ def masked_scalar_add_constant_impl(context, builder, sig, input_values):
 
     return result._getvalue()
 
+@cuda_lower(operator.is_, MaskedType, NAType)
+def masked_scalar_is_null_impl(context, builder, sig, args):
+    '''
+    Implement `MaskedType` is `NAType` (e.g. `x is cudf.NA`)
+    '''
+    masked_type, na = sig.args
+    indata = cgutils.create_struct_proxy(masked_type)(
+        context, builder, value=args[0]
+    )
+    result = context.get_constant(types.boolean, 1)
+
+    with builder.if_else(indata.valid) as (then, otherwise):
+        with then:
+            result = context.get_constant(types.boolean, 0)
+        with otherwise:
+            result = context.get_constant(types.boolean, 1)
+    return result
+
 # To handle the unification, we need to support casting from any type to an
 # extension type. The cast implementation takes the value passed in and returns
 # an extension struct wrapping that value.
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 28aace52da9..0377e370349 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -147,6 +147,19 @@ def generic(self, args, kws):
                 MaskedType(args[1].value_type),
             )
 
+
+@cuda_decl_registry.register_global(operator.is_)
+class MaskedScalarIsNull(AbstractTemplate):
+    '''
+    Typing for `Masked is cudf.NA`
+    '''
+    def generic(self, args, kws):
+        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
+            return nb_signature(
+                types.boolean, 
+                MaskedType(args[0].value_type), 
+                NAType())
+
 @cuda_decl_registry.register_global(operator.add)
 class MaskedScalarAddNull(AbstractTemplate):
     def generic(self, args, kws):

From b5dcd13452be941d9f2ce8cfcd096f8ae5e44b74 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 13 May 2021 06:32:04 -0700
Subject: [PATCH 054/132] remove erroneous logic

---
 python/cudf/cudf/core/udf/lowering.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 35839164812..63746edda31 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -105,7 +105,6 @@ def masked_scalar_is_null_impl(context, builder, sig, args):
     indata = cgutils.create_struct_proxy(masked_type)(
         context, builder, value=args[0]
     )
-    result = context.get_constant(types.boolean, 1)
 
     with builder.if_else(indata.valid) as (then, otherwise):
         with then:

From 4c29c23208e8322f33df19403815ee63fd3d6fae Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 13 May 2021 08:54:32 -0700
Subject: [PATCH 055/132] fix lowering for Masked is NA

---
 python/cudf/cudf/core/udf/lowering.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 63746edda31..a64722151d6 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -8,6 +8,7 @@
 from numba.cuda.cudaimpl import registry as cuda_impl_registry
 import operator
 from numba.extending import types
+from llvmlite import ir
 
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_dummy(context, builder, ty, pyval):
@@ -105,13 +106,14 @@ def masked_scalar_is_null_impl(context, builder, sig, args):
     indata = cgutils.create_struct_proxy(masked_type)(
         context, builder, value=args[0]
     )
-
+    result = cgutils.alloca_once(builder, ir.IntType(1))
     with builder.if_else(indata.valid) as (then, otherwise):
         with then:
-            result = context.get_constant(types.boolean, 0)
+            builder.store(context.get_constant(types.boolean, 0), result)
         with otherwise:
-            result = context.get_constant(types.boolean, 1)
-    return result
+            builder.store(context.get_constant(types.boolean, 1), result)
+
+    return builder.load(result)
 
 # To handle the unification, we need to support casting from any type to an
 # extension type. The cast implementation takes the value passed in and returns
@@ -121,6 +123,7 @@ def cast_primitive_to_masked(context, builder, fromty, toty, val):
     casted = context.cast(builder, val, fromty, toty.value_type)
     ext = cgutils.create_struct_proxy(toty)(context, builder)
     ext.value = casted
+    ext.valid = context.get_constant(types.boolean, 1)
     return ext._getvalue()
 
 @cuda_impl_registry.lower_cast(NAType, MaskedType)

From 837f2ef3a60c5c4ed55a5abe859ae5c4bc01c0be Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 13 May 2021 12:29:00 -0700
Subject: [PATCH 056/132] roughly fix test_apply_NA_conditional, which was
 passing by coincidence

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 529f810994f..d36d2c1616f 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -109,11 +109,14 @@ def func_gdf(x, y):
         'b':[4,5,None, None]
     })
 
-    pdf = gdf.to_pandas()
+    pdf = gdf.to_pandas(nullable=True)
 
     expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
     obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
 
+    # using a UDF on a nullable dtype in pandas casts to object
+    expect = expect.astype(pd.Int64Dtype())
+    obtain = obtain.to_pandas(nullable=True)
     assert_eq(expect, obtain)
 
 

From 2d7104d1ed1e5fd47c3acec293c66ff577805907 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 14 May 2021 16:05:56 -0500
Subject: [PATCH 057/132] support and test all arithmetic operators

---
 python/cudf/cudf/core/udf/lowering.py         | 105 ++++++++++++------
 python/cudf/cudf/core/udf/typing.py           |  18 ++-
 python/cudf/cudf/tests/test_udf_masked_ops.py |  48 ++++++++
 3 files changed, 135 insertions(+), 36 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index a64722151d6..5021a32d138 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -10,6 +10,18 @@
 from numba.extending import types
 from llvmlite import ir
 
+import operator
+arith_ops = [
+    operator.add,
+    operator.sub,
+    operator.mul,
+    operator.truediv,
+    operator.floordiv,
+    operator.mod,
+    operator.pow
+
+]
+
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_dummy(context, builder, ty, pyval):
     # This handles None, etc.
@@ -20,45 +32,70 @@ def constant_dummy(context, builder, ty, pyval):
 # `MaskedType` would result. Now we have to actually fill in 
 # the implementation details of how to do that. This is where
 # we can involve both validities in constructing the answer
-@cuda_lower(operator.add, MaskedType, MaskedType)
-def masked_scalar_add_impl(context, builder, sig, args):
+
+def make_arithmetic_op(op):
     '''
-    Implement `MaskedType` + `MaskedType`
+    Make closures that implement arithmetic operations. See
+    register_arithmetic_op for details.
     '''
+    def masked_scalar_op_impl(context, builder, sig, args):
+        '''
+        Implement `MaskedType` + `MaskedType`
+        '''
+
+        masked_type_1, masked_type_2 = sig.args # MaskedType(...), MaskedType(...)
+        masked_return_type = sig.return_type # MaskedType(...)
+
+        # Let there be two actual LLVM structs backing the two inputs
+        # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html
+        m1 = cgutils.create_struct_proxy(masked_type_1)(
+            context, builder, value=args[0]
+        )
+        m2 = cgutils.create_struct_proxy(masked_type_2)(
+            context, builder, value=args[1]
+        )
 
-    masked_type_1, masked_type_2 = sig.args # MaskedType(...), MaskedType(...)
-    masked_return_type = sig.return_type # MaskedType(...)
+        # we will return an output struct
+        result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
+
+        # compute output validity
+        valid = builder.and_(m1.valid, m2.valid)
+        result.valid = valid
+        with builder.if_then(valid):
+            # Let numba handle generating the extra LLVM needed to perform
+            # operations on mixed types, by compiling the final core op between
+            # the two primitive values as a separate function and calling it
+            result.value = context.compile_internal(
+                builder, 
+                lambda x, y: op(x, y), 
+                nb_signature(
+                    masked_return_type.value_type, 
+                    masked_type_1.value_type, 
+                    masked_type_2.value_type
+                ),
+                (m1.value, m2.value)
+            )
+        return result._getvalue()
+    return masked_scalar_op_impl
+
+def register_arithmetic_op(op):
+    '''
+    Register a lowering implementation for the 
+    arithmetic op `op`.
 
-    # Let there be two actual LLVM structs backing the two inputs
-    # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html
-    m1 = cgutils.create_struct_proxy(masked_type_1)(
-        context, builder, value=args[0]
-    )
-    m2 = cgutils.create_struct_proxy(masked_type_2)(
-        context, builder, value=args[1]
-    )
+    Because the lowering implementations compile the final
+    op separately using a lambda and compile_internal, `op`
+    needs to be tied to each lowering implementation using
+    a closure. 
 
-    # we will return an output struct
-    result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
-
-    # compute output validity
-    valid = builder.and_(m1.valid, m2.valid)
-    result.valid = valid
-    with builder.if_then(valid):
-        # Let numba handle generating the extra LLVM needed to perform
-        # operations on mixed types, by compiling the final core op between
-        # the two primitive values as a separate function and calling it
-        result.value = context.compile_internal(
-            builder, 
-            lambda x, y: x + y, 
-            nb_signature(
-                masked_return_type.value_type, 
-                masked_type_1.value_type, 
-                masked_type_2.value_type
-            ),
-            (m1.value, m2.value)
-        )
-    return result._getvalue()
+    This function makes and lowers a closure for one op.
+
+    '''
+    to_lower_op = make_arithmetic_op(op)
+    cuda_lower(op, MaskedType, MaskedType)(to_lower_op)
+
+for op in arith_ops:
+    register_arithmetic_op(op)
 
 @cuda_lower(operator.add, MaskedType, NAType)
 def masked_scalar_add_na_impl(context, builder, sig, args):
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 0377e370349..4ac145c511b 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -7,6 +7,18 @@
 
 import operator
 
+arith_ops = [
+    operator.add,
+    operator.sub,
+    operator.mul,
+    operator.truediv,
+    operator.floordiv,
+    operator.mod,
+    operator.pow
+
+]
+
+
 class MaskedType(types.Type):
     '''
     A numba type consiting of a value of some primitive type 
@@ -126,8 +138,7 @@ def typeof_na(val, c):
 # The following code accomplishes (1) - it is really just a way of specifying
 # that the `+` operation has a CUDA overload that accepts two `Masked` that
 # are parameterized with `value_type` and what flavor of `Masked` to return.
-@cuda_decl_registry.register_global(operator.add)
-class MaskedScalarAdd(AbstractTemplate):
+class MaskedScalarArithOp(AbstractTemplate):
     def generic(self, args, kws):
         '''
         Typing for `Masked` + `Masked`
@@ -147,6 +158,9 @@ def generic(self, args, kws):
                 MaskedType(args[1].value_type),
             )
 
+for op in arith_ops:
+    # Every op shares the same typing class
+    cuda_decl_registry.register_global(op)(MaskedScalarArithOp)
 
 @cuda_decl_registry.register_global(operator.is_)
 class MaskedScalarIsNull(AbstractTemplate):
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 529f810994f..e8c395194e2 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -4,6 +4,54 @@
 import pandas as pd
 import itertools
 import pytest
+import operator
+
+arith_ops = [
+    operator.add,
+    operator.sub,
+    operator.mul,
+    operator.truediv,
+    operator.floordiv,
+    operator.mod,
+    operator.pow
+
+]
+
+def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
+    gdf = data
+    pdf = data.to_pandas(nullable=True)
+
+    expect = pdf.apply(
+        lambda row: func_pdf(
+            *[row[i] for i in data.columns]
+        ),
+        axis=1
+    )
+    obtain = gdf.apply(
+        lambda row: func_gdf(
+            *[row[i] for i in data.columns]
+            ),
+            axis=1
+    )
+    assert_eq(expect, obtain, **kwargs)
+
+@pytest.mark.parametrize('op', arith_ops)
+def test_arith_masked_vs_masked(op):
+    # This test should test all the typing
+    # and lowering for arithmetic ops between
+    # two columns 
+    def func_pdf(x, y):
+        return op(x, y)
+
+    @nulludf
+    def func_gdf(x, y):
+        return op(x, y)
+
+    gdf = cudf.DataFrame({
+        'a':[1,None,3, None],
+        'b':[4,5,None, None]
+    })
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 def test_apply_basic():
     def func_pdf(x, y):

From b63b435d5a291b8ca0048797deec2ba61b95df04 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 16 May 2021 10:14:57 -0500
Subject: [PATCH 058/132] typing, lowering, tests for masked+constant

---
 python/cudf/cudf/core/udf/lowering.py         | 63 ++++++++++++-------
 python/cudf/cudf/core/udf/typing.py           | 40 ++++++------
 python/cudf/cudf/tests/test_udf_masked_ops.py | 24 ++++---
 3 files changed, 73 insertions(+), 54 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 5021a32d138..10631b4d84c 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -94,11 +94,8 @@ def register_arithmetic_op(op):
     to_lower_op = make_arithmetic_op(op)
     cuda_lower(op, MaskedType, MaskedType)(to_lower_op)
 
-for op in arith_ops:
-    register_arithmetic_op(op)
-
 @cuda_lower(operator.add, MaskedType, NAType)
-def masked_scalar_add_na_impl(context, builder, sig, args):
+def masked_scalar_null_op_impl(context, builder, sig, args):
     '''
     Implement `MaskedType` + `NAType`
     The answer to this is known up front so no actual addition
@@ -114,25 +111,49 @@ def masked_scalar_add_na_impl(context, builder, sig, args):
     result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
 
-@cuda_lower(operator.add, MaskedType, types.Integer)
-def masked_scalar_add_constant_impl(context, builder, sig, input_values):
-    '''
-    Implement `MaskedType` + constant
-    '''
-    masked_type, const_type = sig.args
+def make_const_op(op):
+    def masked_scalar_const_op_impl(context, builder, sig, input_values):
+        '''
+        Implement `MaskedType` + constant
+        '''
+        masked_type, const_type = sig.args
 
-    indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
-        context, builder, value=input_values[0]
-    )
-    result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
-        context, builder
-    )
-    result.valid = context.get_constant(types.boolean, 0)
-    with builder.if_then(indata.valid):
-        result.value = builder.add(indata.value, input_values[1])
-        result.valid = context.get_constant(types.boolean, 1)
+        indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+            context, builder, value=input_values[0]
+        )
+        result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+            context, builder
+        )
+        result.valid = context.get_constant(types.boolean, 0)
+        with builder.if_then(indata.valid):
+
+            result.value = context.compile_internal(
+                builder, 
+                lambda x, y: op(x, y), 
+                nb_signature(
+                    masked_type.value_type, 
+                    masked_type.value_type, 
+                    const_type
+                ),
+                (indata.value, input_values[1])
+            )
+            result.valid = context.get_constant(types.boolean, 1)
+
+        return result._getvalue()
+    return masked_scalar_const_op_impl
+
+def register_const_op(op):
+    to_lower_op = make_const_op(op)
+    cuda_lower(op, MaskedType, types.Number)(to_lower_op)
+
+
+# register all lowering at init
+for op in arith_ops:
+    register_arithmetic_op(op)
+    register_const_op(op)
+    # null op impl can be shared between all ops
+    cuda_lower(op, MaskedType, NAType)(masked_scalar_null_op_impl)
 
-    return result._getvalue()
 
 @cuda_lower(operator.is_, MaskedType, NAType)
 def masked_scalar_is_null_impl(context, builder, sig, args):
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 4ac145c511b..2a32e8f2f3b 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -158,24 +158,7 @@ def generic(self, args, kws):
                 MaskedType(args[1].value_type),
             )
 
-for op in arith_ops:
-    # Every op shares the same typing class
-    cuda_decl_registry.register_global(op)(MaskedScalarArithOp)
-
-@cuda_decl_registry.register_global(operator.is_)
-class MaskedScalarIsNull(AbstractTemplate):
-    '''
-    Typing for `Masked is cudf.NA`
-    '''
-    def generic(self, args, kws):
-        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            return nb_signature(
-                types.boolean, 
-                MaskedType(args[0].value_type), 
-                NAType())
-
-@cuda_decl_registry.register_global(operator.add)
-class MaskedScalarAddNull(AbstractTemplate):
+class MaskedScalarNullOp(AbstractTemplate):
     def generic(self, args, kws):
         '''
         Typing for `Masked` + `NA`
@@ -191,8 +174,7 @@ def generic(self, args, kws):
                 NAType(),
             )
 
-@cuda_decl_registry.register_global(operator.add)
-class MaskedScalarAddConstant(AbstractTemplate):
+class MaskedScalarConstOp(AbstractTemplate):
     def generic(self, args, kws):
         '''
         Typing for `Masked` + a constant literal
@@ -211,3 +193,21 @@ def generic(self, args, kws):
                 MaskedType(args[0].value_type),
                 args[1],
             )
+
+@cuda_decl_registry.register_global(operator.is_)
+class MaskedScalarIsNull(AbstractTemplate):
+    '''
+    Typing for `Masked is cudf.NA`
+    '''
+    def generic(self, args, kws):
+        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
+            return nb_signature(
+                types.boolean, 
+                MaskedType(args[0].value_type), 
+                NAType())
+
+for op in arith_ops:
+    # Every op shares the same typing class
+    cuda_decl_registry.register_global(op)(MaskedScalarArithOp)
+    cuda_decl_registry.register_global(op)(MaskedScalarNullOp)
+    cuda_decl_registry.register_global(op)(MaskedScalarConstOp)
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index e8c395194e2..23bd8ae4a9c 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -53,26 +53,24 @@ def func_gdf(x, y):
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
-def test_apply_basic():
-    def func_pdf(x, y):
-        return x + y
-
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('constant', [1])
+def test_arith_masked_vs_constant(op, constant):
+    def func_pdf(x):
+        return op(x, constant)
+    
     @nulludf
-    def func_gdf(x, y):
-        return x + y
-
+    def func_gdf(x):
+        return op(x, constant)
 
+    # Just a single column -> result will be all NA
     gdf = cudf.DataFrame({
-        'a':[1,2,3],
-        'b':[4,5,6]
+        'data': [1,2,3]
     })
 
-    pdf = gdf.to_pandas()
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
 
-    assert_eq(expect, obtain)
 
 def test_apply_null():
     def func_pdf(x, y):

From a3e1444dc0766a3bb33d1d3a7d2615667aa067ce Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 16 May 2021 10:52:12 -0500
Subject: [PATCH 059/132] try and type mixed return value, and fail to do so

---
 python/cudf/cudf/core/udf/typing.py           | 2 +-
 python/cudf/cudf/tests/test_udf_masked_ops.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 2a32e8f2f3b..5b3895e55a6 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -181,7 +181,7 @@ def generic(self, args, kws):
         handles situations like `x + 1`
         '''
         if isinstance(args[0], MaskedType) and isinstance(
-            args[1], types.Integer
+            args[1], types.Number
         ):
             # In the case of op(Masked, constant), we resolve the type between
             # the Masked value_type and the constant's type directly
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 23bd8ae4a9c..c051b854ae8 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -54,7 +54,7 @@ def func_gdf(x, y):
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 @pytest.mark.parametrize('op', arith_ops)
-@pytest.mark.parametrize('constant', [1])
+@pytest.mark.parametrize('constant', [1, 1.5])
 def test_arith_masked_vs_constant(op, constant):
     def func_pdf(x):
         return op(x, constant)

From ca79d722ea7732e113ce5c7d527302def2b4e0d5 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Sun, 16 May 2021 10:57:31 -0500
Subject: [PATCH 060/132] continued adding/refactoring of tests

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 77 +++----------------
 1 file changed, 10 insertions(+), 67 deletions(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index c051b854ae8..b47af7ffc6e 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -70,72 +70,21 @@ def func_gdf(x):
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
-
-
-def test_apply_null():
-    def func_pdf(x, y):
-        return x + y
-
-    @nulludf
-    def func_gdf(x, y):
-        return x + y
-
-
-    gdf = cudf.DataFrame({
-        'a':[1,None,3, None],
-        'b':[4,5,None, None]
-    })
-
-    pdf = gdf.to_pandas()
-
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
-    assert_eq(expect, obtain)
-
-
-def test_apply_add_null():
-    def func_pdf(x, y):
-        return x + y + pd.NA
-
-    @nulludf
-    def func_gdf(x, y):
-        return x + y + cudf.NA
-
-
-    gdf = cudf.DataFrame({
-        'a':[1,None,3, None],
-        'b':[4,5,None, None]
-    })
-
-    pdf = gdf.to_pandas()
-
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
-    # TODO: dtype mismatch here
-    assert_eq(expect, obtain, check_dtype=False)
-
-
-def test_apply_add_constant():
-    def func_pdf(x, y):
-        return x + y + 1
+@pytest.mark.parametrize('op', arith_ops)
+def test_arith_masked_vs_null(op):
+    def func_pdf(x):
+        return op(x, pd.NA)
 
     @nulludf
-    def func_gdf(x, y):
-        return x + y + 1
-
+    def func_gdf(x):
+        return op(x, cudf.NA)
 
     gdf = cudf.DataFrame({
-        'a':[1,None,3, None],
-        'b':[4,5,None, None]
+        'data': [1, None, 3]
     })
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
-    pdf = gdf.to_pandas()
-
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
-    assert_eq(expect, obtain)
-
-def test_apply_NA_conditional():
+def test_masked_is_null_conditional():
     def func_pdf(x, y):
         if x is pd.NA:
             return y
@@ -154,13 +103,7 @@ def func_gdf(x, y):
         'a':[1,None,3, None],
         'b':[4,5,None, None]
     })
-
-    pdf = gdf.to_pandas()
-
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
-
-    assert_eq(expect, obtain)
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
 @pytest.mark.parametrize('dtype_a', list(NUMERIC_TYPES))

From e1defcbd3f0f50f04ed07f461583c97ce5ddb24a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 18 May 2021 07:55:14 -0700
Subject: [PATCH 061/132] fix ops between masked and const of different dtype

---
 python/cudf/cudf/core/udf/lowering.py         | 6 +++---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 10631b4d84c..3e4b12b8c5c 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -117,11 +117,11 @@ def masked_scalar_const_op_impl(context, builder, sig, input_values):
         Implement `MaskedType` + constant
         '''
         masked_type, const_type = sig.args
-
+        return_type = sig.return_type
         indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
             context, builder, value=input_values[0]
         )
-        result = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+        result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
             context, builder
         )
         result.valid = context.get_constant(types.boolean, 0)
@@ -131,7 +131,7 @@ def masked_scalar_const_op_impl(context, builder, sig, input_values):
                 builder, 
                 lambda x, y: op(x, y), 
                 nb_signature(
-                    masked_type.value_type, 
+                    return_type.value_type, 
                     masked_type.value_type, 
                     const_type
                 ),
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index b47af7ffc6e..b55c779d29f 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -65,7 +65,7 @@ def func_gdf(x):
 
     # Just a single column -> result will be all NA
     gdf = cudf.DataFrame({
-        'data': [1,2,3]
+        'data': [1,2,None]
     })
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)

From 33d3dcb474fcccd8e754451c5c0d1c8216efff0b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 18 May 2021 08:59:45 -0700
Subject: [PATCH 062/132] update tests

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 34 +++++--------------
 1 file changed, 8 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index b55c779d29f..fc2eb3c8f29 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -123,36 +123,23 @@ def func_gdf(x, y):
     gdf['a'] = gdf['a'].astype(dtype_a)
     gdf['b'] = gdf['b'].astype(dtype_b)
 
-    pdf = gdf.to_pandas()
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
 
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
 
-    # currently, cases where one side is float32 fail, pandas doing some
-    # weird casting here and getting float64 always
-    assert_eq(expect, obtain)
-
-
-def test_apply_return_literal():
-    # 1. Casting rule literal -> Masked
-    #  -> a) make it so numba knows that we can even promote literals to Masked ()
-    #  -> b) implement custom lowering to specify how this actually happens (python only)
-
-
-    # 2. Custom unfication code
-
-
-    # numba/core/type
+@pytest.mark.parametrize('val', [
+    5, 5.5
+])
+def test_apply_return_literal(val):
     def func_pdf(x, y):
         if x is pd.NA:
-            return 5
+            return val
         else:
             return x + y
 
     @nulludf
     def func_gdf(x, y):
         if x is cudf.NA:
-            return 5 # Masked(5, True)
+            return val # Masked(5, True)
         else:
             return x + y
 
@@ -162,9 +149,4 @@ def func_gdf(x, y):
         'b':[4,5,None, None]
     })
 
-    pdf = gdf.to_pandas()
-
-    expect = pdf.apply(lambda row: func_pdf(row['a'], row['b']), axis=1)
-    obtain = gdf.apply(lambda row: func_gdf(row['a'], row['b']), axis=1)
-
-    assert_eq(expect, obtain)
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 

From 5769ded6eba0101cf4ad5b67177442efb1bd2b73 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 18 May 2021 09:37:22 -0700
Subject: [PATCH 063/132] add test for returning NA

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index fc2eb3c8f29..30ad61ff742 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -150,3 +150,21 @@ def func_gdf(x, y):
     })
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
+
+def test_apply_return_null():
+    def func_pdf(x):
+        if x is pd.NA:
+            return pd.NA
+        else:
+            return x 
+
+    @nulludf
+    def func_gdf(x):
+        if x is cudf.NA:
+            return cudf.NA
+        else:
+            return x
+
+    gdf = cudf.DataFrame({'a': [1, None, 3]})
+    breakpoint()
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 

From 1579b399689e3ae65f3c3ba85bbf768d7237ed28 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 18 May 2021 12:10:03 -0700
Subject: [PATCH 064/132] add masked v masked comparison ops

---
 python/cudf/cudf/core/udf/lowering.py         | 26 +++++++++++++-
 python/cudf/cudf/core/udf/typing.py           | 26 +++++++++++++-
 python/cudf/cudf/tests/test_udf_masked_ops.py | 34 ++++++++++++++++++-
 3 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 3e4b12b8c5c..8c53065cd71 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -22,6 +22,15 @@
 
 ]
 
+comparison_ops = [
+    operator.eq, 
+    operator.ne,
+    operator.lt,
+    operator.le,
+    operator.gt,
+    operator.ge
+]
+
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_dummy(context, builder, ty, pyval):
     # This handles None, etc.
@@ -78,6 +87,7 @@ def masked_scalar_op_impl(context, builder, sig, args):
         return result._getvalue()
     return masked_scalar_op_impl
 
+
 def register_arithmetic_op(op):
     '''
     Register a lowering implementation for the 
@@ -148,7 +158,7 @@ def register_const_op(op):
 
 
 # register all lowering at init
-for op in arith_ops:
+for op in arith_ops + comparison_ops:
     register_arithmetic_op(op)
     register_const_op(op)
     # null op impl can be shared between all ops
@@ -173,6 +183,20 @@ def masked_scalar_is_null_impl(context, builder, sig, args):
 
     return builder.load(result)
 
+@cuda_lower(operator.truth, MaskedType)
+def masked_scalar_truth_impl(context, builder, sig, args):
+    indata = cgutils.create_struct_proxy(MaskedType(types.boolean))(
+        context, builder, value=args[0]
+    )
+    return indata.value
+
+@cuda_lower(bool, MaskedType)
+def masked_scalar_truth_impl(context, builder, sig, args):
+    indata = cgutils.create_struct_proxy(MaskedType(types.boolean))(
+        context, builder, value=args[0]
+    )
+    return indata.value
+
 # To handle the unification, we need to support casting from any type to an
 # extension type. The cast implementation takes the value passed in and returns
 # an extension struct wrapping that value.
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 5b3895e55a6..2fa09fb23a7 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -19,6 +19,15 @@
 ]
 
 
+comparison_ops = [
+    operator.eq, 
+    operator.ne,
+    operator.lt,
+    operator.le,
+    operator.gt,
+    operator.ge
+]
+
 class MaskedType(types.Type):
     '''
     A numba type consiting of a value of some primitive type 
@@ -206,7 +215,22 @@ def generic(self, args, kws):
                 MaskedType(args[0].value_type), 
                 NAType())
 
-for op in arith_ops:
+@cuda_decl_registry.register_global(operator.truth)
+class MaskedScalarTruth(AbstractTemplate):
+    '''
+    Typing for `if Masked`
+    Used for `if x > y`
+    The truthiness of a MaskedType shall be the truthiness
+    of the `value` stored therein
+    '''
+    def generic(self, args, kws):
+        if isinstance(args[0], MaskedType):
+            return nb_signature(
+                types.boolean,
+                MaskedType(types.boolean)
+            )
+
+for op in arith_ops + comparison_ops:
     # Every op shares the same typing class
     cuda_decl_registry.register_global(op)(MaskedScalarArithOp)
     cuda_decl_registry.register_global(op)(MaskedScalarNullOp)
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 30ad61ff742..d81d3855dd0 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -17,6 +17,15 @@
 
 ]
 
+comparison_ops = [
+    operator.eq, 
+    operator.ne,
+    operator.lt,
+    operator.le,
+    operator.gt,
+    operator.ge
+]
+
 def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
     gdf = data
     pdf = data.to_pandas(nullable=True)
@@ -53,6 +62,27 @@ def func_gdf(x, y):
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+@pytest.mark.parametrize('op', comparison_ops)
+def test_compare_masked_vs_masked(op):
+    # this test should test all the 
+    # typing and lowering for comparisons 
+    # between columns
+
+    def func_pdf(x, y):
+        return op(x, y)
+
+    @nulludf
+    def func_gdf(x, y):
+        return op(x, y)
+
+    # we should get:
+    # [?, ?, <NA>, <NA>, <NA>]
+    gdf = cudf.DataFrame({
+        'a': [1, 0, None, 1, None],
+        'b': [0, 1, 0, None, None]
+    })
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
 @pytest.mark.parametrize('op', arith_ops)
 @pytest.mark.parametrize('constant', [1, 1.5])
 def test_arith_masked_vs_constant(op, constant):
@@ -152,6 +182,9 @@ def func_gdf(x, y):
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
 
 def test_apply_return_null():
+    '''
+    Tests casting / unification of Masked and NA
+    '''
     def func_pdf(x):
         if x is pd.NA:
             return pd.NA
@@ -166,5 +199,4 @@ def func_gdf(x):
             return x
 
     gdf = cudf.DataFrame({'a': [1, None, 3]})
-    breakpoint()
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 

From df28144019737d6a0034acdc867d5ceebe409498 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 18 May 2021 12:37:25 -0700
Subject: [PATCH 065/132] add tests for comparing masked to constant

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index d81d3855dd0..cd20da693e4 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -100,6 +100,27 @@ def func_gdf(x):
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+@pytest.mark.parametrize('op', comparison_ops)
+@pytest.mark.parametrize('constant', [1, 1.5])
+def test_compare_masked_vs_constant(op, constant):
+    '''
+    technically the exact same test as above
+    '''
+    def func_pdf(x):
+        return op(x, constant)
+    
+    @nulludf
+    def func_gdf(x):
+        return op(x, constant)
+
+    # Just a single column -> result will be all NA
+    gdf = cudf.DataFrame({
+        'data': [1,2,None]
+    })
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
+
+
 @pytest.mark.parametrize('op', arith_ops)
 def test_arith_masked_vs_null(op):
     def func_pdf(x):

From 8bab8909313019c1d3d2283612b9266ae164da12 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 12:37:21 -0700
Subject: [PATCH 066/132] NA <-> Unmasked unification

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index cd20da693e4..81665c08d9c 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -221,3 +221,20 @@ def func_gdf(x):
 
     gdf = cudf.DataFrame({'a': [1, None, 3]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
+
+def test_apply_return_either_null_or_literal():
+    def func_pdf(x):
+        if x > 5:
+            return 2
+        else:
+            return pd.NA
+
+    @nulludf
+    def func_gdf(x):
+        if x > 5:
+            return 2
+        else:
+            return cudf.NA
+
+    gdf = cudf.DataFrame({'a': [1, 3, 6]})
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 

From 6e7ac8dcaf7c30793f449b8374f07846bd6e7fab Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 12:39:01 -0700
Subject: [PATCH 067/132] partially address reviews

---
 python/cudf/cudf/core/udf/lowering.py |  2 +-
 python/cudf/cudf/core/udf/typing.py   | 23 +++++++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 8c53065cd71..443a98c6fec 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -32,7 +32,7 @@
 ]
 
 @cuda_lowering_registry.lower_constant(NAType)
-def constant_dummy(context, builder, ty, pyval):
+def constant_na(context, builder, ty, pyval):
     # This handles None, etc.
     return context.get_dummy_value()
 
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 2fa09fb23a7..15f304339dc 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -76,17 +76,18 @@ def f(x, y):
                 return x + y
         
         numba now sees this as
-        f(x: MaskedType, y: MaskedType) -> MaskedType OR literal 
+        f(x: MaskedType(dtype_1), y: MaskedType(dtype_2))
+          -> MaskedType(dtype_unified) 
         '''
         
         # If we have Masked and NA, the output should be a 
         # MaskedType with the original type as its value_type
         if isinstance(other, NAType):
-            return MaskedType(self.value_type)
+            return self
 
-        # if we have MaskedType and Literal, the output should be
-        # determined from the MaskedType.value_type (which is a 
-        # primitive type) and other
+        # if we have MaskedType and something that results in a
+        # scalar, unify between the MaskedType's value_type
+        # and that other thing
         unified = context.unify_pairs(self.value_type, other)
         if unified is None:
             return None
@@ -122,9 +123,15 @@ def unify(self, context, other):
         Masked  <-> NA works from above
         Literal <-> NA -> Masked
         '''
-        if isinstance(other, types.abstract.Literal):
-            return MaskedType(other.literal_type)
-
+        breakpoint()
+        if isinstance(other, MaskedType):
+            # bounce to MaskedType.unify
+            return None
+        elif isinstance(other, NAType):
+            # unify {NA, NA} -> NA
+            return self
+        else:
+            return MaskedType(other)
 
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):

From d58234ed5b1d0d9ae0dd07013978709a3bd18533 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 12:48:06 -0700
Subject: [PATCH 068/132] just use args directly in typing signatures

---
 python/cudf/cudf/core/udf/typing.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 15f304339dc..fbbb104239b 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -123,7 +123,6 @@ def unify(self, context, other):
         Masked  <-> NA works from above
         Literal <-> NA -> Masked
         '''
-        breakpoint()
         if isinstance(other, MaskedType):
             # bounce to MaskedType.unify
             return None
@@ -170,8 +169,8 @@ def generic(self, args, kws):
             ).return_type
             return nb_signature(
                 MaskedType(return_type),
-                MaskedType(args[0].value_type),
-                MaskedType(args[1].value_type),
+                args[0],
+                args[1],
             )
 
 class MaskedScalarNullOp(AbstractTemplate):
@@ -183,10 +182,9 @@ def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
             # In the case of op(Masked, NA), the result has the same
             # dtype as the original regardless of what it is
-            return_type = args[0].value_type
             return nb_signature(
-                MaskedType(return_type),
-                MaskedType(args[0].value_type),
+                args[0],
+                args[0],
                 NAType(),
             )
 
@@ -206,7 +204,7 @@ def generic(self, args, kws):
             ).return_type
             return nb_signature(
                 MaskedType(return_type),
-                MaskedType(args[0].value_type),
+                args[0],
                 args[1],
             )
 
@@ -219,7 +217,7 @@ def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
             return nb_signature(
                 types.boolean, 
-                MaskedType(args[0].value_type), 
+                args[0], 
                 NAType())
 
 @cuda_decl_registry.register_global(operator.truth)

From 739f6fc8f52eed543c75e8a2346a1b71d364cdac Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 13:33:40 -0700
Subject: [PATCH 069/132] add reflected ops vs NA

---
 python/cudf/cudf/core/udf/lowering.py | 3 +--
 python/cudf/cudf/core/udf/typing.py   | 6 ++++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 443a98c6fec..0e5cdf831d9 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -104,7 +104,6 @@ def register_arithmetic_op(op):
     to_lower_op = make_arithmetic_op(op)
     cuda_lower(op, MaskedType, MaskedType)(to_lower_op)
 
-@cuda_lower(operator.add, MaskedType, NAType)
 def masked_scalar_null_op_impl(context, builder, sig, args):
     '''
     Implement `MaskedType` + `NAType`
@@ -163,7 +162,7 @@ def register_const_op(op):
     register_const_op(op)
     # null op impl can be shared between all ops
     cuda_lower(op, MaskedType, NAType)(masked_scalar_null_op_impl)
-
+    cuda_lower(op, NAType, MaskedType)(masked_scalar_null_op_impl)
 
 @cuda_lower(operator.is_, MaskedType, NAType)
 def masked_scalar_is_null_impl(context, builder, sig, args):
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index fbbb104239b..ea5c90f8c62 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -187,6 +187,12 @@ def generic(self, args, kws):
                 args[0],
                 NAType(),
             )
+        elif isinstance(args[0], NAType) and isinstance(args[1], MaskedType):
+            return nb_signature(
+                args[1],
+                NAType(),
+                args[1]
+            )
 
 class MaskedScalarConstOp(AbstractTemplate):
     def generic(self, args, kws):

From 14e3ab8547ee7ab79ff7c99cc72376cbcfdbad8f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 13:46:17 -0700
Subject: [PATCH 070/132] add tests for reflected masked/na ops

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 81665c08d9c..700c57cdc6c 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -135,6 +135,21 @@ def func_gdf(x):
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+@pytest.mark.parametrize('op', arith_ops)
+def test_arith_masked_vs_null_reflected(op):
+    def func_pdf(x):
+        return op(pd.NA, x)
+
+    @nulludf
+    def func_gdf(x):
+        return op(cudf.NA, x)
+
+    gdf = cudf.DataFrame({
+        'data': [1, None, 3]
+    })
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
+
 def test_masked_is_null_conditional():
     def func_pdf(x, y):
         if x is pd.NA:

From 417c1303adeab9f3ee573e6ac53ffbf0c72170ad Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 14:09:25 -0700
Subject: [PATCH 071/132] typing for const + masked, lowering can wait for now

---
 python/cudf/cudf/core/udf/typing.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index ea5c90f8c62..357139ff4aa 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -213,7 +213,19 @@ def generic(self, args, kws):
                 args[0],
                 args[1],
             )
-
+        elif isinstance(args[0], types.Number) and isinstance(
+            args[1], MaskedType
+        ):
+            breakpoint()
+            return_type = self.context.resolve_function_type(
+                self.key, (args[1].value_type, args[0]), kws
+            ).return_type
+            return nb_signature(
+                MaskedType(return_type),
+                args[0],
+                args[1],
+            )
+            
 @cuda_decl_registry.register_global(operator.is_)
 class MaskedScalarIsNull(AbstractTemplate):
     '''

From 195e9b8a85f8bcd75802a8693bc53f7ac33b4212 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 19 May 2021 14:19:58 -0700
Subject: [PATCH 072/132] add Graham's fix for Masked + const of a different
 type

---
 python/cudf/cudf/core/udf/lowering.py | 10 ++++++++++
 python/cudf/cudf/core/udf/typing.py   |  6 ++++++
 2 files changed, 16 insertions(+)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 0e5cdf831d9..6753351b2d1 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -213,3 +213,13 @@ def cast_na_to_masked(context, builder, fromty, toty, val):
     result.valid = context.get_constant(types.boolean, 0)
 
     return result._getvalue()
+
+@cuda_impl_registry.lower_cast(MaskedType, MaskedType)
+def cast_masked_to_masked(context, builder, fromty, toty, val):
+    operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
+    casted = context.cast(builder, operand.value, fromty.value_type,
+                          toty.value_type)
+    ext = cgutils.create_struct_proxy(toty)(context, builder)
+    ext.value = casted
+    ext.valid = operand.valid
+    return ext._getvalue()
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 357139ff4aa..872afc35cde 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -93,6 +93,12 @@ def f(x, y):
             return None
 
         return MaskedType(unified)
+    
+    def __eq__(self, other):
+        if not isinstance(other, MaskedType):
+            return False
+
+        return self.value_type == other.value_type
 
 # Tell numba how `MaskedType` is constructed on the backend in terms
 # of primitive things that exist at the LLVM level

From 32f54d4a3c1dd0061d0e03baa892dedd4025e155 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 28 May 2021 07:41:33 -0700
Subject: [PATCH 073/132] refactor a little c++

---
 cpp/src/transform/transform.cpp | 35 ++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index a6eb9407f03..d730ebf5488 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -66,26 +66,37 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
-void generalized_operation(table_view data_view,
-                           std::string const& binary_udf, 
-                           data_type output_type, 
-                           column_view const& outcol_view,
-                           column_view const& outmsk_view,
-                           rmm::mr::device_memory_resource* mr)
+std::vector<std::string> make_template_types(column_view outcol_view, table_view data_view)
 {
+  int entries_per_col = 3; // type, mask, offset
+  std::string mask_type = "uint32_t*";
+  std::string offset_type = "int64_t";
 
   std::vector<std::string> template_types(
-    // A ptr, mask ptr, and offset for each column
-    // plus one for the type of the output column
-    (data_view.num_columns() * 3) + 1
+    // output type comes first and is one extra
+    (data_view.num_columns() * entries_per_col) + 1
   );
   template_types[0] = cudf::jit::get_type_name(outcol_view.type());
+
   for (int i = 0; i < data_view.num_columns(); i++) {
-    int offset = (i * 3) + 1;
+    int offset = (i * entries_per_col) + 1;
     template_types[offset] = cudf::jit::get_type_name(data_view.column(i).type()) + "*";
-    template_types[offset + 1] = "uint32_t*"; 
-    template_types[offset + 2] = "int64_t";
+    template_types[offset + 1] = mask_type;
+    template_types[offset + 2] = offset_type;
   }
+  return template_types;
+}
+
+
+void generalized_operation(table_view data_view,
+                           std::string const& binary_udf, 
+                           data_type output_type, 
+                           column_view const& outcol_view,
+                           column_view const& outmsk_view,
+                           rmm::mr::device_memory_resource* mr)
+{
+
+  std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
 
   std::string generic_kernel_name = 
   jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")

From 37a925747db8a74e85d677926114b5e9bf8b518b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 28 May 2021 09:17:29 -0700
Subject: [PATCH 074/132] minor docstring updates

---
 python/cudf/cudf/core/dataframe.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index ee28640dc49..3b487f96a57 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4712,8 +4712,33 @@ def query(self, expr, local_dict=None):
             return self._apply_boolean_mask(boolmask)
 
     def apply(self, func, axis=1):
+        """
+        Apply a function along an axis of the DataFrame.
+
+        Designed to mimic `pandas.DataFrame.apply`. Applies a user
+        defined function row wise over a dataframe, with true null
+        handling. Works with UDFs using `core.udf.pipeline.nulludf`
+        and returns a single series. Uses numba to jit compile the
+        function to PTX via LLVM. 
+
+        Parameters
+        ----------
+        func : function
+            Function to apply to each row.
+
+        axis : {0 or 'index', 1 or 'columns'}, default 1
+            Axis along which the function is applied:
+            * 0 or 'index': apply function to each column.
+              Note: axis=0 is not yet supported.
+            * 1 or 'columns': apply function to each row.
+
+        """
+        if axis != 1:
+            raise ValueError(
+                "DataFrame.apply currently only supports row wise ops"
+            )
+
         return func(self)
-        #return super()._apply(func)
 
 
     @applyutils.doc_apply()

From 22d610f17226c4544e3297c3146c9abd80f2d2a9 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Mon, 24 May 2021 11:26:41 +0100
Subject: [PATCH 075/132] Add compilation tests for masked extensions + fix

This fixes the lowering of <numeric value> + Masked. All tests in
test_extension_compilation are currently passing.
---
 python/cudf/cudf/core/udf/lowering.py         |  29 +++--
 python/cudf/cudf/core/udf/typing.py           |   1 -
 .../cudf/tests/test_extension_compilation.py  | 117 ++++++++++++++++++
 3 files changed, 137 insertions(+), 10 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_extension_compilation.py

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 6753351b2d1..50476ebf795 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -51,7 +51,6 @@ def masked_scalar_op_impl(context, builder, sig, args):
         '''
         Implement `MaskedType` + `MaskedType`
         '''
-
         masked_type_1, masked_type_2 = sig.args # MaskedType(...), MaskedType(...)
         masked_return_type = sig.return_type # MaskedType(...)
 
@@ -120,15 +119,23 @@ def masked_scalar_null_op_impl(context, builder, sig, args):
     result.valid = context.get_constant(types.boolean, 0)
     return result._getvalue()
 
+
 def make_const_op(op):
     def masked_scalar_const_op_impl(context, builder, sig, input_values):
         '''
         Implement `MaskedType` + constant
         '''
-        masked_type, const_type = sig.args
+        # Which way round are the argument types?
+        if isinstance(sig.args[0], MaskedType):
+            masked_type, const_type = sig.args
+            masked_value, numeric_value = input_values
+        else:
+            const_type, masked_type = sig.args
+            numeric_value, masked_value = input_values
+
         return_type = sig.return_type
         indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
-            context, builder, value=input_values[0]
+            context, builder, value=masked_value
         )
         result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
             context, builder
@@ -137,23 +144,25 @@ def masked_scalar_const_op_impl(context, builder, sig, input_values):
         with builder.if_then(indata.valid):
 
             result.value = context.compile_internal(
-                builder, 
-                lambda x, y: op(x, y), 
+                builder,
+                lambda x, y: op(x, y),
                 nb_signature(
-                    return_type.value_type, 
-                    masked_type.value_type, 
+                    return_type.value_type,
+                    masked_type.value_type,
                     const_type
                 ),
-                (indata.value, input_values[1])
+                (indata.value, numeric_value)
             )
             result.valid = context.get_constant(types.boolean, 1)
 
         return result._getvalue()
     return masked_scalar_const_op_impl
 
+
 def register_const_op(op):
     to_lower_op = make_const_op(op)
     cuda_lower(op, MaskedType, types.Number)(to_lower_op)
+    cuda_lower(op, types.Number, MaskedType)(to_lower_op)
 
 
 # register all lowering at init
@@ -190,7 +199,7 @@ def masked_scalar_truth_impl(context, builder, sig, args):
     return indata.value
 
 @cuda_lower(bool, MaskedType)
-def masked_scalar_truth_impl(context, builder, sig, args):
+def masked_scalar_bool_impl(context, builder, sig, args):
     indata = cgutils.create_struct_proxy(MaskedType(types.boolean))(
         context, builder, value=args[0]
     )
@@ -207,6 +216,7 @@ def cast_primitive_to_masked(context, builder, fromty, toty, val):
     ext.valid = context.get_constant(types.boolean, 1)
     return ext._getvalue()
 
+
 @cuda_impl_registry.lower_cast(NAType, MaskedType)
 def cast_na_to_masked(context, builder, fromty, toty, val):
     result = cgutils.create_struct_proxy(toty)(context, builder)
@@ -214,6 +224,7 @@ def cast_na_to_masked(context, builder, fromty, toty, val):
 
     return result._getvalue()
 
+
 @cuda_impl_registry.lower_cast(MaskedType, MaskedType)
 def cast_masked_to_masked(context, builder, fromty, toty, val):
     operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 872afc35cde..8f967bb4cd1 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -222,7 +222,6 @@ def generic(self, args, kws):
         elif isinstance(args[0], types.Number) and isinstance(
             args[1], MaskedType
         ):
-            breakpoint()
             return_type = self.context.resolve_function_type(
                 self.key, (args[1].value_type, args[0]), kws
             ).return_type
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
new file mode 100644
index 00000000000..10b60c5ca81
--- /dev/null
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -0,0 +1,117 @@
+import operator
+import pytest
+
+from numba import types
+from numba.cuda import compile_ptx
+
+from cudf import NA
+from cudf.core.udf.typing import MaskedType
+
+arith_ops = (
+    operator.add,
+    operator.sub,
+    operator.mul,
+    operator.truediv,
+    operator.floordiv,
+    operator.mod,
+    operator.pow,
+)
+
+number_types = (
+    types.float32,
+    types.float64,
+    types.int8,
+    types.int16,
+    types.int32,
+    types.int64,
+    types.uint8,
+    types.uint16,
+    types.uint32,
+    types.uint64,
+)
+
+QUICK = False
+
+if QUICK:
+    arith_ops = (operator.add, operator.truediv, operator.pow)
+    number_types = (types.int32, types.float32)
+
+
+number_ids = tuple(str(t) for t in number_types)
+
+
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize('constant', [1, 1.5])
+def test_compile_arith_masked_vs_constant(op, ty, constant):
+
+    def func(x):
+        return op(x, constant)
+
+    cc = (7, 5)
+    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
+
+    assert isinstance(resty, MaskedType)
+
+    # Check that the masked typing matches that of the unmasked typing
+    um_ptx, um_resty = compile_ptx(func, (ty,), cc=cc, device=True)
+    assert resty.value_type == um_resty
+
+
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize('constant', [1, 1.5])
+def test_compile_arith_constant_vs_masked(op, ty, constant):
+
+    def func(x):
+        return op(constant, x)
+
+    cc = (7, 5)
+    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
+
+    assert isinstance(resty, MaskedType)
+
+
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+def test_compile_arith_masked_vs_na(op, ty):
+
+    def func(x):
+        return op(x, NA)
+
+    cc = (7, 5)
+    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
+
+    assert isinstance(resty, MaskedType)
+
+
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+def test_compile_arith_na_vs_masked(op, ty):
+    # Reflected case: NA is the left operand, the masked value the right.
+    def func(x):
+        return op(NA, x)
+
+    cc = (7, 5)
+    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
+
+
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('ty1', number_types, ids=number_ids)
+@pytest.mark.parametrize('ty2', number_types, ids=number_ids)
+@pytest.mark.parametrize('masked', ((False, True), (True, False),
+                                    (True, True)),
+                         ids=('um', 'mu', 'mm'))
+def test_compile_arith_masked_ops(op, ty1, ty2, masked):
+
+    def func(x, y):
+        return op(x, y)
+
+    cc = (7, 5)
+
+    if masked[0]:
+        ty1 = MaskedType(ty1)
+    if masked[1]:
+        ty2 = MaskedType(ty2)
+
+    ptx, resty = compile_ptx(func, (ty1, ty2), cc=cc, device=True)

From 8a1b053b809504af113cb273cd31a77cc44dfea2 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Mon, 24 May 2021 11:51:53 +0100
Subject: [PATCH 076/132] Fix flake8 in masked ops code

---
 python/cudf/cudf/core/udf/lowering.py         | 45 +++++++++-----
 python/cudf/cudf/core/udf/pipeline.py         |  3 +-
 python/cudf/cudf/core/udf/typing.py           | 55 +++++++++-------
 python/cudf/cudf/tests/test_udf_masked_ops.py | 62 ++++++++++---------
 4 files changed, 95 insertions(+), 70 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 50476ebf795..fcb865be5dd 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -10,7 +10,6 @@
 from numba.extending import types
 from llvmlite import ir
 
-import operator
 arith_ops = [
     operator.add,
     operator.sub,
@@ -23,7 +22,7 @@
 ]
 
 comparison_ops = [
-    operator.eq, 
+    operator.eq,
     operator.ne,
     operator.lt,
     operator.le,
@@ -31,14 +30,16 @@
     operator.ge
 ]
 
+
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_na(context, builder, ty, pyval):
     # This handles None, etc.
     return context.get_dummy_value()
 
-# In the typing phase, we declared that a `MaskedType` can be 
-# added to another `MaskedType` and specified what kind of 
-# `MaskedType` would result. Now we have to actually fill in 
+
+# In the typing phase, we declared that a `MaskedType` can be
+# added to another `MaskedType` and specified what kind of a
+# `MaskedType` would result. Now we have to actually fill in
 # the implementation details of how to do that. This is where
 # we can involve both validities in constructing the answer
 
@@ -51,8 +52,10 @@ def masked_scalar_op_impl(context, builder, sig, args):
         '''
         Implement `MaskedType` + `MaskedType`
         '''
-        masked_type_1, masked_type_2 = sig.args # MaskedType(...), MaskedType(...)
-        masked_return_type = sig.return_type # MaskedType(...)
+        # MaskedType(...), MaskedType(...)
+        masked_type_1, masked_type_2 = sig.args
+        # MaskedType(...)
+        masked_return_type = sig.return_type
 
         # Let there be two actual LLVM structs backing the two inputs
         # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html
@@ -64,7 +67,8 @@ def masked_scalar_op_impl(context, builder, sig, args):
         )
 
         # we will return an output struct
-        result = cgutils.create_struct_proxy(masked_return_type)(context, builder)
+        result = cgutils.create_struct_proxy(masked_return_type)(context,
+                                                                 builder)
 
         # compute output validity
         valid = builder.and_(m1.valid, m2.valid)
@@ -74,11 +78,11 @@ def masked_scalar_op_impl(context, builder, sig, args):
             # operations on mixed types, by compiling the final core op between
             # the two primitive values as a separate function and calling it
             result.value = context.compile_internal(
-                builder, 
-                lambda x, y: op(x, y), 
+                builder,
+                lambda x, y: op(x, y),
                 nb_signature(
-                    masked_return_type.value_type, 
-                    masked_type_1.value_type, 
+                    masked_return_type.value_type,
+                    masked_type_1.value_type,
                     masked_type_2.value_type
                 ),
                 (m1.value, m2.value)
@@ -89,13 +93,13 @@ def masked_scalar_op_impl(context, builder, sig, args):
 
 def register_arithmetic_op(op):
     '''
-    Register a lowering implementation for the 
+    Register a lowering implementation for the
     arithmetic op `op`.
 
     Because the lowering implementations compile the final
     op separately using a lambda and compile_internal, `op`
     needs to be tied to each lowering implementation using
-    a closure. 
+    a closure.
 
     This function makes and lowers a closure for one op.
 
@@ -103,6 +107,7 @@ def register_arithmetic_op(op):
     to_lower_op = make_arithmetic_op(op)
     cuda_lower(op, MaskedType, MaskedType)(to_lower_op)
 
+
 def masked_scalar_null_op_impl(context, builder, sig, args):
     '''
     Implement `MaskedType` + `NAType`
@@ -110,7 +115,7 @@ def masked_scalar_null_op_impl(context, builder, sig, args):
     needs to take place
     '''
 
-    return_type = sig.return_type # MaskedType(...)
+    return_type = sig.return_type  # MaskedType(...)
     result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
         context, builder
     )
@@ -134,10 +139,12 @@ def masked_scalar_const_op_impl(context, builder, sig, input_values):
             numeric_value, masked_value = input_values
 
         return_type = sig.return_type
-        indata = cgutils.create_struct_proxy(MaskedType(masked_type.value_type))(
+        masked_input_type = MaskedType(masked_type.value_type)
+        indata = cgutils.create_struct_proxy(masked_input_type)(
             context, builder, value=masked_value
         )
-        result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
+        masked_return_type = MaskedType(return_type.value_type)
+        result = cgutils.create_struct_proxy(masked_return_type)(
             context, builder
         )
         result.valid = context.get_constant(types.boolean, 0)
@@ -173,6 +180,7 @@ def register_const_op(op):
     cuda_lower(op, MaskedType, NAType)(masked_scalar_null_op_impl)
     cuda_lower(op, NAType, MaskedType)(masked_scalar_null_op_impl)
 
+
 @cuda_lower(operator.is_, MaskedType, NAType)
 def masked_scalar_is_null_impl(context, builder, sig, args):
     '''
@@ -191,6 +199,7 @@ def masked_scalar_is_null_impl(context, builder, sig, args):
 
     return builder.load(result)
 
+
 @cuda_lower(operator.truth, MaskedType)
 def masked_scalar_truth_impl(context, builder, sig, args):
     indata = cgutils.create_struct_proxy(MaskedType(types.boolean))(
@@ -198,6 +207,7 @@ def masked_scalar_truth_impl(context, builder, sig, args):
     )
     return indata.value
 
+
 @cuda_lower(bool, MaskedType)
 def masked_scalar_bool_impl(context, builder, sig, args):
     indata = cgutils.create_struct_proxy(MaskedType(types.boolean))(
@@ -205,6 +215,7 @@ def masked_scalar_bool_impl(context, builder, sig, args):
     )
     return indata.value
 
+
 # To handle the unification, we need to support casting from any type to an
 # extension type. The cast implementation takes the value passed in and returns
 # an extension struct wrapping that value.
diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py
index f85c1ffdd28..ab80400eb29 100644
--- a/python/cudf/cudf/core/udf/pipeline.py
+++ b/python/cudf/cudf/core/udf/pipeline.py
@@ -1,8 +1,8 @@
-from inspect import signature as py_signature
 from cudf.core.udf.typing import MaskedType
 from numba.np import numpy_support
 from numba import cuda
 
+
 def compile_udf(func, dtypes):
     '''
     Generate an inlineable PTX function that will be injected into
@@ -25,6 +25,7 @@ def compile_udf(func, dtypes):
 
     return numpy_output_type, ptx
 
+
 def nulludf(func):
     '''
     Mimic pandas API:
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 8f967bb4cd1..713ccb26486 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -20,7 +20,7 @@
 
 
 comparison_ops = [
-    operator.eq, 
+    operator.eq,
     operator.ne,
     operator.lt,
     operator.le,
@@ -28,9 +28,10 @@
     operator.ge
 ]
 
+
 class MaskedType(types.Type):
     '''
-    A numba type consiting of a value of some primitive type 
+    A numba type consiting of a value of some primitive type
     and a validity boolean, over which we can define math ops
     '''
     def __init__(self, value):
@@ -45,42 +46,42 @@ def __repr__(self):
     def __hash__(self):
         '''
         Needed so that numba caches type instances with different
-        `value_type` separately.  
+        `value_type` separately.
         '''
         return self.__repr__().__hash__()
 
     def unify(self, context, other):
         '''
         Logic for sorting out what to do when the UDF conditionally
-        returns a `MaskedType`, an `NAType`, or a literal based off 
+        returns a `MaskedType`, an `NAType`, or a literal based off
         the data at runtime.
 
         In this framework, every input column is treated as having
-        type `MaskedType`. Operations like `x + y` are understood 
+        type `MaskedType`. Operations like `x + y` are understood
         as translating to:
 
         `Masked(value=x, valid=True) + Masked(value=y, valid=True)`
 
-        This means if the user writes a function such as 
+        This means if the user writes a function such as
         def f(x, y):
             return x + y
-            
+
         numba sees this function as:
         f(x: MaskedType, y: MaskedType) -> MaskedType
-        
+
         However if the user writes something like:
         def f(x, y):
             if x > 5:
                 return 42
             else:
                 return x + y
-        
+
         numba now sees this as
         f(x: MaskedType(dtype_1), y: MaskedType(dtype_2))
-          -> MaskedType(dtype_unified) 
+          -> MaskedType(dtype_unified)
         '''
-        
-        # If we have Masked and NA, the output should be a 
+
+        # If we have Masked and NA, the output should be a
         # MaskedType with the original type as its value_type
         if isinstance(other, NAType):
             return self
@@ -93,21 +94,22 @@ def f(x, y):
             return None
 
         return MaskedType(unified)
-    
+
     def __eq__(self, other):
         if not isinstance(other, MaskedType):
             return False
 
         return self.value_type == other.value_type
 
+
 # Tell numba how `MaskedType` is constructed on the backend in terms
 # of primitive things that exist at the LLVM level
 @register_model(MaskedType)
 class MaskedModel(models.StructModel):
     def __init__(self, dmm, fe_type):
         # This struct has two members, a value and a validity
-        # let the type of the `value` field be the same as the 
-        # `value_type` and let `valid` be a boolean 
+        # let the type of the `value` field be the same as the
+        # `value_type` and let `valid` be a boolean
         members = [("value", fe_type.value_type), ("valid", types.bool_)]
         models.StructModel.__init__(self, dmm, fe_type, members)
 
@@ -116,9 +118,9 @@ class NAType(types.Type):
     '''
     A type for handling ops against nulls
     Exists so we can:
-    1. Teach numba that all occurances of `cudf.NA` are 
+    1. Teach numba that all occurances of `cudf.NA` are
        to be read as instances of this type instead
-    2. Define ops like `if x is cudf.NA` where `x` is of 
+    2. Define ops like `if x is cudf.NA` where `x` is of
        type `Masked` to mean `if x.valid is False`
     '''
     def __init__(self):
@@ -138,6 +140,7 @@ def unify(self, context, other):
         else:
             return MaskedType(other)
 
+
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):
     '''
@@ -147,12 +150,13 @@ def typeof_na(val, c):
     '''
     return NAType()
 
+
 register_model(NAType)(models.OpaqueModel)
 
 
 # Ultimately, we want numba to produce PTX code that specifies how to add
 # two singular `Masked` structs together, which is defined as producing a
-# new `Masked` with the right validity and if valid, the correct value. 
+# new `Masked` with the right validity and if valid, the correct value.
 # This happens in two phases:
 #   1. Specify that `Masked` + `Masked` exists and what it should return
 #   2. Implement how to actually do (1) at the LLVM level
@@ -168,8 +172,8 @@ def generic(self, args, kws):
         '''
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
             # In the case of op(Masked, Masked), the return type is a Masked
-            # such that Masked.value is the primitive type that would have 
-            # been resolved if we were just adding the `value_type`s. 
+            # such that Masked.value is the primitive type that would have
+            # been resolved if we were just adding the `value_type`s.
             return_type = self.context.resolve_function_type(
                 self.key, (args[0].value_type, args[1].value_type), kws
             ).return_type
@@ -179,6 +183,7 @@ def generic(self, args, kws):
                 args[1],
             )
 
+
 class MaskedScalarNullOp(AbstractTemplate):
     def generic(self, args, kws):
         '''
@@ -200,6 +205,7 @@ def generic(self, args, kws):
                 args[1]
             )
 
+
 class MaskedScalarConstOp(AbstractTemplate):
     def generic(self, args, kws):
         '''
@@ -230,7 +236,8 @@ def generic(self, args, kws):
                 args[0],
                 args[1],
             )
-            
+
+
 @cuda_decl_registry.register_global(operator.is_)
 class MaskedScalarIsNull(AbstractTemplate):
     '''
@@ -239,10 +246,11 @@ class MaskedScalarIsNull(AbstractTemplate):
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
             return nb_signature(
-                types.boolean, 
-                args[0], 
+                types.boolean,
+                args[0],
                 NAType())
 
+
 @cuda_decl_registry.register_global(operator.truth)
 class MaskedScalarTruth(AbstractTemplate):
     '''
@@ -258,6 +266,7 @@ def generic(self, args, kws):
                 MaskedType(types.boolean)
             )
 
+
 for op in arith_ops + comparison_ops:
     # Every op shares the same typing class
     cuda_decl_registry.register_global(op)(MaskedScalarArithOp)
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 700c57cdc6c..90c93532cb0 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -2,7 +2,6 @@
 from cudf.core.udf.pipeline import nulludf
 from cudf.tests.utils import assert_eq, NUMERIC_TYPES
 import pandas as pd
-import itertools
 import pytest
 import operator
 
@@ -18,7 +17,7 @@
 ]
 
 comparison_ops = [
-    operator.eq, 
+    operator.eq,
     operator.ne,
     operator.lt,
     operator.le,
@@ -26,6 +25,7 @@
     operator.ge
 ]
 
+
 def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
     gdf = data
     pdf = data.to_pandas(nullable=True)
@@ -39,16 +39,17 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
     obtain = gdf.apply(
         lambda row: func_gdf(
             *[row[i] for i in data.columns]
-            ),
-            axis=1
+        ),
+        axis=1
     )
     assert_eq(expect, obtain, **kwargs)
 
+
 @pytest.mark.parametrize('op', arith_ops)
 def test_arith_masked_vs_masked(op):
     # This test should test all the typing
     # and lowering for arithmetic ops between
-    # two columns 
+    # two columns
     def func_pdf(x, y):
         return op(x, y)
 
@@ -57,15 +58,16 @@ def func_gdf(x, y):
         return op(x, y)
 
     gdf = cudf.DataFrame({
-        'a':[1,None,3, None],
-        'b':[4,5,None, None]
+        'a': [1, None, 3, None],
+        'b': [4, 5, None, None]
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+
 @pytest.mark.parametrize('op', comparison_ops)
 def test_compare_masked_vs_masked(op):
-    # this test should test all the 
-    # typing and lowering for comparisons 
+    # this test should test all the
+    # typing and lowering for comparisons
     # between columns
 
     def func_pdf(x, y):
@@ -83,23 +85,25 @@ def func_gdf(x, y):
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+
 @pytest.mark.parametrize('op', arith_ops)
 @pytest.mark.parametrize('constant', [1, 1.5])
 def test_arith_masked_vs_constant(op, constant):
     def func_pdf(x):
         return op(x, constant)
-    
+
     @nulludf
     def func_gdf(x):
         return op(x, constant)
 
     # Just a single column -> result will be all NA
     gdf = cudf.DataFrame({
-        'data': [1,2,None]
+        'data': [1, 2, None]
     })
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+
 @pytest.mark.parametrize('op', comparison_ops)
 @pytest.mark.parametrize('constant', [1, 1.5])
 def test_compare_masked_vs_constant(op, constant):
@@ -108,19 +112,18 @@ def test_compare_masked_vs_constant(op, constant):
     '''
     def func_pdf(x):
         return op(x, constant)
-    
+
     @nulludf
     def func_gdf(x):
         return op(x, constant)
 
     # Just a single column -> result will be all NA
     gdf = cudf.DataFrame({
-        'data': [1,2,None]
+        'data': [1, 2, None]
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-
 @pytest.mark.parametrize('op', arith_ops)
 def test_arith_masked_vs_null(op):
     def func_pdf(x):
@@ -135,6 +138,7 @@ def func_gdf(x):
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+
 @pytest.mark.parametrize('op', arith_ops)
 def test_arith_masked_vs_null_reflected(op):
     def func_pdf(x):
@@ -164,10 +168,9 @@ def func_gdf(x, y):
         else:
             return x + y
 
-
     gdf = cudf.DataFrame({
-        'a':[1,None,3, None],
-        'b':[4,5,None, None]
+        'a': [1, None, 3, None],
+        'b': [4, 5, None, None]
     })
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
@@ -177,19 +180,19 @@ def func_gdf(x, y):
 def test_apply_mixed_dtypes(dtype_a, dtype_b):
     def func_pdf(x, y):
         return x + y
-    
+
     @nulludf
     def func_gdf(x, y):
         return x + y
 
     gdf = cudf.DataFrame({
-        'a':[1.5,None,3, None],
-        'b':[4,5,None, None]
+        'a': [1.5, None, 3, None],
+        'b': [4, 5, None, None]
     })
     gdf['a'] = gdf['a'].astype(dtype_a)
     gdf['b'] = gdf['b'].astype(dtype_b)
 
-    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
 @pytest.mark.parametrize('val', [
@@ -205,17 +208,17 @@ def func_pdf(x, y):
     @nulludf
     def func_gdf(x, y):
         if x is cudf.NA:
-            return val # Masked(5, True)
+            return val  # Masked(5, True)
         else:
             return x + y
 
-
     gdf = cudf.DataFrame({
-        'a':[1,None,3, None],
-        'b':[4,5,None, None]
+        'a': [1, None, 3, None],
+        'b': [4, 5, None, None]
     })
 
-    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
 
 def test_apply_return_null():
     '''
@@ -225,7 +228,7 @@ def func_pdf(x):
         if x is pd.NA:
             return pd.NA
         else:
-            return x 
+            return x
 
     @nulludf
     def func_gdf(x):
@@ -235,7 +238,8 @@ def func_gdf(x):
             return x
 
     gdf = cudf.DataFrame({'a': [1, None, 3]})
-    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
 
 def test_apply_return_either_null_or_literal():
     def func_pdf(x):
@@ -252,4 +256,4 @@ def func_gdf(x):
             return cudf.NA
 
     gdf = cudf.DataFrame({'a': [1, 3, 6]})
-    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) 
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)

From 58dab996fc42f3a82a3b4228565114b49314c187 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Mon, 24 May 2021 12:14:20 +0100
Subject: [PATCH 077/132] Add tests of comparisons, start testing unary ops

---
 .../cudf/tests/test_extension_compilation.py  | 37 ++++++++++++++++---
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 10b60c5ca81..978315fe7b8 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -17,6 +17,22 @@
     operator.pow,
 )
 
+comparison_ops = (
+    operator.lt,
+    operator.le,
+    operator.eq,
+    operator.ne,
+    operator.ge,
+    operator.gt
+)
+
+unary_ops = (
+    operator.not_,
+    operator.truth
+)
+
+ops = arith_ops + comparison_ops
+
 number_types = (
     types.float32,
     types.float64,
@@ -40,7 +56,18 @@
 number_ids = tuple(str(t) for t in number_types)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('op', unary_ops)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+def test_compile_masked_unary(op, ty):
+
+    def func(x):
+        return op(x)
+
+    cc = (7, 5)
+    ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
+
+
+@pytest.mark.parametrize('op', ops)
 @pytest.mark.parametrize('ty', number_types, ids=number_ids)
 @pytest.mark.parametrize('constant', [1, 1.5])
 def test_compile_arith_masked_vs_constant(op, ty, constant):
@@ -58,7 +85,7 @@ def func(x):
     assert resty.value_type == um_resty
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('op', ops)
 @pytest.mark.parametrize('ty', number_types, ids=number_ids)
 @pytest.mark.parametrize('constant', [1, 1.5])
 def test_compile_arith_constant_vs_masked(op, ty, constant):
@@ -72,7 +99,7 @@ def func(x):
     assert isinstance(resty, MaskedType)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('op', ops)
 @pytest.mark.parametrize('ty', number_types, ids=number_ids)
 def test_compile_arith_masked_vs_na(op, ty):
 
@@ -85,7 +112,7 @@ def func(x):
     assert isinstance(resty, MaskedType)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('op', ops)
 @pytest.mark.parametrize('ty', number_types, ids=number_ids)
 def test_compile_arith_na_vs_masked(op, ty):
 
@@ -96,7 +123,7 @@ def func(x):
     ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('op', ops)
 @pytest.mark.parametrize('ty1', number_types, ids=number_ids)
 @pytest.mark.parametrize('ty2', number_types, ids=number_ids)
 @pytest.mark.parametrize('masked', ((False, True), (True, False),

From 4f064978826b2c6592c0700fa794320852e2d54a Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Mon, 24 May 2021 13:41:24 +0100
Subject: [PATCH 078/132] Don't test as-yet unimplemented not

---
 python/cudf/cudf/tests/test_extension_compilation.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 978315fe7b8..1a8367e5532 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -27,8 +27,7 @@
 )
 
 unary_ops = (
-    operator.not_,
-    operator.truth
+    operator.truth,
 )
 
 ops = arith_ops + comparison_ops
@@ -117,7 +116,7 @@ def func(x):
 def test_compile_arith_na_vs_masked(op, ty):
 
     def func(x):
-        return op(x, NA)
+        return op(NA, x)
 
     cc = (7, 5)
     ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)

From a59b2409ef3551c9e278d15afcddeee0e559f57d Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Mon, 24 May 2021 17:03:03 +0100
Subject: [PATCH 079/132] Add execution test for masked ops

This test executes an operation with and without masking and compares
the masked result to the unmasked result.

Some additional typing and lowering is added to support the construction
of test data in a kernel, via the `Masked` constructor.
---
 python/cudf/cudf/core/udf/classes.py          |  4 ++
 python/cudf/cudf/core/udf/lowering.py         | 14 +++++-
 python/cudf/cudf/core/udf/typing.py           | 43 ++++++++++++++++++-
 .../cudf/tests/test_extension_compilation.py  | 38 +++++++++++++++-
 4 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 python/cudf/cudf/core/udf/classes.py

diff --git a/python/cudf/cudf/core/udf/classes.py b/python/cudf/cudf/core/udf/classes.py
new file mode 100644
index 00000000000..1e979253fba
--- /dev/null
+++ b/python/cudf/cudf/core/udf/classes.py
@@ -0,0 +1,4 @@
+class Masked:
+    def __init__(self, value, valid):
+        self.value = value
+        self.valid = valid
diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index fcb865be5dd..3ebec1258dd 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -1,3 +1,4 @@
+from . import classes
 from numba.cuda.cudaimpl import (
     lower as cuda_lower,
     registry as cuda_lowering_registry,
@@ -7,7 +8,7 @@
 from numba.core import cgutils
 from numba.cuda.cudaimpl import registry as cuda_impl_registry
 import operator
-from numba.extending import types
+from numba.extending import lower_builtin, types
 from llvmlite import ir
 
 arith_ops = [
@@ -245,3 +246,14 @@ def cast_masked_to_masked(context, builder, fromty, toty, val):
     ext.value = casted
     ext.valid = operand.valid
     return ext._getvalue()
+
+
+# Masked constructor for use in a kernel for testing
+@lower_builtin(classes.Masked, types.Number, types.boolean)
+def masked_constructor(context, builder, sig, args):
+    ty = sig.return_type
+    value, valid = args
+    masked = cgutils.create_struct_proxy(ty)(context, builder)
+    masked.value = value
+    masked.valid = valid
+    return masked._getvalue()
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 713ccb26486..54733a07aae 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -1,9 +1,14 @@
+from . import classes
 from numba import types
 from cudf.core.scalar import _NAType
-from numba.core.extending import typeof_impl, register_model, models
+from numba.core.extending import (typeof_impl, register_model, models,
+                                  make_attribute_wrapper)
 from numba.cuda.cudadecl import registry as cuda_decl_registry
-from numba.core.typing.templates import AbstractTemplate
+from numba.core.typing.templates import (AbstractTemplate, AttributeTemplate,
+                                         ConcreteTemplate)
 from numba.core.typing import signature as nb_signature
+from numba.core.typing.typeof import typeof
+
 
 import operator
 
@@ -102,6 +107,40 @@ def __eq__(self, other):
         return self.value_type == other.value_type
 
 
+# For typing a Masked constant value defined outside a kernel (e.g. captured in
+# a closure).
+@typeof_impl.register(classes.Masked)
+def typeof_interval(val, c):
+    return MaskedType(typeof(val))
+
+
+@cuda_decl_registry.register
+class MaskedConstructor(ConcreteTemplate):
+    key = classes.Masked
+
+    cases = [nb_signature(MaskedType(t), t, types.boolean)
+             for t in (types.integer_domain | types.real_domain)]
+
+
+make_attribute_wrapper(MaskedType, 'value', 'value')
+make_attribute_wrapper(MaskedType, 'valid', 'valid')
+
+
+@cuda_decl_registry.register_attr
+class ClassesTemplate(AttributeTemplate):
+    key = types.Module(classes)
+
+    def resolve_Masked(self, mod):
+        breakpoint()
+        return types.Function(MaskedConstructor)
+
+
+# For typing classes.Masked
+cuda_decl_registry.register_global(classes, types.Module(classes))
+# For typing bare Masked
+cuda_decl_registry.register_global(classes.Masked, types.Function(MaskedConstructor))
+
+
 # Tell numba how `MaskedType` is constructed on the backend in terms
 # of primitive things that exist at the LLVM level
 @register_model(MaskedType)
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 1a8367e5532..e0cc0239001 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,11 +1,13 @@
 import operator
 import pytest
 
-from numba import types
+from numba import cuda, types
 from numba.cuda import compile_ptx
 
 from cudf import NA
 from cudf.core.udf.typing import MaskedType
+from cudf.core.udf.classes import Masked
+
 
 arith_ops = (
     operator.add,
@@ -66,6 +68,40 @@ def func(x):
     ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
 
 
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+def test_execute_masked_binary(op, ty):
+
+    @cuda.jit(device=True)
+    def func(x, y):
+        return op(x, y)
+
+    @cuda.jit(debug=True)
+    def test_kernel(x, y):
+        # Reference result with unmasked value
+        u = func(x, y)
+
+        # Construct masked values to test with
+        x0, y0 = Masked(x, False), Masked(y, False)
+        x1, y1 = Masked(x, True), Masked(y, True)
+
+        # Call with masked types
+        r0 = func(x0, y0)
+        r1 = func(x1, y1)
+
+        # Check masks are as expected, and unmasked result matches masked
+        # result
+        if r0.valid:
+            raise RuntimeError('Expected r0 to be invalid')
+        if not r1.valid:
+            raise RuntimeError('Expected r1 to be valid')
+        if u != r1.value:
+            print('Values: ', u, r1.value)
+            raise RuntimeError('u != r1.value')
+
+    test_kernel[1, 1](1, 2)
+
+
 @pytest.mark.parametrize('op', ops)
 @pytest.mark.parametrize('ty', number_types, ids=number_ids)
 @pytest.mark.parametrize('constant', [1, 1.5])

From e440770335a3ff19c185a6f4739e2905f1557b74 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Tue, 25 May 2021 12:33:46 +0100
Subject: [PATCH 080/132] Begin adding tests for operator.is_ with NA

---
 .../cudf/tests/test_extension_compilation.py  | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index e0cc0239001..00a924f926a 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -177,3 +177,33 @@ def func(x, y):
         ty2 = MaskedType(ty2)
 
     ptx, resty = compile_ptx(func, (ty1, ty2), cc=cc, device=True)
+
+
+def func_x_is_na(x):
+    return x is NA
+
+
+def func_na_is_x(x):
+    return NA is x
+
+
+@pytest.mark.parametrize('fn', (func_x_is_na, func_na_is_x))
+def test_is_na(fn):
+
+    valid = Masked(1, True)
+    invalid = Masked(1, False)
+
+    device_fn = cuda.jit(device=True)(fn)
+
+    @cuda.jit(debug=True)
+    def test_kernel():
+        valid_result = device_fn(valid)
+        invalid_result = device_fn(invalid)
+
+        if not valid_result:
+            raise RuntimeError('Valid masked value is NA and should not be')
+
+        if invalid_result:
+            raise RuntimeError('Invalid masked value is not NA and should be')
+
+    test_kernel[1, 1]()

From a6f67fa3d51de76054516370f2e7e25a4f502a5e Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Tue, 25 May 2021 12:42:32 +0100
Subject: [PATCH 081/132] Some tidy-ups in typing

- Edit some comments
- No need to keep constructing a new `NAType()`, as it is not
  parameterized.
- Rename `MaskedScalarConstOp` to `MaskedScalarScalarOp` to more closely
  reflect what it handles - it deals with all scalars because it types
  things with `types.Number`. (If it only accepted `types.Literal`,
  this would be more like "const".)
---
 python/cudf/cudf/core/udf/typing.py | 41 +++++++++++++++++++----------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 54733a07aae..d3526354d65 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -36,11 +36,11 @@
 
 class MaskedType(types.Type):
     '''
-    A numba type consiting of a value of some primitive type
+    A Numba type consisting of a value of some primitive type
     and a validity boolean, over which we can define math ops
     '''
     def __init__(self, value):
-        # MaskedType in numba shall be parameterized
+        # MaskedType in Numba shall be parameterized
         # with a value type
         super().__init__(name="Masked")
         self.value_type = value
@@ -96,14 +96,19 @@ def f(x, y):
         # and that other thing
         unified = context.unify_pairs(self.value_type, other)
         if unified is None:
+            # The value types don't unify, so there is no unified masked type
             return None
 
         return MaskedType(unified)
 
     def __eq__(self, other):
+        # Equality is required for determining whether a cast is required
+        # between two different types.
         if not isinstance(other, MaskedType):
+            # Require a cast when the other type is not masked
             return False
 
+        # Require a cast for another masked with a different value type
         return self.value_type == other.value_type
 
 
@@ -114,6 +119,8 @@ def typeof_interval(val, c):
     return MaskedType(typeof(val))
 
 
+# Implemented typing for Masked(value, valid) - the construction of a Masked
+# type in a kernel.
 @cuda_decl_registry.register
 class MaskedConstructor(ConcreteTemplate):
     key = classes.Masked
@@ -122,10 +129,12 @@ class MaskedConstructor(ConcreteTemplate):
              for t in (types.integer_domain | types.real_domain)]
 
 
+# Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`.
 make_attribute_wrapper(MaskedType, 'value', 'value')
 make_attribute_wrapper(MaskedType, 'valid', 'valid')
 
 
+# Typing for `classes.Masked`
 @cuda_decl_registry.register_attr
 class ClassesTemplate(AttributeTemplate):
     key = types.Module(classes)
@@ -135,10 +144,11 @@ def resolve_Masked(self, mod):
         return types.Function(MaskedConstructor)
 
 
-# For typing classes.Masked
+# Registration of the global is also needed for Numba to type classes.Masked
 cuda_decl_registry.register_global(classes, types.Module(classes))
-# For typing bare Masked
-cuda_decl_registry.register_global(classes.Masked, types.Function(MaskedConstructor))
+# For typing bare Masked (as in `from .classes import Masked`
+cuda_decl_registry.register_global(classes.Masked,
+                                   types.Function(MaskedConstructor))
 
 
 # Tell numba how `MaskedType` is constructed on the backend in terms
@@ -180,6 +190,9 @@ def unify(self, context, other):
             return MaskedType(other)
 
 
+na_type = NAType()
+
+
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):
     '''
@@ -187,7 +200,7 @@ def typeof_na(val, c):
     Effectively make it so numba sees `cudf.NA` as an
     instance of this NAType -> handle it accordingly.
     '''
-    return NAType()
+    return na_type
 
 
 register_model(NAType)(models.OpaqueModel)
@@ -235,27 +248,27 @@ def generic(self, args, kws):
             return nb_signature(
                 args[0],
                 args[0],
-                NAType(),
+                na_type,
             )
         elif isinstance(args[0], NAType) and isinstance(args[1], MaskedType):
             return nb_signature(
                 args[1],
-                NAType(),
+                na_type,
                 args[1]
             )
 
 
-class MaskedScalarConstOp(AbstractTemplate):
+class MaskedScalarScalarOp(AbstractTemplate):
     def generic(self, args, kws):
         '''
-        Typing for `Masked` + a constant literal
+        Typing for `Masked` + a scalar.
         handles situations like `x + 1`
         '''
         if isinstance(args[0], MaskedType) and isinstance(
             args[1], types.Number
         ):
-            # In the case of op(Masked, constant), we resolve the type between
-            # the Masked value_type and the constant's type directly
+            # In the case of op(Masked, scalar), we resolve the type between
+            # the Masked value_type and the scalar's type directly
             return_type = self.context.resolve_function_type(
                 self.key, (args[0].value_type, args[1]), kws
             ).return_type
@@ -287,7 +300,7 @@ def generic(self, args, kws):
             return nb_signature(
                 types.boolean,
                 args[0],
-                NAType())
+                na_type)
 
 
 @cuda_decl_registry.register_global(operator.truth)
@@ -310,4 +323,4 @@ def generic(self, args, kws):
     # Every op shares the same typing class
     cuda_decl_registry.register_global(op)(MaskedScalarArithOp)
     cuda_decl_registry.register_global(op)(MaskedScalarNullOp)
-    cuda_decl_registry.register_global(op)(MaskedScalarConstOp)
+    cuda_decl_registry.register_global(op)(MaskedScalarScalarOp)

From d9e8fdb96303d4f32ed5d8eeb7aaccbffb887a11 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Tue, 25 May 2021 21:55:50 +0100
Subject: [PATCH 082/132] Fix test and implementation of `is NA`

---
 python/cudf/cudf/core/udf/lowering.py         | 21 ++++++++++++++++---
 python/cudf/cudf/core/udf/typing.py           |  9 ++++++--
 .../cudf/tests/test_extension_compilation.py  |  8 +++----
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 3ebec1258dd..35492130e26 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -183,13 +183,20 @@ def register_const_op(op):
 
 
 @cuda_lower(operator.is_, MaskedType, NAType)
+@cuda_lower(operator.is_, NAType, MaskedType)
 def masked_scalar_is_null_impl(context, builder, sig, args):
     '''
-    Implement `MaskedType` + constant
+    Implement `MaskedType` is `NA`
     '''
-    masked_type, na = sig.args
+    if isinstance(sig.args[1], NAType):
+        masked_type, na = sig.args
+        value = args[0]
+    else:
+        na, masked_type = sig.args
+        value = args[1]
+
     indata = cgutils.create_struct_proxy(masked_type)(
-        context, builder, value=args[0]
+        context, builder, value=value
     )
     result = cgutils.alloca_once(builder, ir.IntType(1))
     with builder.if_else(indata.valid) as (then, otherwise):
@@ -257,3 +264,11 @@ def masked_constructor(context, builder, sig, args):
     masked.value = value
     masked.valid = valid
     return masked._getvalue()
+
+
+@cuda_impl_registry.lower_constant(MaskedType)
+def lower_constant_masked(context, builder, ty, val):
+    masked = cgutils.create_struct_proxy(ty)(context, builder)
+    masked.value = context.get_constant(ty.value_type, val.value)
+    masked.valid = context.get_constant(types.boolean, val.valid)
+    return masked._getvalue()
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index d3526354d65..afc3547c17f 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -115,8 +115,8 @@ def __eq__(self, other):
 # For typing a Masked constant value defined outside a kernel (e.g. captured in
 # a closure).
 @typeof_impl.register(classes.Masked)
-def typeof_interval(val, c):
-    return MaskedType(typeof(val))
+def typeof_masked(val, c):
+    return MaskedType(typeof(val.value))
 
 
 # Implemented typing for Masked(value, valid) - the construction of a Masked
@@ -301,6 +301,11 @@ def generic(self, args, kws):
                 types.boolean,
                 args[0],
                 na_type)
+        elif isinstance(args[1], MaskedType) and isinstance(args[0], NAType):
+            return nb_signature(
+                types.boolean,
+                na_type,
+                args[1])
 
 
 @cuda_decl_registry.register_global(operator.truth)
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 00a924f926a..6aa5a6d5c86 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -197,13 +197,13 @@ def test_is_na(fn):
 
     @cuda.jit(debug=True)
     def test_kernel():
-        valid_result = device_fn(valid)
-        invalid_result = device_fn(invalid)
+        valid_is_na = device_fn(valid)
+        invalid_is_na = device_fn(invalid)
 
-        if not valid_result:
+        if valid_is_na:
             raise RuntimeError('Valid masked value is NA and should not be')
 
-        if invalid_result:
+        if not invalid_is_na:
             raise RuntimeError('Invalid masked value is not NA and should be')
 
     test_kernel[1, 1]()

From 1d6755a86a29de178e2fe3b20b12691074eccdcf Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Wed, 26 May 2021 21:12:22 +0100
Subject: [PATCH 083/132] Add tests of comparison with NA behaviour

In a slight deviation from pandas (and Python for comparisons with None)
semantics, any comparison with NA returns False.

Comparisons between NA and non-masked values require some additional
implementation (ideally a typing + lowering for a correct cast).
---
 .../cudf/tests/test_extension_compilation.py  | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index 6aa5a6d5c86..efd0509f475 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -207,3 +207,111 @@ def test_kernel():
             raise RuntimeError('Invalid masked value is not NA and should be')
 
     test_kernel[1, 1]()
+
+
+def func_lt_na(x):
+    return x < NA
+
+
+def func_gt_na(x):
+    return x > NA
+
+
+def func_eq_na(x):
+    return x == NA
+
+
+def func_ne_na(x):
+    return x != NA
+
+
+def func_ge_na(x):
+    return x >= NA
+
+
+def func_le_na(x):
+    return x <= NA
+
+
+def func_na_lt(x):
+    return NA < x  # reflected: NA is the left operand
+
+
+def func_na_gt(x):
+    return NA > x  # reflected: NA is the left operand
+
+
+def func_na_eq(x):
+    return NA == x  # reflected: NA is the left operand
+
+
+def func_na_ne(x):
+    return NA != x  # reflected: NA is the left operand
+
+
+def func_na_ge(x):
+    return NA >= x  # reflected: NA is the left operand
+
+
+def func_na_le(x):
+    return NA <= x  # reflected: NA is the left operand
+
+
+na_comparison_funcs = (
+    func_lt_na,
+    func_gt_na,
+    func_eq_na,
+    func_ne_na,
+    func_ge_na,
+    func_le_na,
+    func_na_lt,
+    func_na_gt,
+    func_na_eq,
+    func_na_ne,
+    func_na_ge,
+    func_na_le,
+)
+
+
+@pytest.mark.parametrize('fn', na_comparison_funcs)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+def test_na_masked_comparisons(fn, ty):
+
+    device_fn = cuda.jit(device=True)(fn)
+
+    @cuda.jit(debug=True)
+    def test_kernel():
+        unmasked = ty(1)
+        valid_masked = Masked(unmasked, True)
+        invalid_masked = Masked(unmasked, False)
+
+        valid_cmp_na = device_fn(valid_masked)
+        invalid_cmp_na = device_fn(invalid_masked)
+
+        if valid_cmp_na:
+            raise RuntimeError('Valid masked value compared True with NA')
+
+        if invalid_cmp_na:
+            raise RuntimeError('Invalid masked value compared True with NA')
+
+    test_kernel[1, 1]()
+
+
+# xfail because scalars do not yet cast for a comparison to NA
+@pytest.mark.xfail
+@pytest.mark.parametrize('fn', na_comparison_funcs)
+@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+def test_na_scalar_comparisons(fn, ty):
+
+    device_fn = cuda.jit(device=True)(fn)
+
+    @cuda.jit(debug=True)
+    def test_kernel():
+        unmasked = ty(1)
+
+        unmasked_cmp_na = device_fn(unmasked)
+
+        if unmasked_cmp_na:
+            raise RuntimeError('Unmasked value compared True with NA')
+
+    test_kernel[1, 1]()

From 671792c1dd1c96c36223608583645b50ed30df01 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 28 May 2021 11:58:32 -0700
Subject: [PATCH 084/132] test reflected const/masked ops - separate lowering
 to account for non-commutative ops

---
 python/cudf/cudf/core/udf/lowering.py         | 47 +++++++++++++++----
 python/cudf/cudf/tests/test_udf_masked_ops.py | 16 +++++++
 2 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 35492130e26..e617b7cc5a7 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -127,17 +127,12 @@ def masked_scalar_null_op_impl(context, builder, sig, args):
 
 
 def make_const_op(op):
-    def masked_scalar_const_op_impl(context, builder, sig, input_values):
+    def masked_scalar_const_op_impl(context, builder, sig, args):
         '''
         Implement `MaskedType` + constant
         '''
-        # Which way round are the argument types?
-        if isinstance(sig.args[0], MaskedType):
-            masked_type, const_type = sig.args
-            masked_value, numeric_value = input_values
-        else:
-            const_type, masked_type = sig.args
-            numeric_value, masked_value = input_values
+        masked_type, const_type = sig.args
+        masked_value, numeric_value = args
 
         return_type = sig.return_type
         masked_input_type = MaskedType(masked_type.value_type)
@@ -150,7 +145,6 @@ def masked_scalar_const_op_impl(context, builder, sig, input_values):
         )
         result.valid = context.get_constant(types.boolean, 0)
         with builder.if_then(indata.valid):
-
             result.value = context.compile_internal(
                 builder,
                 lambda x, y: op(x, y),
@@ -166,11 +160,44 @@ def masked_scalar_const_op_impl(context, builder, sig, input_values):
         return result._getvalue()
     return masked_scalar_const_op_impl
 
+def make_reflected_const_op(op):
+    def masked_scalar_reflected_const_op_impl(context, builder, sig, args):
+        const_type, masked_type = sig.args
+        numeric_value, masked_value = args
+
+        return_type = sig.return_type
+        masked_input_type = MaskedType(masked_type.value_type)
+        indata = cgutils.create_struct_proxy(masked_input_type)(
+            context, builder, value=masked_value
+        )
+        masked_return_type = MaskedType(return_type.value_type)
+        result = cgutils.create_struct_proxy(masked_return_type)(
+            context, builder
+        )
+        result.valid = context.get_constant(types.boolean, 0)
+        with builder.if_then(indata.valid):
+            result.value = context.compile_internal(
+                builder,
+                lambda x, y: op(x, y),
+                nb_signature(
+                    return_type.value_type,
+                    const_type,
+                    masked_type.value_type
+                ),
+                (numeric_value, indata.value)
+            )
+            result.valid = context.get_constant(types.boolean, 1)
+
+        return result._getvalue()
+    return masked_scalar_reflected_const_op_impl
+        
 
 def register_const_op(op):
     to_lower_op = make_const_op(op)
     cuda_lower(op, MaskedType, types.Number)(to_lower_op)
-    cuda_lower(op, types.Number, MaskedType)(to_lower_op)
+
+    to_lower_op_reflected = make_reflected_const_op(op)
+    cuda_lower(op, types.Number, MaskedType)(to_lower_op_reflected)
 
 
 # register all lowering at init
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 90c93532cb0..77c082ba9a6 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -103,6 +103,22 @@ def func_gdf(x):
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
+@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize('constant', [1, 1.5])
+def test_arith_masked_vs_constant_reflected(op, constant):
+    def func_pdf(x):
+        return op(constant, x)
+
+    @nulludf
+    def func_gdf(x):
+        return op(constant, x)
+
+    # Single column with one null -> only that row should come back NA
+    gdf = cudf.DataFrame({
+        'data': [1, 2, None]
+    })
+
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 @pytest.mark.parametrize('op', comparison_ops)
 @pytest.mark.parametrize('constant', [1, 1.5])

From c3007dec0f550bf6346879e277a3b0749a0198cc Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 28 May 2021 13:08:46 -0700
Subject: [PATCH 085/132] unify masked with masked

---
 python/cudf/cudf/core/udf/typing.py           |  5 +-
 python/cudf/cudf/tests/test_udf_masked_ops.py | 73 +++++++++++++------
 2 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index afc3547c17f..49668869eff 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -43,6 +43,8 @@ def __init__(self, value):
         # MaskedType in Numba shall be parameterized
         # with a value type
         super().__init__(name="Masked")
+        if isinstance(value, MaskedType):
+            breakpoint()
         self.value_type = value
 
     def __repr__(self):
@@ -90,6 +92,8 @@ def f(x, y):
         # MaskedType with the original type as its value_type
         if isinstance(other, NAType):
             return self
+        elif isinstance(other, MaskedType):
+            return MaskedType(context.unify_pairs(self.value_type, other.value_type))
 
         # if we have MaskedType and something that results in a
         # scalar, unify between the MaskedType's value_type
@@ -140,7 +144,6 @@ class ClassesTemplate(AttributeTemplate):
     key = types.Module(classes)
 
     def resolve_Masked(self, mod):
-        breakpoint()
         return types.Function(MaskedConstructor)
 
 
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 77c082ba9a6..6b50ca8dd7e 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -120,25 +120,6 @@ def func_gdf(x):
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
-@pytest.mark.parametrize('op', comparison_ops)
-@pytest.mark.parametrize('constant', [1, 1.5])
-def test_compare_masked_vs_constant(op, constant):
-    '''
-    technically the exact same test as above
-    '''
-    def func_pdf(x):
-        return op(x, constant)
-
-    @nulludf
-    def func_gdf(x):
-        return op(x, constant)
-
-    # Just a single column -> result will be all NA
-    gdf = cudf.DataFrame({
-        'data': [1, 2, None]
-    })
-    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
-
 
 @pytest.mark.parametrize('op', arith_ops)
 def test_arith_masked_vs_null(op):
@@ -194,6 +175,12 @@ def func_gdf(x, y):
 @pytest.mark.parametrize('dtype_a', list(NUMERIC_TYPES))
 @pytest.mark.parametrize('dtype_b', list(NUMERIC_TYPES))
 def test_apply_mixed_dtypes(dtype_a, dtype_b):
+    '''
+    Test that operations can be performed between columns 
+    of different dtypes and return a column with the correct
+    values and nulls
+    '''
+    # TODO: Parameterize over the op here
     def func_pdf(x, y):
         return x + y
 
@@ -215,16 +202,21 @@ def func_gdf(x, y):
     5, 5.5
 ])
 def test_apply_return_literal(val):
+    '''
+    Test unification codepath for scalars and MaskedType
+    makes sure that numba knows how to cast a scalar value
+    to a MaskedType
+    '''
     def func_pdf(x, y):
-        if x is pd.NA:
+        if x is not pd.NA and x < 2:
             return val
         else:
             return x + y
 
     @nulludf
     def func_gdf(x, y):
-        if x is cudf.NA:
-            return val  # Masked(5, True)
+        if x  is not cudf.NA and x < 2:
+            return val
         else:
             return x + y
 
@@ -273,3 +265,40 @@ def func_gdf(x):
 
     gdf = cudf.DataFrame({'a': [1, 3, 6]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
+
+
+def test_apply_everything():
+    def func_pdf(w, x, y, z):
+        if x is pd.NA:
+            return w + y - z
+        elif ((z > y) is not pd.NA) and z > y:
+            return x
+        elif ((x + y) is not pd.NA) and x + y == 0:
+            return z / x
+        elif x + y is pd.NA:
+            return 2.5
+        else:
+            return (y > 2)
+
+    @nulludf
+    def func_gdf(w, x, y, z):
+        if x is cudf.NA:
+            return w + y - z
+        elif ((z > y) is not cudf.NA) and z > y:
+            return x
+        elif ((x + y) is not cudf.NA) and x + y == 0:
+            return z / x
+        elif x + y is cudf.NA:
+            return 2.5
+        else:
+            return (y > 2)
+
+    gdf = cudf.DataFrame(
+        {
+            'a': [1, 3, 6, 0, None, 5, None],
+            'b': [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0],
+            'c': [2, 3, 6, 0, None, 5, None],
+            'd': [4, None, 6, 0, None, 5, None],
+        }
+    )
+    run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)

From 100ac44645fa5b5d364b751f858ccb242cca5306 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 28 May 2021 14:41:48 -0700
Subject: [PATCH 086/132] allocate and build the final column in libcudf rather
 than cython

---
 cpp/include/cudf/transform.hpp          |  2 --
 cpp/src/transform/transform.cpp         | 21 +++++++++++++--------
 python/cudf/cudf/_lib/cpp/transform.pxd |  4 +---
 python/cudf/cudf/_lib/transform.pyx     | 15 +++------------
 python/cudf/cudf/core/frame.py          |  9 ++++-----
 5 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 7af849357ec..35ba7fd53f6 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -58,8 +58,6 @@ std::unique_ptr<column> generalized_masked_op(
   table_view data_view,
   std::string const& binary_udf,
   data_type output_type,
-  column_view const& outcol_view,
-  column_view const& outmsk_view,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index d730ebf5488..b48a3058b31 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -173,18 +173,25 @@ std::unique_ptr<column> transform(column_view const& input,
 }
 
 std::unique_ptr<column> generalized_masked_op(table_view data_view, 
-                                               std::string const& binary_udf, 
+                                               std::string const& udf, 
                                                data_type output_type, 
-                                               column_view const& outcol_view,
-                                               column_view const& outmsk_view,
                                                rmm::mr::device_memory_resource* mr)
 {
   rmm::cuda_stream_view stream = rmm::cuda_stream_default;
-  transformation::jit::generalized_operation(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
+  std::unique_ptr<column> output = make_fixed_width_column(
+    output_type, data_view.num_rows()
+  );
+  std::unique_ptr<column> output_mask = make_fixed_width_column(
+    cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows()
+  );
 
-  std::unique_ptr<column> output;
+  mutable_column_view output_view = *output;
+  mutable_column_view output_mask_view = *output_mask;
 
+  transformation::jit::generalized_operation(data_view, udf, output_type, output_view, output_mask_view, mr);
 
+  auto final_output_mask = cudf::bools_to_mask(output_mask_view);
+  output.get()->set_null_mask(std::move(*(final_output_mask.first)));
   return output;
 }
 
@@ -205,11 +212,9 @@ std::unique_ptr<column> transform(column_view const& input,
 std::unique_ptr<column> generalized_masked_op(table_view data_view,
                                               std::string const& binary_udf, 
                                               data_type output_type, 
-                                              column_view const& outcol_view,
-                                              column_view const& outmsk_view,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return detail::generalized_masked_op(data_view, binary_udf, output_type, outcol_view, outmsk_view, mr);
+  return detail::generalized_masked_op(data_view, binary_udf, output_type, mr);
 }
 
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index 82d45811cb3..c8e84a3a2e9 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -40,10 +40,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
 
     cdef unique_ptr[column] generalized_masked_op(
         table_view data_view,
-        string binary_udf,
+        string udf,
         data_type output_type,
-        column_view outcol_view,
-        column_view outmask_view,
     ) except +
 
     cdef pair[unique_ptr[table], unique_ptr[column]] encode(
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 63cd3b30a81..23854cd3e98 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -122,34 +122,25 @@ def transform(Column input, op):
 
     return Column.from_unique_ptr(move(c_output))
 
-def masked_udf(Table incols, op, Column output_column, Column output_mask):
+def masked_udf(Table incols, op, output_type):
     cdef table_view data_view = incols.data_view()
     cdef string c_str = op.encode("UTF-8")
     cdef type_id c_tid
     cdef data_type c_dtype
 
     c_tid = <type_id> (
-        <underlying_type_t_type_id> np_to_cudf_types[output_column.dtype]
+        <underlying_type_t_type_id> np_to_cudf_types[output_type]
     )
     c_dtype = data_type(c_tid)
 
-    cdef column_view outcol_view = output_column.view()
-    cdef column_view outmsk_view = output_mask.view()
-
     with nogil:
         c_output = move(libcudf_transform.generalized_masked_op(
             data_view,
             c_str,
             c_dtype,
-            outcol_view,
-            outmsk_view
         ))
 
-
-    output_mask_real = bools_to_mask(output_mask)
-    output_column = output_column.set_mask(output_mask_real)
-
-    return output_column
+    return Column.from_unique_ptr(move(c_output))
 
 
 def table_encode(Table input):
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index fc354708f4c..86abfffa697 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1455,12 +1455,11 @@ def _quantiles(
         return result
 
     def _apply(self, func):
+        '''
+        Apply `func` across the rows of the frame. 
+        '''
         output_dtype, ptx = cudf.core.udf.pipeline.compile_udf(func, self.dtypes)
-
-        output_column = cudf.core.column.column_empty(row_count=len(self), dtype=output_dtype)
-        output_mask = cudf.core.column.column_empty(row_count=len(self), dtype='bool')
-
-        result = cudf._lib.transform.masked_udf(self, ptx, output_column, output_mask)
+        result = cudf._lib.transform.masked_udf(self, ptx, output_dtype)
         return result
 
 

From 91c91eb0779a99aff8bb90c44add024b83c4a400 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 07:24:04 -0700
Subject: [PATCH 087/132] refactor c++ a bit

---
 cpp/src/transform/transform.cpp | 37 +++++++++++++--------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index b48a3058b31..777be859180 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -105,44 +105,37 @@ void generalized_operation(table_view data_view,
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
                      binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
 
-  int n_cols = data_view.num_columns();
-  std::vector<void*> results((n_cols * 3) + 3);
+  // {size, out_ptr, out_mask_ptr, col0_ptr, col0_mask_ptr, col0_offset, col1_ptr...}
+  std::vector<void*> kernel_args((data_view.num_columns() * 3) + 3);
 
   cudf::size_type size = outcol_view.size();
   const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view);
   const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view);
+  kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr});
+
+  std::vector<const void*> data_ptrs(data_view.num_columns());
+  std::vector<cudf::bitmask_type const*> mask_ptrs(data_view.num_columns());
+  std::vector<int64_t> offsets(data_view.num_columns());
 
-  results[0] = &size;
-  results[1] = &outcol_ptr;
-  results[2] = &outmsk_ptr;
   column_view col;
+  for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
+    col = data_view.column(col_idx);
 
-  std::vector<const void*> data_ptrs(n_cols);
-  std::vector<cudf::bitmask_type const*> mask_ptrs(n_cols);
-  std::vector<int64_t> offsets(n_cols);
+    data_ptrs[col_idx] = cudf::jit::get_data_ptr(col);
+    mask_ptrs[col_idx] = col.null_mask();
+    offsets[col_idx] = col.offset();
 
-  for (int i = 0; i < n_cols; i++) {
-    col = data_view.column(i);
-    data_ptrs[i] = cudf::jit::get_data_ptr(col);
-    mask_ptrs[i] = col.null_mask();
-    offsets[i] = col.offset();
-  }
+    kernel_args.insert(kernel_args.begin() + 3 * (col_idx + 1), {&data_ptrs[col_idx], &mask_ptrs[col_idx], &offsets[col_idx]});
 
-  int idx = 3;
-  for (int i = 0; i < n_cols; i++) {
-    results[idx] = &data_ptrs[i];
-    results[idx + 1] = &mask_ptrs[i];
-    results[idx + 2] = &offsets[i];
-    idx += 3;
   }
-  
+
 
   rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
     .get_kernel(
       generic_kernel_name, {}, {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, {"-arch=sm_."})  //
     ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())                                   //
-    ->launch(results.data());
+    ->launch(kernel_args.data());
 
 }
 

From 1fa3cab50e2755ae53fa772998cf761f7e9ce8e7 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 07:54:02 -0700
Subject: [PATCH 088/132] use offset_type rather than hardcoding int64_t
 incorrectly

---
 cpp/src/transform/transform.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 777be859180..10bbe6dc45a 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -70,7 +70,9 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
 {
   int entries_per_col = 3; // type, mask, offset
   std::string mask_type = "uint32_t*";
-  std::string offset_type = "int64_t";
+
+  // int32_t
+  std::string offset_type = cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
 
   std::vector<std::string> template_types(
     // output type comes first and is one extra
@@ -115,7 +117,7 @@ void generalized_operation(table_view data_view,
 
   std::vector<const void*> data_ptrs(data_view.num_columns());
   std::vector<cudf::bitmask_type const*> mask_ptrs(data_view.num_columns());
-  std::vector<int64_t> offsets(data_view.num_columns());
+  std::vector<cudf::offset_type> offsets(data_view.num_columns());
 
   column_view col;
   for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
@@ -129,7 +131,6 @@ void generalized_operation(table_view data_view,
 
   }
 
-
   rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
     .get_kernel(

From 6125dc0bf1bce7b767adda70dfb91cd7fea660e0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 09:15:34 -0700
Subject: [PATCH 089/132] a little bit more refactoring

---
 cpp/src/transform/transform.cpp | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 10bbe6dc45a..1a1f4d2128b 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -68,23 +68,21 @@ void unary_operation(mutable_column_view output,
 
 std::vector<std::string> make_template_types(column_view outcol_view, table_view data_view)
 {
-  int entries_per_col = 3; // type, mask, offset
-  std::string mask_type = "uint32_t*";
-
-  // int32_t
+  std::string mskptr_type = cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::bitmask_type>())) + "*";
   std::string offset_type = cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
 
-  std::vector<std::string> template_types(
-    // output type comes first and is one extra
-    (data_view.num_columns() * entries_per_col) + 1
-  );
+  std::vector<std::string> template_types(1);
   template_types[0] = cudf::jit::get_type_name(outcol_view.type());
 
-  for (int i = 0; i < data_view.num_columns(); i++) {
-    int offset = (i * entries_per_col) + 1;
-    template_types[offset] = cudf::jit::get_type_name(data_view.column(i).type()) + "*";
-    template_types[offset + 1] = mask_type;
-    template_types[offset + 2] = offset_type;
+  for (auto const& col: data_view) {
+    auto these_types = {
+       cudf::jit::get_type_name(col.type()) + "*",
+       mskptr_type,
+       offset_type
+    };
+
+    template_types.insert(template_types.end(), these_types);
+    
   }
   return template_types;
 }
@@ -99,6 +97,10 @@ void generalized_operation(table_view data_view,
 {
 
   std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
+  for (size_t i = 0; i < template_types.size(); i++) {
+      std::cout << template_types[i] << std::endl;
+  };
+  std::cout << template_types.size() << std::endl;
 
   std::string generic_kernel_name = 
   jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")

From 59e1209c47def4eb6756c2d86c0ec2eb118a537c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 11:18:44 -0700
Subject: [PATCH 090/132] remove debugging code

---
 cpp/src/transform/transform.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 1a1f4d2128b..03d8fa11977 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -97,10 +97,6 @@ void generalized_operation(table_view data_view,
 {
 
   std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
-  for (size_t i = 0; i < template_types.size(); i++) {
-      std::cout << template_types[i] << std::endl;
-  };
-  std::cout << template_types.size() << std::endl;
 
   std::string generic_kernel_name = 
   jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")

From c1324b8ebbcab090e84d52453859099e60558b38 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 11:38:55 -0700
Subject: [PATCH 091/132] move repeated op lists to their own module

---
 python/cudf/cudf/core/udf/_ops.py     | 21 +++++++++++++++++++++
 python/cudf/cudf/core/udf/lowering.py | 21 +--------------------
 python/cudf/cudf/core/udf/typing.py   | 23 +----------------------
 3 files changed, 23 insertions(+), 42 deletions(-)
 create mode 100644 python/cudf/cudf/core/udf/_ops.py

diff --git a/python/cudf/cudf/core/udf/_ops.py b/python/cudf/cudf/core/udf/_ops.py
new file mode 100644
index 00000000000..0fce3fdbafb
--- /dev/null
+++ b/python/cudf/cudf/core/udf/_ops.py
@@ -0,0 +1,21 @@
+import operator
+
+arith_ops = [
+    operator.add,
+    operator.sub,
+    operator.mul,
+    operator.truediv,
+    operator.floordiv,
+    operator.mod,
+    operator.pow
+
+]
+
+comparison_ops = [
+    operator.eq,
+    operator.ne,
+    operator.lt,
+    operator.le,
+    operator.gt,
+    operator.ge
+]
diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index e617b7cc5a7..3fffc8ac972 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -11,26 +11,7 @@
 from numba.extending import lower_builtin, types
 from llvmlite import ir
 
-arith_ops = [
-    operator.add,
-    operator.sub,
-    operator.mul,
-    operator.truediv,
-    operator.floordiv,
-    operator.mod,
-    operator.pow
-
-]
-
-comparison_ops = [
-    operator.eq,
-    operator.ne,
-    operator.lt,
-    operator.le,
-    operator.gt,
-    operator.ge
-]
-
+from ._ops import arith_ops, comparison_ops
 
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_na(context, builder, ty, pyval):
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 49668869eff..3ed0d0fd265 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -11,28 +11,7 @@
 
 
 import operator
-
-arith_ops = [
-    operator.add,
-    operator.sub,
-    operator.mul,
-    operator.truediv,
-    operator.floordiv,
-    operator.mod,
-    operator.pow
-
-]
-
-
-comparison_ops = [
-    operator.eq,
-    operator.ne,
-    operator.lt,
-    operator.le,
-    operator.gt,
-    operator.ge
-]
-
+from ._ops import arith_ops, comparison_ops
 
 class MaskedType(types.Type):
     '''

From ed79368b27fe9f6fc57a7738d9fc79a5ff40f3eb Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 11:52:26 -0700
Subject: [PATCH 092/132] remove old ipython notebook

---
 notebooks/udf-nulls-demo-final.ipynb | 1574 --------------------------
 1 file changed, 1574 deletions(-)
 delete mode 100644 notebooks/udf-nulls-demo-final.ipynb

diff --git a/notebooks/udf-nulls-demo-final.ipynb b/notebooks/udf-nulls-demo-final.ipynb
deleted file mode 100644
index bb72fd04378..00000000000
--- a/notebooks/udf-nulls-demo-final.ipynb
+++ /dev/null
@@ -1,1574 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "brutal-princeton",
-   "metadata": {},
-   "source": [
-    "# NA in cuDF UDFs\n",
-    "cuDFs design decision to store null informtion in bitmasks is really smart, and makes things very performant and tractable in memory bound circumstances such as GPU operations. However this design when coupled with the natural inefficiencies that arise from any kind of serial iteration over our data has made `<NA>` support in general user defined functions hard to solution for. This notebook offers an approach based on jitting a UDF's arguments as a special custom Numba type to produce a generic PTX function. This function is than inlined into a general kernel in libcudf and passed the relevant data and masks inside of libcudf.  "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "scenic-laugh",
-   "metadata": {},
-   "source": [
-    "#### Problem setup: concrete example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "above-athletics",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import cudf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "declared-correspondence",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>x</th>\n",
-       "      <th>y</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      x     y\n",
-       "0     1     1\n",
-       "1  <NA>     2\n",
-       "2     3  <NA>"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = cudf.DataFrame({\n",
-    "    'x': [1, None, 3],\n",
-    "    'y': [1, 2, None]\n",
-    "})\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "expected-milwaukee",
-   "metadata": {},
-   "source": [
-    "Consider the following UDF on two variables adapted from https://docs.rapids.ai/api/cudf/stable/guide-to-udfs.html. This API is fairly different from the pandas API, for several reasons:\n",
-    "- In cuDF, We need to write a loop over arrays in classic numba syntax\n",
-    "- In cuDF, the function returns into an output column we provide as an argument\n",
-    "- The result is different!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "tired-niger",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>x</th>\n",
-       "      <th>y</th>\n",
-       "      <th>out</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>2</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>3</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      x     y   out\n",
-       "0     1     1     2\n",
-       "1  <NA>     2  <NA>\n",
-       "2     3  <NA>     3"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "def pandas_add(x, y):\n",
-    "    if x is not pd.NA and x < 2:\n",
-    "        return x + y\n",
-    "    else:\n",
-    "        return x\n",
-    "    \n",
-    "pandas_df = df.to_pandas(nullable=True)\n",
-    "pandas_df['out'] = pandas_df.apply(lambda row: pandas_add(row['x'], row['y']), axis=1)\n",
-    "pandas_df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "employed-offer",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def gpu_add(x, y, out):\n",
-    "    for i, (xi, yi) in enumerate(zip(x, y)):\n",
-    "        if xi < 2:\n",
-    "            out[i] = xi + yi\n",
-    "        else:\n",
-    "            out[i] = xi"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "animal-gateway",
-   "metadata": {},
-   "source": [
-    "Problem: The null mask of `y` needs to only be considered if `x > 0`. But it"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "occupied-upgrade",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>x</th>\n",
-       "      <th>y</th>\n",
-       "      <th>out</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>2</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>3</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "      x     y   out\n",
-       "0     1     1   2.0\n",
-       "1  <NA>     2  <NA>\n",
-       "2     3  <NA>  <NA>"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = df.apply_rows(gpu_add,\n",
-    "              incols=['x', 'y'],\n",
-    "              outcols={'out':np.float64},\n",
-    "              kwargs={})\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "western-relief",
-   "metadata": {},
-   "source": [
-    "We also don't support comparing `cudf.NA` in any of our UDFs, in any way."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "metallic-macedonia",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def gpu_add_error(x, y, out):\n",
-    "    for i, (xi, yi) in enumerate(zip(x, y)):\n",
-    "        if xi is pd.NA:\n",
-    "            return 5\n",
-    "        else:\n",
-    "            return xi + yi"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "scenic-graduate",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "TypingError",
-     "evalue": "Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>) found for signature:\n \n >>> gpu_add_error <CUDA device function>(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))\n \nThere are 2 candidate implementations:\n\u001b[1m  - Of which 2 did not match due to:\n  Overload in function 'gpu_add_error <CUDA device function>': File: ../../../../../../ipynb/<ipython-input-6-e11feefd7c2c>: Line 1.\n    With argument(s): '(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))':\u001b[0m\n\u001b[1m   Rejected as the implementation raised a specific error:\n     TypingError: Failed in nopython mode pipeline (step: nopython frontend)\n   \u001b[1m\u001b[1mUnknown attribute 'NA' of type Module(<module 'pandas' from '/home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/pandas/__init__.py'>)\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n   \u001b[0m\n   \u001b[0m\u001b[1mDuring: typing of get attribute at <ipython-input-6-e11feefd7c2c> (3)\u001b[0m\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n\u001b[0m\n  raised from /home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typeinfer.py:1071\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>)\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of call at <string> 
(8)\n\u001b[0m\n\u001b[1m\nFile \"<string>\", line 8:\u001b[0m\n\u001b[1m<source missing, REPL/exec in use?>\u001b[0m\n",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypingError\u001b[0m                               Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-7-ee244a44092a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m               \u001b[0mincols\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'x'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'y'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m               \u001b[0moutcols\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'out'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m               kwargs={})\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/core/dataframe.py\u001b[0m in \u001b[0;36mapply_rows\u001b[0;34m(self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key)\u001b[0m\n\u001b[1;32m   4776\u001b[0m             \u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4777\u001b[0m             \u001b[0mpessimistic_nulls\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4778\u001b[0;31m             \u001b[0mcache_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_key\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4779\u001b[0m         )\n\u001b[1;32m   4780\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/utils/applyutils.py\u001b[0m in \u001b[0;36mapply_rows\u001b[0;34m(df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key)\u001b[0m\n\u001b[1;32m     83\u001b[0m         \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mincols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutcols\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpessimistic_nulls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_key\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     84\u001b[0m     )\n\u001b[0;32m---> 85\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mapplyrows\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/utils/applyutils.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, df, **launch_params)\u001b[0m\n\u001b[1;32m    168\u001b[0m         \u001b[0mbound\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    169\u001b[0m         \u001b[0;31m# Launch kernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 170\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlaunch_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mlaunch_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    171\u001b[0m         \u001b[0;31m# Prepare pessimistic nullmask\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    172\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpessimistic_nulls\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/repos/cudf/python/cudf/cudf/utils/applyutils.py\u001b[0m in \u001b[0;36mlaunch_kernel\u001b[0;34m(self, df, args)\u001b[0m\n\u001b[1;32m    195\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    196\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mlaunch_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    198\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    199\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m    370\u001b[0m             \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    371\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 372\u001b[0;31m             \u001b[0mkernel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspecialize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    373\u001b[0m         \u001b[0mblockdim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compute_thread_per_block\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkernel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    374\u001b[0m         \u001b[0mgriddim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mntasks\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mblockdim\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0mblockdim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mspecialize\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m    897\u001b[0m         \u001b[0mtargetoptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'link'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    898\u001b[0m         specialization = Dispatcher(self.py_func, [types.void(*argtypes)],\n\u001b[0;32m--> 899\u001b[0;31m                                     self._bind, targetoptions)\n\u001b[0m\u001b[1;32m    900\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspecializations\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margtypes\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspecialization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    901\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mspecialization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, func, sigs, bind, targetoptions)\u001b[0m\n\u001b[1;32m    809\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msigs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    810\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Only one signature supported at present\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 811\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msigs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    812\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_compile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    813\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mcompile\u001b[0;34m(self, sig)\u001b[0m\n\u001b[1;32m    947\u001b[0m             kernel = compile_kernel(self.py_func, argtypes,\n\u001b[1;32m    948\u001b[0m                                     \u001b[0mlink\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlink\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 949\u001b[0;31m                                     **self.targetoptions)\n\u001b[0m\u001b[1;32m    950\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdefinitions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkernel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    951\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_bind\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mcompile_kernel\u001b[0;34m(pyfunc, args, link, debug, inline, fastmath, extensions, max_registers, opt)\u001b[0m\n\u001b[1;32m     55\u001b[0m def compile_kernel(pyfunc, args, link, debug=False, inline=False,\n\u001b[1;32m     56\u001b[0m                    fastmath=False, extensions=[], max_registers=None, opt=True):\n\u001b[0;32m---> 57\u001b[0;31m     \u001b[0mcres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompile_cuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpyfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvoid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdebug\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minline\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     58\u001b[0m     \u001b[0mfname\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfndesc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mllvm_func_name\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     59\u001b[0m     lib, kernel = cres.target_context.prepare_cuda_kernel(cres.library, fname,\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/cuda/compiler.py\u001b[0m in \u001b[0;36mcompile_cuda\u001b[0;34m(pyfunc, return_type, args, debug, inline)\u001b[0m\n\u001b[1;32m     44\u001b[0m                                   \u001b[0mreturn_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_type\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     45\u001b[0m                                   \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m                                   locals={})\n\u001b[0m\u001b[1;32m     47\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     48\u001b[0m     \u001b[0mlibrary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcres\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlibrary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36mcompile_extra\u001b[0;34m(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)\u001b[0m\n\u001b[1;32m    600\u001b[0m     pipeline = pipeline_class(typingctx, targetctx, library,\n\u001b[1;32m    601\u001b[0m                               args, return_type, flags, locals)\n\u001b[0;32m--> 602\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompile_extra\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    603\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    604\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36mcompile_extra\u001b[0;34m(self, func)\u001b[0m\n\u001b[1;32m    350\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlifted\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    351\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlifted_from\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 352\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compile_bytecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    353\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    354\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mcompile_ir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc_ir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlifted\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlifted_from\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36m_compile_bytecode\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    412\u001b[0m         \"\"\"\n\u001b[1;32m    413\u001b[0m         \u001b[0;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunc_ir\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 414\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compile_core\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    415\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    416\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_compile_ir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36m_compile_core\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    392\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfail_reason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    393\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mis_final_pipeline\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 394\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    395\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    396\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mCompilerError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"All available pipelines exhausted\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler.py\u001b[0m in \u001b[0;36m_compile_core\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    383\u001b[0m             \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    384\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 385\u001b[0;31m                 \u001b[0mpm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    386\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcr\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    387\u001b[0m                     \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m    337\u001b[0m                     \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpipeline_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpass_desc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    338\u001b[0m                 \u001b[0mpatched_exception\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_patch_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mpatched_exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    341\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mdependency_analysis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m    328\u001b[0m                 \u001b[0mpass_inst\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_pass_registry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpass_inst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    329\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpass_inst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCompilerPass\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_runPass\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpass_inst\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    331\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    332\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mBaseException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Legacy pass in use\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     30\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m             \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     33\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36m_runPass\u001b[0;34m(self, index, pss, internal_state)\u001b[0m\n\u001b[1;32m    287\u001b[0m             \u001b[0mmutated\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_initialization\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    288\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mSimpleTimer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpass_time\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 289\u001b[0;31m             \u001b[0mmutated\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_pass\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    290\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mSimpleTimer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfinalize_time\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    291\u001b[0m             \u001b[0mmutated\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_finalizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/compiler_machinery.py\u001b[0m in \u001b[0;36mcheck\u001b[0;34m(func, compiler_state)\u001b[0m\n\u001b[1;32m    260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    261\u001b[0m         \u001b[0;32mdef\u001b[0m \u001b[0mcheck\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcompiler_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 262\u001b[0;31m             \u001b[0mmangled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompiler_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    263\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mmangled\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    264\u001b[0m                 msg = (\"CompilerPass implementations should return True/False. \"\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typed_passes.py\u001b[0m in \u001b[0;36mrun_pass\u001b[0;34m(self, state)\u001b[0m\n\u001b[1;32m     98\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreturn_type\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     99\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlocals\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 100\u001b[0;31m                 raise_errors=self._raise_errors)\n\u001b[0m\u001b[1;32m    101\u001b[0m             \u001b[0mstate\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtypemap\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtypemap\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    102\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_errors\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typed_passes.py\u001b[0m in \u001b[0;36mtype_inference_stage\u001b[0;34m(typingctx, interp, args, return_type, locals, raise_errors)\u001b[0m\n\u001b[1;32m     70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     71\u001b[0m         \u001b[0minfer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_constraint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 72\u001b[0;31m         \u001b[0minfer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpropagate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     73\u001b[0m         \u001b[0mtypemap\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrestype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcalltypes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minfer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     74\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m~/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typeinfer.py\u001b[0m in \u001b[0;36mpropagate\u001b[0;34m(self, raise_errors)\u001b[0m\n\u001b[1;32m   1069\u001b[0m                                   if isinstance(e, ForceLiteralArg)]\n\u001b[1;32m   1070\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mforce_lit_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1071\u001b[0;31m                     \u001b[0;32mraise\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1072\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1073\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moperator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mor_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_lit_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mTypingError\u001b[0m: Failed in nopython mode pipeline (step: nopython frontend)\n\u001b[1m\u001b[1m\u001b[1mNo implementation of function Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>) found for signature:\n \n >>> gpu_add_error <CUDA device function>(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))\n \nThere are 2 candidate implementations:\n\u001b[1m  - Of which 2 did not match due to:\n  Overload in function 'gpu_add_error <CUDA device function>': File: ../../../../../../ipynb/<ipython-input-6-e11feefd7c2c>: Line 1.\n    With argument(s): '(array(int64, 1d, A), array(int64, 1d, A), array(float64, 1d, A))':\u001b[0m\n\u001b[1m   Rejected as the implementation raised a specific error:\n     TypingError: Failed in nopython mode pipeline (step: nopython frontend)\n   \u001b[1m\u001b[1mUnknown attribute 'NA' of type Module(<module 'pandas' from '/home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/pandas/__init__.py'>)\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n   \u001b[0m\n   \u001b[0m\u001b[1mDuring: typing of get attribute at <ipython-input-6-e11feefd7c2c> (3)\u001b[0m\n   \u001b[1m\n   File \"<ipython-input-6-e11feefd7c2c>\", line 3:\u001b[0m\n   \u001b[1mdef gpu_add_error(x, y, out):\n       <source elided>\n       for i, (xi, yi) in enumerate(zip(x, y)):\n   \u001b[1m        if xi is pd.NA:\n   \u001b[0m        \u001b[1m^\u001b[0m\u001b[0m\n\u001b[0m\n  raised from /home/nfs/brmiller/anaconda3/envs/cudf_dev/lib/python3.7/site-packages/numba/core/typeinfer.py:1071\n\u001b[0m\n\u001b[0m\u001b[1mDuring: resolving callee type: Function(<numba.cuda.compiler.DeviceFunctionTemplate object at 0x7f7cb83dca90>)\u001b[0m\n\u001b[0m\u001b[1mDuring: 
typing of call at <string> (8)\n\u001b[0m\n\u001b[1m\nFile \"<string>\", line 8:\u001b[0m\n\u001b[1m<source missing, REPL/exec in use?>\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "df = df.apply_rows(gpu_add_error,\n",
-    "              incols=['x', 'y'],\n",
-    "              outcols={'out':np.float64},\n",
-    "              kwargs={})\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "material-financing",
-   "metadata": {},
-   "source": [
-    "#### Why\n",
-    "This is because nulls are generally handled \"pessimistically\", which roughly means the value of the output mask bit is just set to be a big `or` between all the input columns' bitmasks. This isn't a problem in pandas, because the UDF is applied by looping through the rows and individually passing each value elementwise through the UDF. When it encounters a null, the value that gets passed is `pd.NA`, which behaves the way it needs to for the function to return the correct value for that row. \n",
-    "#### The two things we want to do then are:\n",
-    "- Make the API feel a little more natural\n",
-    "- explicitly be able to handle nulls in a dynamic way"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "informal-wisdom",
-   "metadata": {},
-   "source": [
-    "# Detour: The cuDF UnaryOp Compilation Pipeline"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "mineral-roman",
-   "metadata": {},
-   "source": [
-    "```\n",
-    "Python Function -> Numba -> PTX Code -> libcudf parser -> inlineable function -> Jitify -> Execution\n",
-    "                                                                                    |                                         \n",
-    "                                                             data pointers ---------^\n",
-    "                                                             headers ---------------^\n",
-    "                                                             extra kernel code -----^\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "overall-discretion",
-   "metadata": {},
-   "source": [
-    "The proposed solution to this problem draws heavily on the existing concepts in cuDF's unaryop machinery. This is a situation where the API feels really natural and is quite compatible with pandas, even though ours is named `applymap` and theirs is named `apply` for some reason."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "expired-finder",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "x = cudf.Series([1, None, 3])\n",
-    "\n",
-    "def f(x):\n",
-    "    return x + 1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "attended-mount",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0       2\n",
-       "1    <NA>\n",
-       "2       4\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "x.applymap(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "guided-slave",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0       1\n",
-       "1    <NA>\n",
-       "2       3\n",
-       "dtype: Int64"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "x_pd = x.to_pandas(nullable=True)\n",
-    "x_pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "modern-barrier",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0       2\n",
-       "1    <NA>\n",
-       "2       4\n",
-       "dtype: object"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "x_pd.apply(f)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "general-madagascar",
-   "metadata": {},
-   "source": [
-    "The null handling here is pretty simple - it's always a copy of the original bitmask. But that's not why we're here. Let's pop the hood. From https://github.com/rapidsai/cudf/blob/branch-0.19/python/cudf/cudf/core/column/numerical.py#L721-L726:\n",
-    "\n",
-    "```\n",
-    "def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase:\n",
-    "    if callable(op):\n",
-    "        return libcudf.transform.transform(operand, op)\n",
-    "\n",
-    "    op = libcudf.unary.UnaryOp[op.upper()]\n",
-    "    return libcudf.unary.unary_operation(operand, op)\n",
-    "```\n",
-    "\n",
-    "From here the `transform` cython picks up the callable python function as well as the `Column` to which it is to be applied. Here's some pseudocode for what happens inside it:\n",
-    "\n",
-    "```\n",
-    "def transform(Column input, op):\n",
-    "    signature = get_signature(input)\n",
-    "    compiled_op = cudautils.compile_udf(op, signature)\n",
-    "    c_str = compiled_op[0].encode('UTF-8')\n",
-    "\n",
-    "    c_output = move(\n",
-    "                 libcudf_transform(\n",
-    "                    input,\n",
-    "                    c_str\n",
-    "                 )\n",
-    "\n",
-    "    )\n",
-    "\n",
-    "    return Column.from_unique_ptr(move(c_output))\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "explicit-lebanon",
-   "metadata": {},
-   "source": [
-    "What we have so far then is:\n",
-    "1. Our input column\n",
-    "2. A PTX function compiled by Numba based off the python function, and the type of the argument being passed\n",
-    "\n",
-    "`cudautils.compile_udf` calls out to Numba to transform the pure python function into PTX code through LLVM IR and a series of compilation steps. The pipeline inside Numba goes something like:\n",
-    "\n",
-    "```\n",
-    "Python function -> python bytecode -> type inference -> lowering -> LLVM IR -> PTX code\n",
-    "```\n",
-    "\n",
-    "#### Python bytecode: These are instructions for the python interpreter"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "fifteen-device",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "  4           0 LOAD_FAST                0 (x)\n",
-      "              2 LOAD_CONST               1 (1)\n",
-      "              4 BINARY_ADD\n",
-      "              6 RETURN_VALUE\n"
-     ]
-    }
-   ],
-   "source": [
-    "import dis\n",
-    "dis.dis(f)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "latter-helping",
-   "metadata": {},
-   "source": [
-    "#### Type inference: Assembly level languages only operate in terms of primitive types. \n",
-    "\n",
-    "Thus to generate LLVM IR, Numba needs to know the types of every variable at every point during the function, from arguments to return values. This is one of the reasons `signature` is a required arg to `compile_udf`. We only get this information at runtime, because the user can pass anything into their UDF. When they apply their UDF to a `Series`, it's only at that point Numba can know that the `x` in `f(x)` is of type `int64` for instance - and only then can it actually complete the type inference portion of the process.\n",
-    "\n",
-    "#### Lowering\n",
-    "Once type inference is complete and Numba knows the types of all the input, output, and intermediate variables, it combines that with the algorithmic information from the python function's bytecode and produces LLVM IR in a process called \"lowering\". LLVM IR is like a platform independent assembly language. One can compile from LLVM IR to assembly code for any platform, including into PTX code for NVIDIA GPUs.\n",
-    "\n",
-    "\n",
-    "#### What does this mean for us?\n",
-    "It means that what we get out of `cudautils.compile_udf` is an actual string containing a PTX function, compiled by Numba for arguments of the type `input.dtype`. It is important to note that this function is a function that operates, like the original function, on a single element. It does NOT contain a kernel. In fact, here's exactly what it is:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "direct-venezuela",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cudf.utils.cudautils import compile_udf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "engaged-coaching",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "//\n",
-      "// Generated by NVIDIA NVVM Compiler\n",
-      "//\n",
-      "// Compiler Build ID: CL-27506705\n",
-      "// Cuda compilation tools, release 10.2, V10.2.89\n",
-      "// Based on LLVM 3.4svn\n",
-      "//\n",
-      "\n",
-      ".version 6.5\n",
-      ".target sm_70\n",
-      ".address_size 64\n",
-      "\n",
-      "\t// .globl\t_ZN8__main__5f$248Ex\n",
-      ".common .global .align 8 .u64 _ZN08NumbaEnv8__main__5f$248Ex;\n",
-      "\n",
-      ".visible .func  (.param .b32 func_retval0) _ZN8__main__5f$248Ex(\n",
-      "\t.param .b64 _ZN8__main__5f$248Ex_param_0,\n",
-      "\t.param .b64 _ZN8__main__5f$248Ex_param_1\n",
-      ")\n",
-      "{\n",
-      "\t.reg .b32 \t%r<2>;\n",
-      "\t.reg .b64 \t%rd<4>;\n",
-      "\n",
-      "\n",
-      "\tld.param.u64 \t%rd1, [_ZN8__main__5f$248Ex_param_0];\n",
-      "\tld.param.u64 \t%rd2, [_ZN8__main__5f$248Ex_param_1];\n",
-      "\tadd.s64 \t%rd3, %rd2, 1;\n",
-      "\tst.u64 \t[%rd1], %rd3;\n",
-      "\tmov.u32 \t%r1, 0;\n",
-      "\tst.param.b32\t[func_retval0+0], %r1;\n",
-      "\tret;\n",
-      "}\n",
-      "\n",
-      "\n",
-      "\u0000\n"
-     ]
-    }
-   ],
-   "source": [
-    "from numba.np import numpy_support\n",
-    "numba_type = numpy_support.from_dtype(np.dtype('int64'))\n",
-    "ptx, _ = compile_udf(f, (numba_type,))\n",
-    "\n",
-    "print(ptx)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "inclusive-endorsement",
-   "metadata": {},
-   "source": [
-    "```\n",
-    "// .globl main\n",
-    "\n",
-    ".visible .func  (.param .b32 return_value) main(\n",
-    "\t.param .b64 param_0,    # TWO input parameters\n",
-    "\t.param .b64 param_1      \n",
-    ")\n",
-    "{\n",
-    "\t.reg .b32 \t%r<2>;      # declare two 32-bit registers, named %r0 and %r1\n",
-    "\t.reg .b64 \t%rd<4>;     # declare 4 64-bit registers named %rd0, %rd1, %rd2, %rd3\n",
-    "\n",
-    "\n",
-    "\tld.param.u64 \t%rd1, [param_0];       # load param_0 into %rd1\n",
-    "\tld.param.u64 \t%rd2, [param_1];       # load param_1 into %rd2\n",
-    "\tadd.s64 \t%rd3, %rd2, 1;             # take the value of %rd2 (i.e. param_1) add 1, place in %rd3\n",
-    "\tst.u64 \t[%rd1], %rd3;                  # store the value of %rd3 into the location pointed to by %rd1\n",
-    "\tmov.u32 \t%r1, 0;                    # move 0 into %r1\n",
-    "\tst.param.b32\t[func_retval0+0], %r1; # place the value of %r1 into the return value\n",
-    "\tret;                                   # return 0\n",
-    "}\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "imperial-quality",
-   "metadata": {},
-   "source": [
-    "# What happens next?\n",
-    "Libcudf takes it from here. Broadly speaking, what happens at this point is libcudf hacks together a string (which consists of several elements) which ends up being handed off to jitify and compiled into a final kernel. Jitify then launches that kernel, taking the pointer to the beginning of the actual data column to be transformed as an argument. The three elements are:\n",
-    "\n",
-    "1. A header\n",
-    "2. An outer \"calling\" kernel that generically calls the PTX function\n",
-    "3. A processed version of the PTX function that inlines it directly into CUDA\n",
-    "\n",
-    "The libcudf parser essentially takes the PTX function as above and turns it into a generically callable inlinable function. The final file that gets passed off to jitify to be compiled looks like this:\n",
-    "\n",
-    "```\n",
-    "\n",
-    "#pragma once\n",
-    "\n",
-    "// Include Jitify's cstddef header first\n",
-    "#include <cstddef>\n",
-    "\n",
-    "#include <cuda/std/climits>\n",
-    "#include <cuda/std/cstddef>\n",
-    "#include <cuda/std/limits>\n",
-    "#include <cudf/types.hpp>\n",
-    "#include <cudf/wrappers/timestamps.hpp>\n",
-    "#include <cudf/utilities/bit.hpp>\n",
-    "\n",
-    "template <typename TypeOut, typename TypeIn>\n",
-    "    __global__\n",
-    "    void kernel(cudf::size_type size,\n",
-    "                    TypeOut* out_data, TypeIn* in_data) {\n",
-    "        int tid = threadIdx.x;\n",
-    "        int blkid = blockIdx.x;\n",
-    "        int blksz = blockDim.x;\n",
-    "        int gridsz = gridDim.x;\n",
-    "\n",
-    "        int start = tid + blkid * blksz;\n",
-    "        int step = blksz * gridsz;\n",
-    "\n",
-    "        for (cudf::size_type i=start; i<size; i+=step) {\n",
-    "          GENERIC_UNARY_OP(&out_data[i], in_data[i]);  \n",
-    "        }\n",
-    "    }\n",
-    "\n",
-    "\n",
-    "__device__ __inline__ void GENERIC_UNARY_OP (\n",
-    "  int64_t* _ZN8__main__5f_241Ex_param_0, \n",
-    "  long int _ZN8__main__5f_241Ex_param_1\n",
-    "){\n",
-    "\n",
-    " asm volatile (\"{\");  asm volatile (\"  .reg .b32 _r<2>;\");\n",
-    "   /**   .reg .b32 \t%r<2>  */\n",
-    "  asm volatile (\"  .reg .b64 _rd<4>;\");\n",
-    "   /**   .reg .b64 \t%rd<4>  */\n",
-    "  asm volatile (\"  mov.u64 _rd1,  %0;\": : \"l\"(_ZN8__main__5f_241Ex_param_0));\n",
-    "   /**   ld.param.u64 \t%rd1, [_ZN8__main__5f$241Ex_param_0]  */\n",
-    "  asm volatile (\"  mov.u64 _rd2,  %0;\": : \"l\"(_ZN8__main__5f_241Ex_param_1));\n",
-    "   /**   ld.param.u64 \t%rd2, [_ZN8__main__5f$241Ex_param_1]  */\n",
-    "  asm volatile (\"  add.s64 _rd3, _rd2, 1;\");\n",
-    "   /**   add.s64 \t%rd3, %rd2, 1  */\n",
-    "  asm volatile (\"  st.u64 [_rd1], _rd3;\");\n",
-    "   /**   st.u64 \t[%rd1], %rd3  */\n",
-    "  asm volatile (\"  mov.u32 _r1, 0;\");\n",
-    "   /**   mov.u32 \t%r1, 0  */\n",
-    "  asm volatile (\" /** *** SNIP. *** */\");\n",
-    "   /**   st.param.b32\t[func_retval0+0], %r1  */\n",
-    "  asm volatile (\"bra RETTGT;\");\n",
-    " asm volatile (\"RETTGT:}\");\n",
-    "\n",
-    "}\n",
-    "\n",
-    "```\n",
-    "\n",
-    "Apart from returning the data back to the user, that's more or less the process. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "musical-membership",
-   "metadata": {},
-   "source": [
-    "# What does all this have to do with NAs?\n",
-    "The pipeline is based on the idea that anything you could want to do with a single value of `x` arithmetically is expressible as a generic PTX function of `x`, along with some type information, that Numba can generate for you. The rest of the machinery is just meant to deliver the data to this function threadwise. We're going to extend this concept to a function of four variables instead of one: a masked binary operation `x + y` where the four arguments are:\n",
-    "\n",
-    "1. `x`\n",
-    "2. `y`\n",
-    "3. `x.mask`\n",
-    "4. `y.mask`\n",
-    "\n",
-    "\n",
-    "We're going to modify the general kernel that calls `GENERIC_UNARY_OP` and generalize it to accept these four arguments and call a `GENERIC_BINARY_OP` instead (with two extra arguments - the mask bools)\n",
-    "\n",
-    "# Creating a Numba extension type\n",
-    "\n",
-    "Remember how Numba produces PTX code from a python function and some type information? We're going to create a new Type in Numba that is built around a Struct:\n",
-    "\n",
-    "```\n",
-    "struct Masked {\n",
-    "    int64_t value;\n",
-    "    bool valid;\n",
-    "}\n",
-    "```\n",
-    "And we're going to add an overload of `add` (`+`, `operator.add`) to Numba's registry of function signatures that correctly handles null semantics. Then we're going to JIT the incoming python function and use a `Masked` type for every argument. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "qualified-communications",
-   "metadata": {},
-   "source": [
-    "#### Tell Numba that a `MaskedType` exists, and not much else"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "boring-orleans",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba.core.extending import types\n",
-    "class MaskedType(types.Type):\n",
-    "    # A corresponding MaskedType for numba\n",
-    "    # numba can only generate LLVM IR for things\n",
-    "    # that it recognizes. This is the most basic\n",
-    "    # thing needed for numba to recognize the type,\n",
-    "    # all it really says is \"there's a type, \n",
-    "    # called MaskedType\". name is for __repr__\n",
-    "    def __init__(self):\n",
-    "        super().__init__(name=\"Masked\")\n",
-    "        \n",
-    "numba_masked = MaskedType()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "annual-regard",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba.core.extending import make_attribute_wrapper\n",
-    "\n",
-    "make_attribute_wrapper(MaskedType, \"value\", \"value\")\n",
-    "make_attribute_wrapper(MaskedType, \"valid\", \"valid\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "persistent-palace",
-   "metadata": {},
-   "source": [
-    "#### Tell Numba what this type looks like. In our case, it's a struct. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "opening-specific",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba.core.extending import register_model, models\n",
-    "\n",
-    "@register_model(MaskedType)\n",
-    "class MaskedModel(models.StructModel):\n",
-    "    def __init__(self, dmm, fe_type):\n",
-    "        members = [(\"value\", types.int64), (\"valid\", types.bool_)]\n",
-    "        models.StructModel.__init__(self, dmm, fe_type, members)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "posted-confirmation",
-   "metadata": {},
-   "source": [
-    "#### Register an overload of `operator.add` with Numba's registry of `CUDA` functions. \n",
-    "This is part of the typing phase. When we pass `f(x, y): return x + y` into Numba and say that `x` and `y` are of type `Masked`, it hits the `x + y` statement and goes looking for an overload of `add` with a signature matching those operands. The lookup ends either with a matching overload or with none at all. This piece of code conditionally emits the signature Numba needs to find when prompted with two arguments of type `Masked`. One can see how such a template might dynamically return different types depending on its arguments. But this roughly says \"when Numba looks for an overload of `add` that takes two `Masked` as arguments, let it know that there is one, and it will return a `Masked`\". "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "subjective-marshall",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba.cuda.cudadecl import registry as cuda_registry\n",
-    "import operator\n",
-    "from numba.core.typing.templates import AbstractTemplate\n",
-    "\n",
-    "\n",
-    "@cuda_registry.register_global(operator.add)\n",
-    "class MaskedScalarAdd(AbstractTemplate):\n",
-    "    # abstracttemplate vs concretetemplate\n",
-    "    def generic(self, args, kws):\n",
-    "        if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):\n",
-    "            return signature(numba_masked, numba_masked, numba_masked)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "compliant-strengthening",
-   "metadata": {},
-   "source": [
-    "#### Implement Masked + Masked\n",
-    "So far, Numba knows:\n",
-    "- There's a `MaskedType`. \n",
-    "- There's an overload of `operator.add` that accepts two `MaskedType` and returns a `MaskedType`\n",
-    "\n",
-    "Now it essentially needs an implementation for that overload of `operator.add`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "treated-pastor",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba.cuda.cudaimpl import lower as cuda_lower\n",
-    "\n",
-    "@cuda_lower(operator.add, MaskedType, MaskedType)\n",
-    "def masked_scalar_add_impl(context, builder, sig, args):\n",
-    "    # get the types from the signature\n",
-    "    masked_type_1, masked_type_2 = sig.args\n",
-    "    masked_return_type = sig.return_type\n",
-    "\n",
-    "    # create LLVM IR structs\n",
-    "    m1 = cgutils.create_struct_proxy(masked_type_1)(\n",
-    "        context, builder, value=args[0]\n",
-    "    )\n",
-    "    m2 = cgutils.create_struct_proxy(masked_type_2)(\n",
-    "        context, builder, value=args[1]\n",
-    "    )\n",
-    "    result = cgutils.create_struct_proxy(masked_return_type)(context, builder)\n",
-    "\n",
-    "    valid = builder.and_(m1.valid, m2.valid)\n",
-    "    result.valid = valid\n",
-    "    with builder.if_then(valid):\n",
-    "        result.value = builder.add(m1.value, m2.value)\n",
-    "\n",
-    "    return result._getvalue()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "clear-assessment",
-   "metadata": {},
-   "source": [
-    "# Testing it Out"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "formed-soccer",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba import cuda\n",
-    "def compile_masked(func):\n",
-    "    signature = (numba_masked, numba_masked)\n",
-    "    ptx, _ = cuda.compile_ptx_for_current_device(func, signature, device=True)\n",
-    "    return ptx"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "essential-shade",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def f(x, y):\n",
-    "    return x + y"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "understanding-firmware",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#ptx = compile_masked(f)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "wound-communications",
-   "metadata": {},
-   "source": [
-    "```\n",
-    "        // .globl       _ZN8__main__6f$2411E6Masked6Masked                                                                                                                                                                                                                   \n",
-    ".common .global .align 8 .u64 _ZN08NumbaEnv8__main__6f$2411E6Masked6Masked;                                                                                                                                                                                                  \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    ".visible .func  (.param .b32 func_retval0) _ZN8__main__6f$2411E6Masked6Masked(                                                                                                                                                                                               \n",
-    "        .param .b64 _ZN8__main__6f$2411E6Masked6Masked_param_0,                                                                                                                                                                                                              \n",
-    "        .param .b64 _ZN8__main__6f$2411E6Masked6Masked_param_1,                                                                                                                                                                                                              \n",
-    "        .param .b32 _ZN8__main__6f$2411E6Masked6Masked_param_2,                                                                                                                                                                                                              \n",
-    "        .param .b64 _ZN8__main__6f$2411E6Masked6Masked_param_3,                                                                                                                                                                                                              \n",
-    "        .param .b32 _ZN8__main__6f$2411E6Masked6Masked_param_4                                                                                                                                                                                                               \n",
-    ")                                                                                                                                                                                                                                                                            \n",
-    "{                                                                                                                                                                                                                                                                            \n",
-    "        .reg .pred      %p<4>;                                                                                                                                                                                                                                               \n",
-    "        .reg .b16       %rs<4>;                                                                                                                                                                                                                                              \n",
-    "        .reg .b32       %r<2>;                                                                                                                                                                                                                                               \n",
-    "        .reg .b64       %rd<6>;                                                                                                                                                                                                                                              \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    "        ld.param.u64    %rd1, [_ZN8__main__6f$2411E6Masked6Masked_param_0];                                                                                                                                                                                                  \n",
-    "        ld.param.u64    %rd2, [_ZN8__main__6f$2411E6Masked6Masked_param_1];                                                                                                                                                                                                  \n",
-    "        ld.param.u64    %rd3, [_ZN8__main__6f$2411E6Masked6Masked_param_3];                                                                                                                                                                                                  \n",
-    "        ld.param.u8     %rs1, [_ZN8__main__6f$2411E6Masked6Masked_param_2];                                                                                                                                                                                                  \n",
-    "        setp.ne.s16     %p1, %rs1, 0;                                                                                                                                                                                                                                        \n",
-    "        ld.param.u8     %rs2, [_ZN8__main__6f$2411E6Masked6Masked_param_4];                                                                                                                                                                                                  \n",
-    "        setp.ne.s16     %p2, %rs2, 0;                                                                                                                                                                                                                                        \n",
-    "        and.pred        %p3, %p1, %p2;                                                                                                                                                                                                                                       \n",
-    "        add.s64         %rd4, %rd3, %rd2;                                                                                                                                                                                                                                    \n",
-    "        selp.b64        %rd5, %rd4, 0, %p3;                                                                                                                                                                                                                                  \n",
-    "        selp.u16        %rs3, 1, 0, %p3;                                                                                                                                                                                                                                     \n",
-    "        st.u64  [%rd1], %rd5;                                                                                                                                                                                                                                                \n",
-    "        st.u8   [%rd1+8], %rs3;                                                                                                                                                                                                                                              \n",
-    "        mov.u32         %r1, 0;                                                                                                                                                                                                                                              \n",
-    "        st.param.b32    [func_retval0+0], %r1;                                                                                            \n",
-    "        ret;                                                                                                                                                                                                                                                                 \n",
-    "}        \n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "atmospheric-database",
-   "metadata": {},
-   "source": [
-    "#### Then, this is the whole file being passed to jitify:\n",
-    "\n",
-    "```cuda\n",
-    "\n",
-    "    #pragma once\n",
-    "\n",
-    "    // Include Jitify's cstddef header first\n",
-    "    #include <cstddef>\n",
-    "\n",
-    "    #include <cuda/std/climits>\n",
-    "    #include <cuda/std/cstddef>\n",
-    "    #include <cuda/std/limits>\n",
-    "    #include <cudf/types.hpp>\n",
-    "    #include <cudf/wrappers/timestamps.hpp>\n",
-    "    #include <cudf/utilities/bit.hpp>\n",
-    "\n",
-    "    struct Masked {\n",
-    "      int64_t value;\n",
-    "      bool valid;\n",
-    "    };\n",
-    "    \n",
-    "   \n",
-    "\n",
-    "    void null_kernel(cudf::size_type size,\n",
-    "                     TypeOut* out_data, \n",
-    "                     TypeLhs* lhs_data,\n",
-    "                     TypeRhs* rhs_data,\n",
-    "                     bool* out_mask,\n",
-    "                     cudf::bitmask_type const* lhs_mask,\n",
-    "                     cudf::size_type lhs_offset,\n",
-    "                     cudf::bitmask_type const* rhs_mask,\n",
-    "                     cudf::size_type rhs_offset\n",
-    "    ) {\n",
-    "        int tid = threadIdx.x;\n",
-    "        int blkid = blockIdx.x;\n",
-    "        int blksz = blockDim.x;\n",
-    "        int gridsz = gridDim.x;\n",
-    "\n",
-    "        int start = tid + blkid * blksz;\n",
-    "        int step = blksz * gridsz;\n",
-    "\n",
-    "        Masked output;\n",
-    "\n",
-    "        char l_valid;\n",
-    "        char r_valid;\n",
-    "\n",
-    "        long int l_data;\n",
-    "        long int r_data;\n",
-    "\n",
-    "        for (cudf::size_type i=start; i<size; i+=step) {\n",
-    "          l_valid = lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true;\n",
-    "          r_valid = rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true;\n",
-    "          l_data = lhs_data[i];\n",
-    "          r_data = rhs_data[i];\n",
-    "\n",
-    "          GENERIC_BINARY_OP(&output.value, lhs_data[i], l_valid, rhs_data[i], r_valid);\n",
-    "\n",
-    "          out_data[i] = output.value;\n",
-    "          out_mask[i] = output.valid;\n",
-    "             \n",
-    "      }   \n",
-    "      \n",
-    "__device__ __inline__ void GENERIC_BINARY_OP(                                                                                                                                                                                                                                \n",
-    "  int64_t* _ZN8__main__6f_2413E6Masked6Masked_param_0,                                                                                                                                                                                                                       \n",
-    "  long int _ZN8__main__6f_2413E6Masked6Masked_param_1,                                                                                                                                                                                                                       \n",
-    "  char _ZN8__main__6f_2413E6Masked6Masked_param_2,                                                                                                                                                                                                                           \n",
-    "  long int _ZN8__main__6f_2413E6Masked6Masked_param_3,                                                                                                                                                                                                                       \n",
-    "  char _ZN8__main__6f_2413E6Masked6Masked_param_4                                                                                                                                                                                                                            \n",
-    "){                                                                                                                                                                                                                                                                           \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    " asm volatile (\"{\");  asm volatile (\"  .reg .pred _p<4>;\");                                                                                                                                                                                                                  \n",
-    "   /**   .reg .pred     %p<4>  */                                                                                                                                                                                                                                            \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    "  asm volatile (\"  .reg .b16 _rs<4>;\");                                                                                                                                                                                                                                      \n",
-    "   /**   .reg .b16      %rs<4>  */                                                                                                                                                                                                                                           \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    "  asm volatile (\"  .reg .b32 _r<2>;\");                                                                                                                                                                                                                                       \n",
-    "   /**   .reg .b32      %r<2>  */                                                                                                                                                                                                                                            \n",
-    "                                                                                                                                                                                                                                                                             \n",
-    "  asm volatile (\"  .reg .b64 _rd<6>;\");                                                                                                                                                                                                                                      \n",
-    "   /**   .reg .b64      %rd<6>  */                                                                                                                                                                                                                                           \n",
-    "\n",
-    "  asm volatile (\"  mov.u64 _rd1,  %0;\": : \"l\"(_ZN8__main__6f_2413E6Masked6Masked_param_0));\n",
-    "   /**   ld.param.u64   %rd1, [_ZN8__main__6f$2413E6Masked6Masked_param_0]  */\n",
-    "\n",
-    "  asm volatile (\"  mov.u64 _rd2,  %0;\": : \"l\"(_ZN8__main__6f_2413E6Masked6Masked_param_1));\n",
-    "   /**   ld.param.u64   %rd2, [_ZN8__main__6f$2413E6Masked6Masked_param_1]  */\n",
-    "\n",
-    "  asm volatile (\"  mov.u64 _rd3,  %0;\": : \"l\"(_ZN8__main__6f_2413E6Masked6Masked_param_3));\n",
-    "   /**   ld.param.u64   %rd3, [_ZN8__main__6f$2413E6Masked6Masked_param_3]  */\n",
-    "\n",
-    "  asm volatile (\"  cvt.u8.u8 _rs1,  %0;\": : \"h\"( static_cast<short>(_ZN8__main__6f_2413E6Masked6Masked_param_2)));\n",
-    "   /**   ld.param.u8    %rs1, [_ZN8__main__6f$2413E6Masked6Masked_param_2]  */\n",
-    "\n",
-    "  asm volatile (\"  setp.ne.s16 _p1, _rs1, 0;\");\n",
-    "   /**   setp.ne.s16    %p1, %rs1, 0  */\n",
-    "\n",
-    "  asm volatile (\"  cvt.u8.u8 _rs2,  %0;\": : \"h\"( static_cast<short>(_ZN8__main__6f_2413E6Masked6Masked_param_4)));\n",
-    "   /**   ld.param.u8    %rs2, [_ZN8__main__6f$2413E6Masked6Masked_param_4]  */\n",
-    "\n",
-    "  asm volatile (\"  setp.ne.s16 _p2, _rs2, 0;\");\n",
-    "   /**   setp.ne.s16    %p2, %rs2, 0  */\n",
-    "\n",
-    "  asm volatile (\"  and.pred _p3, _p1, _p2;\");\n",
-    "   /**   and.pred       %p3, %p1, %p2  */\n",
-    "\n",
-    "  asm volatile (\"  add.s64 _rd4, _rd3, _rd2;\");\n",
-    "   /**   add.s64        %rd4, %rd3, %rd2  */\n",
-    "\n",
-    "  asm volatile (\"  selp.b64 _rd5, _rd4, 0, _p3;\");\n",
-    "   /**   selp.b64       %rd5, %rd4, 0, %p3  */\n",
-    "\n",
-    "  asm volatile (\"  selp.u16 _rs3, 1, 0, _p3;\");\n",
-    "   /**   selp.u16       %rs3, 1, 0, %p3  */\n",
-    "\n",
-    "  asm volatile (\"  st.u64 [_rd1], _rd5;\");\n",
-    "   /**   st.u64         [%rd1], %rd5  */\n",
-    "\n",
-    "  asm volatile (\"  st.u8 [_rd1+8], _rs3;\");\n",
-    "   /**   st.u8  [%rd1+8], %rs3  */\n",
-    "\n",
-    "  asm volatile (\"  mov.u32 _r1, 0;\");\n",
-    "   /**   mov.u32        %r1, 0  */\n",
-    "\n",
-    "  asm volatile (\" /** *** The way we parse the CUDA PTX assumes the function returns the return value through the first function parameter. Thus the `st.param.***` instructions are not processed. *** */\");\n",
-    "   /**   st.param.b32   [func_retval0+0], %r1  */\n",
-    "\n",
-    "  asm volatile (\"bra RETTGT;\");\n",
-    "\n",
-    "\n",
-    " asm volatile (\"RETTGT:}\");}      \n",
-    "          \n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "concrete-pillow",
-   "metadata": {},
-   "source": [
-    "# Test it\n",
-    "Here are some very basic Cython bindings, used only for the purpose of testing this exact functionality:\n",
-    "```\n",
-    "def masked_binary_op(Column A, Column B, op, Column output_column, Column output_mask):\n",
-    "    cdef column_view A_view = A.view()\n",
-    "    cdef column_view B_view = B.view()\n",
-    "\n",
-    "    cdef string c_str\n",
-    "    cdef type_id c_tid\n",
-    "    cdef data_type c_dtype\n",
-    "\n",
-    "    if A.dtype != np.dtype('int64') or B.dtype != np.dtype('int64'):\n",
-    "        raise TypeError('int64 please')\n",
-    "    \n",
-    " \n",
-    "    from cudf.core.udf import compile_udf\n",
-    "    c_str = compile_udf(op).encode('UTF-8')\n",
-    "\n",
-    "    c_tid = <type_id> (\n",
-    "        <underlying_type_t_type_id> np_to_cudf_types[np.dtype('int64')]\n",
-    "    )\n",
-    "    c_dtype = data_type(c_tid)\n",
-    "\n",
-    "    cdef column_view outcol_view = output_column.view()\n",
-    "    cdef column_view outmsk_view = output_mask.view()\n",
-    "\n",
-    "    with nogil:\n",
-    "        c_output = move(libcudf_transform.masked_binary_op(\n",
-    "            A_view,\n",
-    "            B_view,\n",
-    "            c_str,\n",
-    "            c_dtype,\n",
-    "            outcol_view,\n",
-    "            outmsk_view\n",
-    "        ))\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "after-fellow",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cudf._lib.transform import masked_binary_op\n",
-    "\n",
-    "def demo_udf(func, s1, s2):\n",
-    "    col1, col2 = s1._column, s2._column\n",
-    "\n",
-    "    output_column = cudf.core.column.as_column(np.arange(8), dtype='int64')\n",
-    "    output_mask = cudf.core.column.as_column([False] * 8)\n",
-    "\n",
-    "    result_col = masked_binary_op(col1, col2, func, output_column, output_mask)\n",
-    "    return cudf.Series(result_col)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "phantom-square",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0       2\n",
-       "1    <NA>\n",
-       "2    <NA>\n",
-       "3    <NA>\n",
-       "4       6\n",
-       "5    <NA>\n",
-       "6      10\n",
-       "7    <NA>\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "def f(x, y):\n",
-    "    return x + y\n",
-    "\n",
-    "s1 = cudf.Series([1, None, 3,    None, 2, 2,    5, None])\n",
-    "s2 = cudf.Series([1, 2,    None, None, 4, None, 5, None])\n",
-    "\n",
-    "demo_udf(f, s1, s2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "southern-stationery",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0       2\n",
-       "1    <NA>\n",
-       "2    <NA>\n",
-       "3    <NA>\n",
-       "4       6\n",
-       "5    <NA>\n",
-       "6      10\n",
-       "7    <NA>\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 31,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "s1 + s2"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "recovered-premiere",
-   "metadata": {},
-   "source": [
-    "# `cudf.NA`\n",
-    "In general, we want the capability to work with `cudf.NA` inside our functions directly. To do this, we're just going to reapply the same machinery to overload what happens when we add a `MaskedType` to `cudf.NA`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "filled-malta",
-   "metadata": {},
-   "source": [
-    "#### Create an NAType"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "chemical-trick",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from cudf.core.scalar import _NAType\n",
-    "class NAType(types.Type):\n",
-    "    # \"There is a type called NAType\"\n",
-    "    def __init__(self):\n",
-    "        super().__init__(name=\"NA\")\n",
-    "\n",
-    "numba_na = NAType()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "southern-prague",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "numba.core.datamodel.models.OpaqueModel"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from numba.core.extending import typeof_impl\n",
-    "@typeof_impl.register(_NAType)\n",
-    "def typeof_na(val, c):\n",
-    "    # instances of _NAType will be \n",
-    "    # treated as instances of NAType. \n",
-    "    return numba_na\n",
-    "\n",
-    "\n",
-    "\n",
-    "register_model(NAType)(models.OpaqueModel)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "accessible-monthly",
-   "metadata": {},
-   "source": [
-    "#### `operator.add` typing for Masked <-> NA"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "harmful-glossary",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "@cuda_registry.register_global(operator.add)\n",
-    "class MaskedScalarAddNull(AbstractTemplate):\n",
-    "    def generic(self, args, kws):\n",
-    "        if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):\n",
-    "            return signature(numba_masked, numba_masked, numba_na)\n",
-    "  "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "brown-cheese",
-   "metadata": {},
-   "source": [
-    "#### Lowering: AKA what to actually do when this is requested\n",
-    "This says that when `+` is invoked between a `MaskedType` and an `NAType`, we make a new `MaskedType`, set its validity to zero, and return it."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "encouraging-reynolds",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from numba.cuda.cudaimpl import registry as cuda_lowering_registry\n",
-    "\n",
-    "@cuda_lower(operator.add, MaskedType, NAType)\n",
-    "def masked_scalar_add_na_impl(context, builder, sig, args):\n",
-    "#    return_type = sig.return_type\n",
-    "    # use context to get llvm type for a bool\n",
-    "    result = cgutils.create_struct_proxy(numba_masked)(context, builder)\n",
-    "    result.valid = context.get_constant(types.boolean, 0)\n",
-    "    return result._getvalue()\n",
-    "\n",
-    "\n",
-    "@cuda_lowering_registry.lower_constant(NAType)\n",
-    "def constant_dummy(context, builder, ty, pyval):\n",
-    "    # This handles None, etc.\n",
-    "    return context.get_dummy_value()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "productive-rings",
-   "metadata": {},
-   "source": [
-    "# Constants\n",
-    "At this point the pattern is familiar. Register an overload that emits a signature if the operands match a `MaskedType` and a constant. The lowering is logically fairly simple. If the `MaskedType` is null, the answer is null; otherwise the answer is a new `MaskedType` whose `value` is the sum of the input's `value` and the constant."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "short-makeup",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llvmlite import ir\n",
-    "\n",
-    "@cuda_registry.register_global(operator.add)\n",
-    "class MaskedScalarAddConstant(AbstractTemplate):\n",
-    "    def generic(self, args, kws):\n",
-    "        if isinstance(args[0], MaskedType) and isinstance(args[1], types.Integer):\n",
-    "            return signature(numba_masked, numba_masked, types.int64)\n",
-    "\n",
-    "@cuda_lower(operator.add, MaskedType, types.Integer)\n",
-    "def masked_scalar_add_constant_impl(context, builder, sig, input_values):\n",
-    "    masked_type, const_type = sig.args\n",
-    "\n",
-    "    indata = cgutils.create_struct_proxy(masked_type)(context, builder, value=input_values[0])\n",
-    "    result = cgutils.create_struct_proxy(numba_masked)(context, builder)\n",
-    "    #to_add_const = context.get_constant(const_type, input_values[1])\n",
-    "\n",
-    "    result.valid = context.get_constant(types.boolean, 0)\n",
-    "    with builder.if_then(indata.valid):\n",
-    "        result.value = builder.add(indata.value, input_values[1])\n",
-    "        result.valid = context.get_constant(types.boolean, 1)\n",
-    "\n",
-    "    return result._getvalue()\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "entitled-wealth",
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "\n",
-    "def f(x, y):\n",
-    "    return x + y + cudf.NA\n",
-    "\n",
-    "s1 = cudf.Series([1, None, 3,    None, 2, 2,    5, None])\n",
-    "s2 = cudf.Series([1, 2,    None, None, 4, None, 5, None])\n",
-    "\n",
-    "result = demo_udf(f, s1, s2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "genuine-davis",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0    <NA>\n",
-       "1    <NA>\n",
-       "2    <NA>\n",
-       "3    <NA>\n",
-       "4    <NA>\n",
-       "5    <NA>\n",
-       "6    <NA>\n",
-       "7    <NA>\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "result"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "polyphonic-second",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def f(x, y):\n",
-    "    return x + y + 1\n",
-    "\n",
-    "s1 = cudf.Series([1, None, 3,    None, 2, 2,    5, None])\n",
-    "s2 = cudf.Series([1, 2,    None, None, 4, None, 5, None])\n",
-    "\n",
-    "result = demo_udf(f, s1, s2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "id": "sporting-campbell",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0       3\n",
-       "1    <NA>\n",
-       "2    <NA>\n",
-       "3    <NA>\n",
-       "4       7\n",
-       "5    <NA>\n",
-       "6      11\n",
-       "7    <NA>\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "result"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "focal-castle",
-   "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "romance-complaint",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "speaking-aquatic",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

From 62ddca757cb21949db31b2f9436f9d1ce1fa907f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 13:13:49 -0700
Subject: [PATCH 093/132] cleanup

---
 python/cudf/cudf/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 86abfffa697..b4440805618 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1501,7 +1501,7 @@ def rank(
         pct : bool, default False
             Whether or not to display the returned rankings in percentile
             form.
-f
+
         Returns
         -------
         same type as caller

From 821d11d9f58e9564c38762cf7a3dcfe2e653aaf6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 1 Jun 2021 13:15:52 -0700
Subject: [PATCH 094/132] cpp style fixes

---
 cpp/include/cudf/transform.hpp             |  2 -
 cpp/src/transform/jit/masked_udf_kernel.cu | 82 ++++++++++------------
 cpp/src/transform/transform.cpp            | 79 +++++++++------------
 3 files changed, 71 insertions(+), 92 deletions(-)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 35ba7fd53f6..61718254849 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -53,14 +53,12 @@ std::unique_ptr<column> transform(
   bool is_ptx,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-
 std::unique_ptr<column> generalized_masked_op(
   table_view data_view,
   std::string const& binary_udf,
   data_type output_type,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
  * preserving existing null values and also returns new null_count.
diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index d384ffc61b1..9d62093e4d2 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -28,11 +28,11 @@
 #include <transform/jit/operation-udf.hpp>
 
 #include <cudf/types.hpp>
-#include <cudf/wrappers/timestamps.hpp>
 #include <cudf/utilities/bit.hpp>
+#include <cudf/wrappers/timestamps.hpp>
 
-#include <tuple>
 #include <cuda/std/tuple>
+#include <tuple>
 
 namespace cudf {
 namespace transformation {
@@ -45,58 +45,48 @@ struct Masked {
 };
 
 template <typename TypeIn, typename MaskType, typename OffsetType>
-__device__ auto make_args(cudf::size_type id,
-                          TypeIn in_ptr,
-                          MaskType in_mask,
-                          OffsetType in_offset) 
+__device__ auto make_args(cudf::size_type id, TypeIn in_ptr, MaskType in_mask, OffsetType in_offset)
 {
-    bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
-    return cuda::std::make_tuple(in_ptr[id], valid);
+  bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
+  return cuda::std::make_tuple(in_ptr[id], valid);
 }
 
-template <typename InType, typename MaskType, typename OffsetType, typename ... Arguments>
-__device__ auto make_args(cudf::size_type id, 
-                          InType in_ptr, 
-                          MaskType in_mask,     // in practice, always cudf::bitmask_type const* 
+template <typename InType, typename MaskType, typename OffsetType, typename... Arguments>
+__device__ auto make_args(cudf::size_type id,
+                          InType in_ptr,
+                          MaskType in_mask,      // in practice, always cudf::bitmask_type const*
                           OffsetType in_offset,  // in practice, always cudf::size_type
-                          Arguments ... args) {
-
-    bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
-    return cuda::std::tuple_cat(
-        cuda::std::make_tuple(in_ptr[id], valid),
-        make_args(id, args...)
-    );
+                          Arguments... args)
+{
+  bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true;
+  return cuda::std::tuple_cat(cuda::std::make_tuple(in_ptr[id], valid), make_args(id, args...));
 }
 
-template <typename TypeOut, typename ... Arguments>
-__global__
-void generic_udf_kernel(cudf::size_type size, 
-                        TypeOut* out_data, 
-                        bool* out_mask, 
-                        Arguments ... args)
-{   
-
-    int tid = threadIdx.x;
-    int blkid = blockIdx.x;
-    int blksz = blockDim.x;
-    int gridsz = gridDim.x;
-    int start = tid + blkid * blksz;
-    int step = blksz * gridsz;
-
-    Masked<TypeOut> output;
-    for (cudf::size_type i=start; i<size; i+=step) {
-      auto func_args = cuda::std::tuple_cat(
-          cuda::std::make_tuple(&output.value),
-          make_args(i, args...) // passed int64*, bool*, int64, int64*, bool*, int64
-      );
-      cuda::std::apply(GENERIC_OP, func_args);
-      out_data[i] = output.value;
-      out_mask[i] = output.valid;
-    }
-
+template <typename TypeOut, typename... Arguments>
+__global__ void generic_udf_kernel(cudf::size_type size,
+                                   TypeOut* out_data,
+                                   bool* out_mask,
+                                   Arguments... args)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+  int start  = tid + blkid * blksz;
+  int step   = blksz * gridsz;
+
+  Masked<TypeOut> output;
+  for (cudf::size_type i = start; i < size; i += step) {
+    auto func_args = cuda::std::tuple_cat(
+      cuda::std::make_tuple(&output.value),
+      make_args(i, args...)  // passed int64*, bool*, int64, int64*, bool*, int64
+    );
+    cuda::std::apply(GENERIC_OP, func_args);
+    out_data[i] = output.value;
+    out_mask[i] = output.valid;
+  }
 }
 
-
 }  // namespace jit
 }  // namespace transformation
 }  // namespace cudf
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 03d8fa11977..a83a0051ce1 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -17,7 +17,6 @@
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
 #include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>
 
-
 #include <jit/cache.hpp>
 #include <jit/parser.hpp>
 #include <jit/type.hpp>
@@ -27,9 +26,9 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/null_mask.hpp>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
-#include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -68,47 +67,42 @@ void unary_operation(mutable_column_view output,
 
 std::vector<std::string> make_template_types(column_view outcol_view, table_view data_view)
 {
-  std::string mskptr_type = cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::bitmask_type>())) + "*";
-  std::string offset_type = cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
+  std::string mskptr_type =
+    cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::bitmask_type>())) + "*";
+  std::string offset_type =
+    cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
 
   std::vector<std::string> template_types(1);
   template_types[0] = cudf::jit::get_type_name(outcol_view.type());
 
-  for (auto const& col: data_view) {
-    auto these_types = {
-       cudf::jit::get_type_name(col.type()) + "*",
-       mskptr_type,
-       offset_type
-    };
+  for (auto const& col : data_view) {
+    auto these_types = {cudf::jit::get_type_name(col.type()) + "*", mskptr_type, offset_type};
 
     template_types.insert(template_types.end(), these_types);
-    
   }
   return template_types;
 }
 
-
 void generalized_operation(table_view data_view,
-                           std::string const& binary_udf, 
-                           data_type output_type, 
+                           std::string const& binary_udf,
+                           data_type output_type,
                            column_view const& outcol_view,
                            column_view const& outmsk_view,
                            rmm::mr::device_memory_resource* mr)
 {
-
   std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
 
-  std::string generic_kernel_name = 
-  jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
-    .instantiate(template_types);
+  std::string generic_kernel_name =
+    jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
+      .instantiate(template_types);
 
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
-                     binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
+    binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
 
   // {size, out_ptr, out_mask_ptr, col0_ptr, col0_mask_ptr, col0_offset, col1_ptr...}
   std::vector<void*> kernel_args((data_view.num_columns() * 3) + 3);
 
-  cudf::size_type size = outcol_view.size();
+  cudf::size_type size   = outcol_view.size();
   const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view);
   const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view);
   kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr});
@@ -123,19 +117,20 @@ void generalized_operation(table_view data_view,
 
     data_ptrs[col_idx] = cudf::jit::get_data_ptr(col);
     mask_ptrs[col_idx] = col.null_mask();
-    offsets[col_idx] = col.offset();
-
-    kernel_args.insert(kernel_args.begin() + 3 * (col_idx + 1), {&data_ptrs[col_idx], &mask_ptrs[col_idx], &offsets[col_idx]});
+    offsets[col_idx]   = col.offset();
 
+    kernel_args.insert(kernel_args.begin() + 3 * (col_idx + 1),
+                       {&data_ptrs[col_idx], &mask_ptrs[col_idx], &offsets[col_idx]});
   }
 
   rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
-    .get_kernel(
-      generic_kernel_name, {}, {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())                                   //
+    .get_kernel(generic_kernel_name,
+                {},
+                {{"transform/jit/operation-udf.hpp", generic_cuda_source}},
+                {"-arch=sm_."})                                    //
+    ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())  //
     ->launch(kernel_args.data());
-
 }
 
 }  // namespace jit
@@ -164,31 +159,27 @@ std::unique_ptr<column> transform(column_view const& input,
   return output;
 }
 
-std::unique_ptr<column> generalized_masked_op(table_view data_view, 
-                                               std::string const& udf, 
-                                               data_type output_type, 
-                                               rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> generalized_masked_op(table_view data_view,
+                                              std::string const& udf,
+                                              data_type output_type,
+                                              rmm::mr::device_memory_resource* mr)
 {
-  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
-  std::unique_ptr<column> output = make_fixed_width_column(
-    output_type, data_view.num_rows()
-  );
-  std::unique_ptr<column> output_mask = make_fixed_width_column(
-    cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows()
-  );
+  rmm::cuda_stream_view stream   = rmm::cuda_stream_default;
+  std::unique_ptr<column> output = make_fixed_width_column(output_type, data_view.num_rows());
+  std::unique_ptr<column> output_mask =
+    make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows());
 
-  mutable_column_view output_view = *output;
+  mutable_column_view output_view      = *output;
   mutable_column_view output_mask_view = *output_mask;
 
-  transformation::jit::generalized_operation(data_view, udf, output_type, output_view, output_mask_view, mr);
+  transformation::jit::generalized_operation(
+    data_view, udf, output_type, output_view, output_mask_view, mr);
 
   auto final_output_mask = cudf::bools_to_mask(output_mask_view);
   output.get()->set_null_mask(std::move(*(final_output_mask.first)));
   return output;
 }
 
-
-
 }  // namespace detail
 
 std::unique_ptr<column> transform(column_view const& input,
@@ -202,8 +193,8 @@ std::unique_ptr<column> transform(column_view const& input,
 }
 
 std::unique_ptr<column> generalized_masked_op(table_view data_view,
-                                              std::string const& binary_udf, 
-                                              data_type output_type, 
+                                              std::string const& binary_udf,
+                                              data_type output_type,
                                               rmm::mr::device_memory_resource* mr)
 {
   return detail::generalized_masked_op(data_view, binary_udf, output_type, mr);

From fb8f1cf2ef277765f709ad7ffcd23c9c4417039b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 7 Jun 2021 13:57:25 -0700
Subject: [PATCH 095/132] cache ptx

---
 python/cudf/cudf/core/frame.py        |  3 ++-
 python/cudf/cudf/core/udf/pipeline.py | 10 +++++-----
 python/cudf/cudf/utils/cudautils.py   |  7 +++++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index b4440805618..81e861e31b9 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1454,11 +1454,12 @@ def _quantiles(
         result._copy_type_metadata(self)
         return result
 
+    @annotate("APPLY", color="purple", domain="cudf_python")
     def _apply(self, func):
         '''
         Apply `func` across the rows of the frame. 
         '''
-        output_dtype, ptx = cudf.core.udf.pipeline.compile_udf(func, self.dtypes)
+        output_dtype, ptx = cudf.core.udf.pipeline.compile_masked_udf(func, self.dtypes)
         result = cudf._lib.transform.masked_udf(self, ptx, output_dtype)
         return result
 
diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py
index ab80400eb29..bb49182fd94 100644
--- a/python/cudf/cudf/core/udf/pipeline.py
+++ b/python/cudf/cudf/core/udf/pipeline.py
@@ -1,9 +1,11 @@
 from cudf.core.udf.typing import MaskedType
 from numba.np import numpy_support
 from numba import cuda
+from cudf.utils import cudautils
+from nvtx import annotate
 
-
-def compile_udf(func, dtypes):
+@annotate("NUMBA JIT", color="green", domain="cudf_python")
+def compile_masked_udf(func, dtypes):
     '''
     Generate an inlineable PTX function that will be injected into
     a variadic kernel inside libcudf
@@ -16,9 +18,7 @@ def compile_udf(func, dtypes):
         for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)
     )
     # Get the inlineable PTX function
-    ptx, numba_output_type = cuda.compile_ptx_for_current_device(
-        func, to_compiler_sig, device=True
-    )
+    ptx, numba_output_type = cudautils.compile_udf(func, to_compiler_sig)
     numpy_output_type = numpy_support.as_dtype(
         numba_output_type.value_type
     )
diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py
index 262fe304dd8..312fbc425dd 100755
--- a/python/cudf/cudf/utils/cudautils.py
+++ b/python/cudf/cudf/utils/cudautils.py
@@ -262,10 +262,13 @@ def compile_udf(udf, type_signature):
     ptx_code, return_type = cuda.compile_ptx_for_current_device(
         udf, type_signature, device=True
     )
-    output_type = numpy_support.as_dtype(return_type)
+    if not isinstance(return_type, cudf.core.udf.typing.MaskedType):
+        output_type = numpy_support.as_dtype(return_type).type
+    else:
+        output_type = return_type
 
     # Populate the cache for this function
-    res = (ptx_code, output_type.type)
+    res = (ptx_code, output_type)
     _udf_code_cache[key] = res
 
     return res

From 5d77b2ba63072c09815bd11ba341ff4a2c4fb75f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 7 Jun 2021 15:25:43 -0700
Subject: [PATCH 096/132] partially address reviews

---
 cpp/src/transform/transform.cpp | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index a83a0051ce1..f9707de183f 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -72,12 +72,12 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
   std::string offset_type =
     cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
 
-  std::vector<std::string> template_types(1);
-  template_types[0] = cudf::jit::get_type_name(outcol_view.type());
+  std::vector<std::string> template_types;
+  template_types.reserve(data_view.num_columns() + 1);
 
+  template_types.push_back(cudf::jit::get_type_name(outcol_view.type()));
   for (auto const& col : data_view) {
     auto these_types = {cudf::jit::get_type_name(col.type()) + "*", mskptr_type, offset_type};
-
     template_types.insert(template_types.end(), these_types);
   }
   return template_types;
@@ -86,8 +86,8 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
 void generalized_operation(table_view data_view,
                            std::string const& binary_udf,
                            data_type output_type,
-                           column_view const& outcol_view,
-                           column_view const& outmsk_view,
+                           mutable_column_view outcol_view,
+                           mutable_column_view outmsk_view,
                            rmm::mr::device_memory_resource* mr)
 {
   std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
@@ -100,16 +100,21 @@ void generalized_operation(table_view data_view,
     binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
 
   // {size, out_ptr, out_mask_ptr, col0_ptr, col0_mask_ptr, col0_offset, col1_ptr...}
-  std::vector<void*> kernel_args((data_view.num_columns() * 3) + 3);
+  std::vector<void*> kernel_args;
+  kernel_args.reserve((data_view.num_columns() * 3) + 3);
 
   cudf::size_type size   = outcol_view.size();
   const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view);
   const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view);
   kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr});
 
-  std::vector<const void*> data_ptrs(data_view.num_columns());
-  std::vector<cudf::bitmask_type const*> mask_ptrs(data_view.num_columns());
-  std::vector<cudf::offset_type> offsets(data_view.num_columns());
+  std::vector<const void*> data_ptrs;
+  std::vector<cudf::bitmask_type const*> mask_ptrs;
+  std::vector<cudf::offset_type> offsets;
+
+  data_ptrs.reserve(data_view.num_columns());
+  mask_ptrs.reserve(data_view.num_columns());
+  offsets.reserve(data_view.num_columns());
 
   column_view col;
   for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
@@ -169,13 +174,10 @@ std::unique_ptr<column> generalized_masked_op(table_view data_view,
   std::unique_ptr<column> output_mask =
     make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows());
 
-  mutable_column_view output_view      = *output;
-  mutable_column_view output_mask_view = *output_mask;
-
   transformation::jit::generalized_operation(
-    data_view, udf, output_type, output_view, output_mask_view, mr);
+    data_view, udf, output_type, *output, *output_mask, mr);
 
-  auto final_output_mask = cudf::bools_to_mask(output_mask_view);
+  auto final_output_mask = cudf::bools_to_mask(*output_mask);
   output.get()->set_null_mask(std::move(*(final_output_mask.first)));
   return output;
 }

From f863ba1a96428f3dc245ea4e2065f337ea3dcd9e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 10 Jun 2021 14:04:42 -0700
Subject: [PATCH 097/132] switch to push_back

---
 cpp/src/transform/transform.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index f9707de183f..6512a5aabe9 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -77,8 +77,9 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
 
   template_types.push_back(cudf::jit::get_type_name(outcol_view.type()));
   for (auto const& col : data_view) {
-    auto these_types = {cudf::jit::get_type_name(col.type()) + "*", mskptr_type, offset_type};
-    template_types.insert(template_types.end(), these_types);
+    template_types.push_back(cudf::jit::get_type_name(col.type()) + "*");
+    template_types.push_back(mskptr_type);
+    template_types.push_back(offset_type);
   }
   return template_types;
 }
@@ -120,9 +121,9 @@ void generalized_operation(table_view data_view,
   for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
     col = data_view.column(col_idx);
 
-    data_ptrs[col_idx] = cudf::jit::get_data_ptr(col);
-    mask_ptrs[col_idx] = col.null_mask();
-    offsets[col_idx]   = col.offset();
+    data_ptrs.push_back(cudf::jit::get_data_ptr(col));
+    mask_ptrs.push_back(col.null_mask());
+    offsets.push_back(col.offset());
 
     kernel_args.insert(kernel_args.begin() + 3 * (col_idx + 1),
                        {&data_ptrs[col_idx], &mask_ptrs[col_idx], &offsets[col_idx]});

From 92cd6eb1117aa220b7e1f6271d3b2083ece17ff7 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 10 Jun 2021 14:49:23 -0700
Subject: [PATCH 098/132] more pushing back

---
 cpp/src/transform/transform.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 6512a5aabe9..5ed5461662f 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -125,8 +125,10 @@ void generalized_operation(table_view data_view,
     mask_ptrs.push_back(col.null_mask());
     offsets.push_back(col.offset());
 
-    kernel_args.insert(kernel_args.begin() + 3 * (col_idx + 1),
-                       {&data_ptrs[col_idx], &mask_ptrs[col_idx], &offsets[col_idx]});
+    kernel_args.push_back(&data_ptrs[col_idx]);
+    kernel_args.push_back(&mask_ptrs[col_idx]);
+    kernel_args.push_back(&offsets[col_idx]);
+
   }
 
   rmm::cuda_stream_view generic_stream;

From 4b08c51237878e92380a63b795255c41b07c789d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 10 Jun 2021 14:51:26 -0700
Subject: [PATCH 099/132] xfail pow tests due to issue cudf/8470

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 6b50ca8dd7e..7d32daf63f1 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -12,8 +12,7 @@
     operator.truediv,
     operator.floordiv,
     operator.mod,
-    operator.pow
-
+    pytest.param(operator.pow, marks=pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/8470"))
 ]
 
 comparison_ops = [

From 48733b2fe09f2a5870f7f56aacfe55f541f6fe6a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Jun 2021 06:08:19 -0700
Subject: [PATCH 100/132] style fixes

---
 cpp/src/transform/transform.cpp               |   1 -
 python/cudf/cudf/_lib/transform.pyx           |   1 +
 python/cudf/cudf/core/dataframe.py            |   1 -
 python/cudf/cudf/core/frame.py                |  11 +-
 python/cudf/cudf/core/udf/lowering.py         |  75 +++++-----
 python/cudf/cudf/core/udf/pipeline.py         |  22 +--
 python/cudf/cudf/core/udf/typing.py           | 137 ++++++++----------
 python/cudf/cudf/tests/test_udf_masked_ops.py | 135 ++++++++---------
 8 files changed, 181 insertions(+), 202 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 5ed5461662f..fcd2b6da5a4 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -128,7 +128,6 @@ void generalized_operation(table_view data_view,
     kernel_args.push_back(&data_ptrs[col_idx]);
     kernel_args.push_back(&mask_ptrs[col_idx]);
     kernel_args.push_back(&offsets[col_idx]);
-
   }
 
   rmm::cuda_stream_view generic_stream;
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 23854cd3e98..3ba9aac5687 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -122,6 +122,7 @@ def transform(Column input, op):
 
     return Column.from_unique_ptr(move(c_output))
 
+
 def masked_udf(Table incols, op, output_type):
     cdef table_view data_view = incols.data_view()
     cdef string c_str = op.encode("UTF-8")
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 78cff4af763..75ae2fb13db 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4740,7 +4740,6 @@ def apply(self, func, axis=1):
 
         return func(self)
 
-
     @applyutils.doc_apply()
     def apply_rows(
         self,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index c4716d28e86..e1eafae4eca 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1456,14 +1456,15 @@ def _quantiles(
 
     @annotate("APPLY", color="purple", domain="cudf_python")
     def _apply(self, func):
-        '''
-        Apply `func` across the rows of the frame. 
-        '''
-        output_dtype, ptx = cudf.core.udf.pipeline.compile_masked_udf(func, self.dtypes)
+        """
+        Apply `func` across the rows of the frame.
+        """
+        output_dtype, ptx = cudf.core.udf.pipeline.compile_masked_udf(
+            func, self.dtypes
+        )
         result = cudf._lib.transform.masked_udf(self, ptx, output_dtype)
         return result
 
-
     def rank(
         self,
         axis=0,
diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 3fffc8ac972..e18ee37e04a 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -1,18 +1,21 @@
-from . import classes
+import operator
+
+from llvmlite import ir
+from numba.core import cgutils
+from numba.core.typing import signature as nb_signature
 from numba.cuda.cudaimpl import (
     lower as cuda_lower,
+    registry as cuda_impl_registry,
     registry as cuda_lowering_registry,
 )
-from numba.core.typing import signature as nb_signature
-from cudf.core.udf.typing import MaskedType, NAType
-from numba.core import cgutils
-from numba.cuda.cudaimpl import registry as cuda_impl_registry
-import operator
 from numba.extending import lower_builtin, types
-from llvmlite import ir
 
+from cudf.core.udf.typing import MaskedType, NAType
+
+from . import classes
 from ._ops import arith_ops, comparison_ops
 
+
 @cuda_lowering_registry.lower_constant(NAType)
 def constant_na(context, builder, ty, pyval):
     # This handles None, etc.
@@ -25,15 +28,17 @@ def constant_na(context, builder, ty, pyval):
 # the implementation details of how to do that. This is where
 # we can involve both validities in constructing the answer
 
+
 def make_arithmetic_op(op):
-    '''
+    """
     Make closures that implement arithmetic operations. See
     register_arithmetic_op for details.
-    '''
+    """
+
     def masked_scalar_op_impl(context, builder, sig, args):
-        '''
+        """
         Implement `MaskedType` + `MaskedType`
-        '''
+        """
         # MaskedType(...), MaskedType(...)
         masked_type_1, masked_type_2 = sig.args
         # MaskedType(...)
@@ -49,8 +54,9 @@ def masked_scalar_op_impl(context, builder, sig, args):
         )
 
         # we will return an output struct
-        result = cgutils.create_struct_proxy(masked_return_type)(context,
-                                                                 builder)
+        result = cgutils.create_struct_proxy(masked_return_type)(
+            context, builder
+        )
 
         # compute output validity
         valid = builder.and_(m1.valid, m2.valid)
@@ -65,16 +71,17 @@ def masked_scalar_op_impl(context, builder, sig, args):
                 nb_signature(
                     masked_return_type.value_type,
                     masked_type_1.value_type,
-                    masked_type_2.value_type
+                    masked_type_2.value_type,
                 ),
-                (m1.value, m2.value)
+                (m1.value, m2.value),
             )
         return result._getvalue()
+
     return masked_scalar_op_impl
 
 
 def register_arithmetic_op(op):
-    '''
+    """
     Register a lowering implementation for the
     arithmetic op `op`.
 
@@ -85,17 +92,17 @@ def register_arithmetic_op(op):
 
     This function makes and lowers a closure for one op.
 
-    '''
+    """
     to_lower_op = make_arithmetic_op(op)
     cuda_lower(op, MaskedType, MaskedType)(to_lower_op)
 
 
 def masked_scalar_null_op_impl(context, builder, sig, args):
-    '''
+    """
     Implement `MaskedType` + `NAType`
     The answer to this is known up front so no actual addition
     needs to take place
-    '''
+    """
 
     return_type = sig.return_type  # MaskedType(...)
     result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))(
@@ -109,9 +116,9 @@ def masked_scalar_null_op_impl(context, builder, sig, args):
 
 def make_const_op(op):
     def masked_scalar_const_op_impl(context, builder, sig, args):
-        '''
+        """
         Implement `MaskedType` + constant
-        '''
+        """
         masked_type, const_type = sig.args
         masked_value, numeric_value = args
 
@@ -130,17 +137,17 @@ def masked_scalar_const_op_impl(context, builder, sig, args):
                 builder,
                 lambda x, y: op(x, y),
                 nb_signature(
-                    return_type.value_type,
-                    masked_type.value_type,
-                    const_type
+                    return_type.value_type, masked_type.value_type, const_type
                 ),
-                (indata.value, numeric_value)
+                (indata.value, numeric_value),
             )
             result.valid = context.get_constant(types.boolean, 1)
 
         return result._getvalue()
+
     return masked_scalar_const_op_impl
 
+
 def make_reflected_const_op(op):
     def masked_scalar_reflected_const_op_impl(context, builder, sig, args):
         const_type, masked_type = sig.args
@@ -161,17 +168,16 @@ def masked_scalar_reflected_const_op_impl(context, builder, sig, args):
                 builder,
                 lambda x, y: op(x, y),
                 nb_signature(
-                    return_type.value_type,
-                    const_type,
-                    masked_type.value_type
+                    return_type.value_type, const_type, masked_type.value_type
                 ),
-                (numeric_value, indata.value)
+                (numeric_value, indata.value),
             )
             result.valid = context.get_constant(types.boolean, 1)
 
         return result._getvalue()
+
     return masked_scalar_reflected_const_op_impl
-        
+
 
 def register_const_op(op):
     to_lower_op = make_const_op(op)
@@ -193,9 +199,9 @@ def register_const_op(op):
 @cuda_lower(operator.is_, MaskedType, NAType)
 @cuda_lower(operator.is_, NAType, MaskedType)
 def masked_scalar_is_null_impl(context, builder, sig, args):
-    '''
+    """
     Implement `MaskedType` is `NA`
-    '''
+    """
     if isinstance(sig.args[1], NAType):
         masked_type, na = sig.args
         value = args[0]
@@ -255,8 +261,9 @@ def cast_na_to_masked(context, builder, fromty, toty, val):
 @cuda_impl_registry.lower_cast(MaskedType, MaskedType)
 def cast_masked_to_masked(context, builder, fromty, toty, val):
     operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
-    casted = context.cast(builder, operand.value, fromty.value_type,
-                          toty.value_type)
+    casted = context.cast(
+        builder, operand.value, fromty.value_type, toty.value_type
+    )
     ext = cgutils.create_struct_proxy(toty)(context, builder)
     ext.value = casted
     ext.valid = operand.valid
diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py
index bb49182fd94..c7b8be92c00 100644
--- a/python/cudf/cudf/core/udf/pipeline.py
+++ b/python/cudf/cudf/core/udf/pipeline.py
@@ -1,33 +1,32 @@
-from cudf.core.udf.typing import MaskedType
 from numba.np import numpy_support
-from numba import cuda
-from cudf.utils import cudautils
 from nvtx import annotate
 
+from cudf.core.udf.typing import MaskedType
+from cudf.utils import cudautils
+
+
 @annotate("NUMBA JIT", color="green", domain="cudf_python")
 def compile_masked_udf(func, dtypes):
-    '''
+    """
     Generate an inlineable PTX function that will be injected into
     a variadic kernel inside libcudf
 
     assume all input types are `MaskedType(input_col.dtype)` and then
     compile the requestied PTX function as a function over those types
-    '''
+    """
     to_compiler_sig = tuple(
         MaskedType(arg)
         for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes)
     )
     # Get the inlineable PTX function
     ptx, numba_output_type = cudautils.compile_udf(func, to_compiler_sig)
-    numpy_output_type = numpy_support.as_dtype(
-        numba_output_type.value_type
-    )
+    numpy_output_type = numpy_support.as_dtype(numba_output_type.value_type)
 
     return numpy_output_type, ptx
 
 
 def nulludf(func):
-    '''
+    """
     Mimic pandas API:
 
     def f(x, y):
@@ -38,13 +37,16 @@ def f(x, y):
     `DataFrame` sends `self` in as `row` and subsequently
     we end up calling `f` on the resulting columns since
     the dataframe is dict-like
-    '''
+    """
+
     def wrapper(*args):
         from cudf import DataFrame
+
         # This probably creates copies but is fine for now
         to_udf_table = DataFrame(
             {idx: arg for idx, arg in zip(range(len(args)), args)}
         )
         # Frame._apply
         return to_udf_table._apply(func)
+
     return wrapper
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 3ed0d0fd265..726664077c5 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -1,23 +1,33 @@
-from . import classes
+import operator
+
 from numba import types
-from cudf.core.scalar import _NAType
-from numba.core.extending import (typeof_impl, register_model, models,
-                                  make_attribute_wrapper)
-from numba.cuda.cudadecl import registry as cuda_decl_registry
-from numba.core.typing.templates import (AbstractTemplate, AttributeTemplate,
-                                         ConcreteTemplate)
+from numba.core.extending import (
+    make_attribute_wrapper,
+    models,
+    register_model,
+    typeof_impl,
+)
 from numba.core.typing import signature as nb_signature
+from numba.core.typing.templates import (
+    AbstractTemplate,
+    AttributeTemplate,
+    ConcreteTemplate,
+)
 from numba.core.typing.typeof import typeof
+from numba.cuda.cudadecl import registry as cuda_decl_registry
 
+from cudf.core.scalar import _NAType
 
-import operator
+from . import classes
 from ._ops import arith_ops, comparison_ops
 
+
 class MaskedType(types.Type):
-    '''
+    """
     A Numba type consisting of a value of some primitive type
     and a validity boolean, over which we can define math ops
-    '''
+    """
+
     def __init__(self, value):
         # MaskedType in Numba shall be parameterized
         # with a value type
@@ -30,14 +40,14 @@ def __repr__(self):
         return f"MaskedType({self.value_type})"
 
     def __hash__(self):
-        '''
+        """
         Needed so that numba caches type instances with different
         `value_type` separately.
-        '''
+        """
         return self.__repr__().__hash__()
 
     def unify(self, context, other):
-        '''
+        """
         Logic for sorting out what to do when the UDF conditionally
         returns a `MaskedType`, an `NAType`, or a literal based off
         the data at runtime.
@@ -65,14 +75,16 @@ def f(x, y):
         numba now sees this as
         f(x: MaskedType(dtype_1), y: MaskedType(dtype_2))
           -> MaskedType(dtype_unified)
-        '''
+        """
 
         # If we have Masked and NA, the output should be a
         # MaskedType with the original type as its value_type
         if isinstance(other, NAType):
             return self
         elif isinstance(other, MaskedType):
-            return MaskedType(context.unify_pairs(self.value_type, other.value_type))
+            return MaskedType(
+                context.unify_pairs(self.value_type, other.value_type)
+            )
 
         # if we have MaskedType and something that results in a
         # scalar, unify between the MaskedType's value_type
@@ -108,13 +120,15 @@ def typeof_masked(val, c):
 class MaskedConstructor(ConcreteTemplate):
     key = classes.Masked
 
-    cases = [nb_signature(MaskedType(t), t, types.boolean)
-             for t in (types.integer_domain | types.real_domain)]
+    cases = [
+        nb_signature(MaskedType(t), t, types.boolean)
+        for t in (types.integer_domain | types.real_domain)
+    ]
 
 
 # Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`.
-make_attribute_wrapper(MaskedType, 'value', 'value')
-make_attribute_wrapper(MaskedType, 'valid', 'valid')
+make_attribute_wrapper(MaskedType, "value", "value")
+make_attribute_wrapper(MaskedType, "valid", "valid")
 
 
 # Typing for `classes.Masked`
@@ -129,8 +143,9 @@ def resolve_Masked(self, mod):
 # Registration of the global is also needed for Numba to type classes.Masked
 cuda_decl_registry.register_global(classes, types.Module(classes))
 # For typing bare Masked (as in `from .classes import Masked`
-cuda_decl_registry.register_global(classes.Masked,
-                                   types.Function(MaskedConstructor))
+cuda_decl_registry.register_global(
+    classes.Masked, types.Function(MaskedConstructor)
+)
 
 
 # Tell numba how `MaskedType` is constructed on the backend in terms
@@ -146,22 +161,23 @@ def __init__(self, dmm, fe_type):
 
 
 class NAType(types.Type):
-    '''
+    """
     A type for handling ops against nulls
     Exists so we can:
     1. Teach numba that all occurances of `cudf.NA` are
        to be read as instances of this type instead
     2. Define ops like `if x is cudf.NA` where `x` is of
        type `Masked` to mean `if x.valid is False`
-    '''
+    """
+
     def __init__(self):
         super().__init__(name="NA")
 
     def unify(self, context, other):
-        '''
+        """
         Masked  <-> NA works from above
         Literal <-> NA -> Masked
-        '''
+        """
         if isinstance(other, MaskedType):
             # bounce to MaskedType.unify
             return None
@@ -177,11 +193,11 @@ def unify(self, context, other):
 
 @typeof_impl.register(_NAType)
 def typeof_na(val, c):
-    '''
+    """
     Tie instances of _NAType (cudf.NA) to our NAType.
     Effectively make it so numba sees `cudf.NA` as an
     instance of this NAType -> handle it accordingly.
-    '''
+    """
     return na_type
 
 
@@ -199,11 +215,11 @@ def typeof_na(val, c):
 # are parameterized with `value_type` and what flavor of `Masked` to return.
 class MaskedScalarArithOp(AbstractTemplate):
     def generic(self, args, kws):
-        '''
+        """
         Typing for `Masked` + `Masked`
         Numba expects a valid numba type to be returned if typing is successful
         else `None` signifies the error state (this is common across numba)
-        '''
+        """
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
             # In the case of op(Masked, Masked), the return type is a Masked
             # such that Masked.value is the primitive type that would have
@@ -211,41 +227,29 @@ def generic(self, args, kws):
             return_type = self.context.resolve_function_type(
                 self.key, (args[0].value_type, args[1].value_type), kws
             ).return_type
-            return nb_signature(
-                MaskedType(return_type),
-                args[0],
-                args[1],
-            )
+            return nb_signature(MaskedType(return_type), args[0], args[1],)
 
 
 class MaskedScalarNullOp(AbstractTemplate):
     def generic(self, args, kws):
-        '''
+        """
         Typing for `Masked` + `NA`
         Handles situations like `x + cudf.NA`
-        '''
+        """
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
             # In the case of op(Masked, NA), the result has the same
             # dtype as the original regardless of what it is
-            return nb_signature(
-                args[0],
-                args[0],
-                na_type,
-            )
+            return nb_signature(args[0], args[0], na_type,)
         elif isinstance(args[0], NAType) and isinstance(args[1], MaskedType):
-            return nb_signature(
-                args[1],
-                na_type,
-                args[1]
-            )
+            return nb_signature(args[1], na_type, args[1])
 
 
 class MaskedScalarScalarOp(AbstractTemplate):
     def generic(self, args, kws):
-        '''
+        """
         Typing for `Masked` + a scalar.
         handles situations like `x + 1`
-        '''
+        """
         if isinstance(args[0], MaskedType) and isinstance(
             args[1], types.Number
         ):
@@ -254,56 +258,41 @@ def generic(self, args, kws):
             return_type = self.context.resolve_function_type(
                 self.key, (args[0].value_type, args[1]), kws
             ).return_type
-            return nb_signature(
-                MaskedType(return_type),
-                args[0],
-                args[1],
-            )
+            return nb_signature(MaskedType(return_type), args[0], args[1],)
         elif isinstance(args[0], types.Number) and isinstance(
             args[1], MaskedType
         ):
             return_type = self.context.resolve_function_type(
                 self.key, (args[1].value_type, args[0]), kws
             ).return_type
-            return nb_signature(
-                MaskedType(return_type),
-                args[0],
-                args[1],
-            )
+            return nb_signature(MaskedType(return_type), args[0], args[1],)
 
 
 @cuda_decl_registry.register_global(operator.is_)
 class MaskedScalarIsNull(AbstractTemplate):
-    '''
+    """
     Typing for `Masked is cudf.NA`
-    '''
+    """
+
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType) and isinstance(args[1], NAType):
-            return nb_signature(
-                types.boolean,
-                args[0],
-                na_type)
+            return nb_signature(types.boolean, args[0], na_type)
         elif isinstance(args[1], MaskedType) and isinstance(args[0], NAType):
-            return nb_signature(
-                types.boolean,
-                na_type,
-                args[1])
+            return nb_signature(types.boolean, na_type, args[1])
 
 
 @cuda_decl_registry.register_global(operator.truth)
 class MaskedScalarTruth(AbstractTemplate):
-    '''
+    """
     Typing for `if Masked`
     Used for `if x > y`
     The truthiness of a MaskedType shall be the truthiness
     of the `value` stored therein
-    '''
+    """
+
     def generic(self, args, kws):
         if isinstance(args[0], MaskedType):
-            return nb_signature(
-                types.boolean,
-                MaskedType(types.boolean)
-            )
+            return nb_signature(types.boolean, MaskedType(types.boolean))
 
 
 for op in arith_ops + comparison_ops:
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 7d32daf63f1..e1da9c4e73d 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -1,9 +1,11 @@
-import cudf
-from cudf.core.udf.pipeline import nulludf
-from cudf.tests.utils import assert_eq, NUMERIC_TYPES
+import operator
+
 import pandas as pd
 import pytest
-import operator
+
+import cudf
+from cudf.core.udf.pipeline import nulludf
+from cudf.tests.utils import NUMERIC_TYPES, assert_eq
 
 arith_ops = [
     operator.add,
@@ -12,7 +14,12 @@
     operator.truediv,
     operator.floordiv,
     operator.mod,
-    pytest.param(operator.pow, marks=pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/8470"))
+    pytest.param(
+        operator.pow,
+        marks=pytest.mark.xfail(
+            reason="https://github.com/rapidsai/cudf/issues/8470"
+        ),
+    ),
 ]
 
 comparison_ops = [
@@ -21,7 +28,7 @@
     operator.lt,
     operator.le,
     operator.gt,
-    operator.ge
+    operator.ge,
 ]
 
 
@@ -30,21 +37,15 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
     pdf = data.to_pandas(nullable=True)
 
     expect = pdf.apply(
-        lambda row: func_pdf(
-            *[row[i] for i in data.columns]
-        ),
-        axis=1
+        lambda row: func_pdf(*[row[i] for i in data.columns]), axis=1
     )
     obtain = gdf.apply(
-        lambda row: func_gdf(
-            *[row[i] for i in data.columns]
-        ),
-        axis=1
+        lambda row: func_gdf(*[row[i] for i in data.columns]), axis=1
     )
     assert_eq(expect, obtain, **kwargs)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize("op", arith_ops)
 def test_arith_masked_vs_masked(op):
     # This test should test all the typing
     # and lowering for arithmetic ops between
@@ -56,14 +57,11 @@ def func_pdf(x, y):
     def func_gdf(x, y):
         return op(x, y)
 
-    gdf = cudf.DataFrame({
-        'a': [1, None, 3, None],
-        'b': [4, 5, None, None]
-    })
+    gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize('op', comparison_ops)
+@pytest.mark.parametrize("op", comparison_ops)
 def test_compare_masked_vs_masked(op):
     # this test should test all the
     # typing and lowering for comparisons
@@ -78,15 +76,14 @@ def func_gdf(x, y):
 
     # we should get:
     # [?, ?, <NA>, <NA>, <NA>]
-    gdf = cudf.DataFrame({
-        'a': [1, 0, None, 1, None],
-        'b': [0, 1, 0, None, None]
-    })
+    gdf = cudf.DataFrame(
+        {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]}
+    )
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize('op', arith_ops)
-@pytest.mark.parametrize('constant', [1, 1.5])
+@pytest.mark.parametrize("op", arith_ops)
+@pytest.mark.parametrize("constant", [1, 1.5])
 def test_arith_masked_vs_constant(op, constant):
     def func_pdf(x):
         return op(x, constant)
@@ -96,14 +93,13 @@ def func_gdf(x):
         return op(x, constant)
 
     # Just a single column -> result will be all NA
-    gdf = cudf.DataFrame({
-        'data': [1, 2, None]
-    })
+    gdf = cudf.DataFrame({"data": [1, 2, None]})
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
-@pytest.mark.parametrize('op', arith_ops)
-@pytest.mark.parametrize('constant', [1, 1.5])
+
+@pytest.mark.parametrize("op", arith_ops)
+@pytest.mark.parametrize("constant", [1, 1.5])
 def test_arith_masked_vs_constant_reflected(op, constant):
     def func_pdf(x):
         return op(constant, x)
@@ -113,14 +109,12 @@ def func_gdf(x):
         return op(constant, x)
 
     # Just a single column -> result will be all NA
-    gdf = cudf.DataFrame({
-        'data': [1, 2, None]
-    })
+    gdf = cudf.DataFrame({"data": [1, 2, None]})
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize("op", arith_ops)
 def test_arith_masked_vs_null(op):
     def func_pdf(x):
         return op(x, pd.NA)
@@ -129,13 +123,11 @@ def func_pdf(x):
     def func_gdf(x):
         return op(x, cudf.NA)
 
-    gdf = cudf.DataFrame({
-        'data': [1, None, 3]
-    })
+    gdf = cudf.DataFrame({"data": [1, None, 3]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize('op', arith_ops)
+@pytest.mark.parametrize("op", arith_ops)
 def test_arith_masked_vs_null_reflected(op):
     def func_pdf(x):
         return op(pd.NA, x)
@@ -144,9 +136,7 @@ def func_pdf(x):
     def func_gdf(x):
         return op(cudf.NA, x)
 
-    gdf = cudf.DataFrame({
-        'data': [1, None, 3]
-    })
+    gdf = cudf.DataFrame({"data": [1, None, 3]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
@@ -164,21 +154,18 @@ def func_gdf(x, y):
         else:
             return x + y
 
-    gdf = cudf.DataFrame({
-        'a': [1, None, 3, None],
-        'b': [4, 5, None, None]
-    })
+    gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize('dtype_a', list(NUMERIC_TYPES))
-@pytest.mark.parametrize('dtype_b', list(NUMERIC_TYPES))
+@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES))
+@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES))
 def test_apply_mixed_dtypes(dtype_a, dtype_b):
-    '''
-    Test that operations can be performed between columns 
+    """
+    Test that operations can be performed between columns
     of different dtypes and return a column with the correct
     values and nulls
-    '''
+    """
     # TODO: Parameterize over the op here
     def func_pdf(x, y):
         return x + y
@@ -187,25 +174,21 @@ def func_pdf(x, y):
     def func_gdf(x, y):
         return x + y
 
-    gdf = cudf.DataFrame({
-        'a': [1.5, None, 3, None],
-        'b': [4, 5, None, None]
-    })
-    gdf['a'] = gdf['a'].astype(dtype_a)
-    gdf['b'] = gdf['b'].astype(dtype_b)
+    gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]})
+    gdf["a"] = gdf["a"].astype(dtype_a)
+    gdf["b"] = gdf["b"].astype(dtype_b)
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
-@pytest.mark.parametrize('val', [
-    5, 5.5
-])
+@pytest.mark.parametrize("val", [5, 5.5])
 def test_apply_return_literal(val):
-    '''
+    """
     Test unification codepath for scalars and MaskedType
     makes sure that numba knows how to cast a scalar value
     to a MaskedType
-    '''
+    """
+
     def func_pdf(x, y):
         if x is not pd.NA and x < 2:
             return val
@@ -214,23 +197,21 @@ def func_pdf(x, y):
 
     @nulludf
     def func_gdf(x, y):
-        if x  is not cudf.NA and x < 2:
+        if x is not cudf.NA and x < 2:
             return val
         else:
             return x + y
 
-    gdf = cudf.DataFrame({
-        'a': [1, None, 3, None],
-        'b': [4, 5, None, None]
-    })
+    gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]})
 
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
 def test_apply_return_null():
-    '''
+    """
     Tests casting / unification of Masked and NA
-    '''
+    """
+
     def func_pdf(x):
         if x is pd.NA:
             return pd.NA
@@ -244,7 +225,7 @@ def func_gdf(x):
         else:
             return x
 
-    gdf = cudf.DataFrame({'a': [1, None, 3]})
+    gdf = cudf.DataFrame({"a": [1, None, 3]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
@@ -262,7 +243,7 @@ def func_gdf(x):
         else:
             return cudf.NA
 
-    gdf = cudf.DataFrame({'a': [1, 3, 6]})
+    gdf = cudf.DataFrame({"a": [1, 3, 6]})
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)
 
 
@@ -277,7 +258,7 @@ def func_pdf(w, x, y, z):
         elif x + y is pd.NA:
             return 2.5
         else:
-            return (y > 2)
+            return y > 2
 
     @nulludf
     def func_gdf(w, x, y, z):
@@ -290,14 +271,14 @@ def func_gdf(w, x, y, z):
         elif x + y is cudf.NA:
             return 2.5
         else:
-            return (y > 2)
+            return y > 2
 
     gdf = cudf.DataFrame(
         {
-            'a': [1, 3, 6, 0, None, 5, None],
-            'b': [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0],
-            'c': [2, 3, 6, 0, None, 5, None],
-            'd': [4, None, 6, 0, None, 5, None],
+            "a": [1, 3, 6, 0, None, 5, None],
+            "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0],
+            "c": [2, 3, 6, 0, None, 5, None],
+            "d": [4, None, 6, 0, None, 5, None],
         }
     )
     run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False)

From 16018f66086639e3b612adef47640466dd2bc548 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Jun 2021 06:13:56 -0700
Subject: [PATCH 101/132] more style fixes

---
 python/cudf/cudf/core/dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 75ae2fb13db..fa0e381a3e7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4719,7 +4719,7 @@ def apply(self, func, axis=1):
         defined function row wise over a dataframe, with true null
         handling. Works with UDFs using `core.udf.pipeline.nulludf`
         and returns a single series. Uses numba to jit compile the
-        function to PTX via LLVM. 
+        function to PTX via LLVM.
 
         Parameters
         ----------

From c91737eef2c5ea6c031efdca911cc8f11499c167 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 15 Jun 2021 06:19:31 -0700
Subject: [PATCH 102/132] update tests and _ops

---
 python/cudf/cudf/core/udf/_ops.py             |  5 +-
 .../cudf/tests/test_extension_compilation.py  | 88 +++++++++----------
 2 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/python/cudf/cudf/core/udf/_ops.py b/python/cudf/cudf/core/udf/_ops.py
index 0fce3fdbafb..25201356fd9 100644
--- a/python/cudf/cudf/core/udf/_ops.py
+++ b/python/cudf/cudf/core/udf/_ops.py
@@ -7,8 +7,7 @@
     operator.truediv,
     operator.floordiv,
     operator.mod,
-    operator.pow
-
+    operator.pow,
 ]
 
 comparison_ops = [
@@ -17,5 +16,5 @@
     operator.lt,
     operator.le,
     operator.gt,
-    operator.ge
+    operator.ge,
 ]
diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py
index efd0509f475..e527fd0af17 100644
--- a/python/cudf/cudf/tests/test_extension_compilation.py
+++ b/python/cudf/cudf/tests/test_extension_compilation.py
@@ -1,13 +1,12 @@
 import operator
-import pytest
 
+import pytest
 from numba import cuda, types
 from numba.cuda import compile_ptx
 
 from cudf import NA
-from cudf.core.udf.typing import MaskedType
 from cudf.core.udf.classes import Masked
-
+from cudf.core.udf.typing import MaskedType
 
 arith_ops = (
     operator.add,
@@ -25,12 +24,10 @@
     operator.eq,
     operator.ne,
     operator.ge,
-    operator.gt
+    operator.gt,
 )
 
-unary_ops = (
-    operator.truth,
-)
+unary_ops = (operator.truth,)
 
 ops = arith_ops + comparison_ops
 
@@ -57,10 +54,9 @@
 number_ids = tuple(str(t) for t in number_types)
 
 
-@pytest.mark.parametrize('op', unary_ops)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize("op", unary_ops)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
 def test_compile_masked_unary(op, ty):
-
     def func(x):
         return op(x)
 
@@ -68,10 +64,9 @@ def func(x):
     ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
 
 
-@pytest.mark.parametrize('op', arith_ops)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize("op", arith_ops)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
 def test_execute_masked_binary(op, ty):
-
     @cuda.jit(device=True)
     def func(x, y):
         return op(x, y)
@@ -92,21 +87,20 @@ def test_kernel(x, y):
         # Check masks are as expected, and unmasked result matches masked
         # result
         if r0.valid:
-            raise RuntimeError('Expected r0 to be invalid')
+            raise RuntimeError("Expected r0 to be invalid")
         if not r1.valid:
-            raise RuntimeError('Expected r1 to be valid')
+            raise RuntimeError("Expected r1 to be valid")
         if u != r1.value:
-            print('Values: ', u, r1.value)
-            raise RuntimeError('u != r1.value')
+            print("Values: ", u, r1.value)
+            raise RuntimeError("u != r1.value")
 
     test_kernel[1, 1](1, 2)
 
 
-@pytest.mark.parametrize('op', ops)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
-@pytest.mark.parametrize('constant', [1, 1.5])
+@pytest.mark.parametrize("op", ops)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
+@pytest.mark.parametrize("constant", [1, 1.5])
 def test_compile_arith_masked_vs_constant(op, ty, constant):
-
     def func(x):
         return op(x, constant)
 
@@ -120,11 +114,10 @@ def func(x):
     assert resty.value_type == um_resty
 
 
-@pytest.mark.parametrize('op', ops)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
-@pytest.mark.parametrize('constant', [1, 1.5])
+@pytest.mark.parametrize("op", ops)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
+@pytest.mark.parametrize("constant", [1, 1.5])
 def test_compile_arith_constant_vs_masked(op, ty, constant):
-
     def func(x):
         return op(constant, x)
 
@@ -134,10 +127,9 @@ def func(x):
     assert isinstance(resty, MaskedType)
 
 
-@pytest.mark.parametrize('op', ops)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize("op", ops)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
 def test_compile_arith_masked_vs_na(op, ty):
-
     def func(x):
         return op(x, NA)
 
@@ -147,10 +139,9 @@ def func(x):
     assert isinstance(resty, MaskedType)
 
 
-@pytest.mark.parametrize('op', ops)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize("op", ops)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
 def test_compile_arith_na_vs_masked(op, ty):
-
     def func(x):
         return op(NA, x)
 
@@ -158,14 +149,15 @@ def func(x):
     ptx, resty = compile_ptx(func, (MaskedType(ty),), cc=cc, device=True)
 
 
-@pytest.mark.parametrize('op', ops)
-@pytest.mark.parametrize('ty1', number_types, ids=number_ids)
-@pytest.mark.parametrize('ty2', number_types, ids=number_ids)
-@pytest.mark.parametrize('masked', ((False, True), (True, False),
-                                    (True, True)),
-                         ids=('um', 'mu', 'mm'))
+@pytest.mark.parametrize("op", ops)
+@pytest.mark.parametrize("ty1", number_types, ids=number_ids)
+@pytest.mark.parametrize("ty2", number_types, ids=number_ids)
+@pytest.mark.parametrize(
+    "masked",
+    ((False, True), (True, False), (True, True)),
+    ids=("um", "mu", "mm"),
+)
 def test_compile_arith_masked_ops(op, ty1, ty2, masked):
-
     def func(x, y):
         return op(x, y)
 
@@ -187,7 +179,7 @@ def func_na_is_x(x):
     return NA is x
 
 
-@pytest.mark.parametrize('fn', (func_x_is_na, func_na_is_x))
+@pytest.mark.parametrize("fn", (func_x_is_na, func_na_is_x))
 def test_is_na(fn):
 
     valid = Masked(1, True)
@@ -201,10 +193,10 @@ def test_kernel():
         invalid_is_na = device_fn(invalid)
 
         if valid_is_na:
-            raise RuntimeError('Valid masked value is NA and should not be')
+            raise RuntimeError("Valid masked value is NA and should not be")
 
         if not invalid_is_na:
-            raise RuntimeError('Invalid masked value is not NA and should be')
+            raise RuntimeError("Invalid masked value is not NA and should be")
 
     test_kernel[1, 1]()
 
@@ -273,8 +265,8 @@ def func_na_le(x):
 )
 
 
-@pytest.mark.parametrize('fn', na_comparison_funcs)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize("fn", na_comparison_funcs)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
 def test_na_masked_comparisons(fn, ty):
 
     device_fn = cuda.jit(device=True)(fn)
@@ -289,18 +281,18 @@ def test_kernel():
         invalid_cmp_na = device_fn(invalid_masked)
 
         if valid_cmp_na:
-            raise RuntimeError('Valid masked value compared True with NA')
+            raise RuntimeError("Valid masked value compared True with NA")
 
         if invalid_cmp_na:
-            raise RuntimeError('Invalid masked value compared True with NA')
+            raise RuntimeError("Invalid masked value compared True with NA")
 
     test_kernel[1, 1]()
 
 
 # xfail because scalars do not yet cast for a comparison to NA
 @pytest.mark.xfail
-@pytest.mark.parametrize('fn', na_comparison_funcs)
-@pytest.mark.parametrize('ty', number_types, ids=number_ids)
+@pytest.mark.parametrize("fn", na_comparison_funcs)
+@pytest.mark.parametrize("ty", number_types, ids=number_ids)
 def test_na_scalar_comparisons(fn, ty):
 
     device_fn = cuda.jit(device=True)(fn)
@@ -312,6 +304,6 @@ def test_kernel():
         unmasked_cmp_na = device_fn(unmasked)
 
         if unmasked_cmp_na:
-            raise RuntimeError('Unmasked value compared True with NA')
+            raise RuntimeError("Unmasked value compared True with NA")
 
     test_kernel[1, 1]()

From 0da7fc79a51e827bd0488c0787ecc8bf44d4ebc8 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Jun 2021 14:41:05 -0700
Subject: [PATCH 103/132] address reviews

---
 cpp/src/transform/jit/masked_udf_kernel.cu | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index 9d62093e4d2..d733e4e98e1 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,9 +22,6 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
-#include <cudf/wrappers/durations.hpp>
-#include <cudf/wrappers/timestamps.hpp>
-
 #include <transform/jit/operation-udf.hpp>
 
 #include <cudf/types.hpp>

From 9048879de53e8acc3a96578431751f2e97122955 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Jun 2021 14:49:20 -0700
Subject: [PATCH 104/132] fix typing for NA

---
 python/cudf/cudf/core/udf/typing.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 726664077c5..9e403185a16 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -15,8 +15,7 @@
 )
 from numba.core.typing.typeof import typeof
 from numba.cuda.cudadecl import registry as cuda_decl_registry
-
-from cudf.core.scalar import _NAType
+from pandas._libs.missing import NAType as _NAType
 
 from . import classes
 from ._ops import arith_ops, comparison_ops

From 9fa05a32b69a1aabba2d341a511a215e19df7b0e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 17 Jun 2021 14:56:16 -0700
Subject: [PATCH 105/132] minor name change

---
 cpp/src/transform/transform.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index fcd2b6da5a4..a5169dbd94d 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -85,7 +85,7 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
 }
 
 void generalized_operation(table_view data_view,
-                           std::string const& binary_udf,
+                           std::string const& udf,
                            data_type output_type,
                            mutable_column_view outcol_view,
                            mutable_column_view outmsk_view,
@@ -98,7 +98,7 @@ void generalized_operation(table_view data_view,
       .instantiate(template_types);
 
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
-    binary_udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
+    udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
 
   // {size, out_ptr, out_mask_ptr, col0_ptr, col0_mask_ptr, col0_offset, col1_ptr...}
   std::vector<void*> kernel_args;
@@ -197,11 +197,11 @@ std::unique_ptr<column> transform(column_view const& input,
 }
 
 std::unique_ptr<column> generalized_masked_op(table_view data_view,
-                                              std::string const& binary_udf,
+                                              std::string const& udf,
                                               data_type output_type,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return detail::generalized_masked_op(data_view, binary_udf, output_type, mr);
+  return detail::generalized_masked_op(data_view, udf, output_type, mr);
 }
 
 }  // namespace cudf

From b80753462b5fc167b442b11d07f7900ea4302e33 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 23 Jun 2021 16:32:25 -0500
Subject: [PATCH 106/132] Update cpp/src/transform/jit/masked_udf_kernel.cu

Co-authored-by: Mike Wilson <hyperbolic2346@users.noreply.github.com>
---
 cpp/src/transform/jit/masked_udf_kernel.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index d733e4e98e1..c87f9e1630e 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -65,12 +65,12 @@ __global__ void generic_udf_kernel(cudf::size_type size,
                                    bool* out_mask,
                                    Arguments... args)
 {
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
-  int start  = tid + blkid * blksz;
-  int step   = blksz * gridsz;
+  int const tid    = threadIdx.x;
+  int const blkid  = blockIdx.x;
+  int const blksz  = blockDim.x;
+  int const gridsz = gridDim.x;
+  int const start  = tid + blkid * blksz;
+  int const step   = blksz * gridsz;
 
   Masked<TypeOut> output;
   for (cudf::size_type i = start; i < size; i += step) {

From f56ffbb7e786520ce4a998be88806cce84c322f9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 29 Jun 2021 07:08:56 -0700
Subject: [PATCH 107/132] add back missing header

---
 cpp/src/transform/jit/masked_udf_kernel.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index c87f9e1630e..3853201abc8 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -22,6 +22,8 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
+#include <limits>
+
 #include <transform/jit/operation-udf.hpp>
 
 #include <cudf/types.hpp>

From 7f07452f2179c3658ad372979edaef4780064cff Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 29 Jun 2021 13:03:21 -0700
Subject: [PATCH 108/132] revise headers again

---
 cpp/src/transform/jit/masked_udf_kernel.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index 3853201abc8..603802282b9 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -22,13 +22,12 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
-#include <limits>
+#include <cstdint>
 
 #include <transform/jit/operation-udf.hpp>
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
-#include <cudf/wrappers/timestamps.hpp>
 
 #include <cuda/std/tuple>
 #include <tuple>

From 968e91b0192dfb60a9cd8496415f3b4050a31749 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 1 Jul 2021 12:01:37 -0700
Subject: [PATCH 109/132] update docstring with examples

---
 python/cudf/cudf/core/dataframe.py | 125 ++++++++++++++++++++++++++++-
 1 file changed, 124 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c39eac7fd4f..108129f2221 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4718,13 +4718,136 @@ def apply(self, func, axis=1):
               Note: axis=0 is not yet supported.
             * 1 or 'columns': apply function to each row.
 
+        Examples
+        ----------
+
+        Simple function of a single variable which could be NA
+
+        >>> from cudf.core.udf.pipeline import nulludf
+        >>> @nulludf
+        ... def f(x):
+        ...     if x is cudf.NA:
+        ...             return 0
+        ...     else:
+        ...             return x + 1
+        ... 
+        >>> df = cudf.DataFrame({'a': [1, cudf.NA, 3]})
+        >>> df.apply(lambda row: f(row['a']))
+        0    2
+        1    0
+        2    4
+        dtype: int64
+
+        Function of multiple variables will operate in 
+        a null aware manner
+
+        >>> @nulludf
+        ... def f(x, y):
+        ...     return x - y
+        ... 
+        >>> df = cudf.DataFrame({
+        ...     'a': [1, cudf.NA, 3, cudf.NA],
+        ...     'b': [5, 6, cudf.NA, cudf.NA]
+        ... })
+        >>> df.apply(lambda row: f(row['a'], row['b']))
+        0      -4
+        1    <NA>
+        2    <NA>
+        3    <NA>
+        dtype: int64
+
+        Functions may conditionally return NA as in pandas
+
+        >>> @nulludf
+        ... def f(x, y):
+        ...     if x + y > 3:
+        ...             return cudf.NA
+        ...     else:
+        ...             return x + y
+        ... 
+        >>> df = cudf.DataFrame({
+        ...     'a': [1, 2, 3],
+        ...     'b': [2, 1, 1]
+        ... })
+        >>> df.apply(lambda row: f(row['a'], row['b']))
+        0       3
+        1       3
+        2    <NA>
+        dtype: int64        
+
+        Mixed types are allowed, but will return the common
+        type, rather than object as in pandas
+
+        >>> @nulludf
+        ... def f(x, y):
+        ...     return x + y
+        ... 
+        >>> df = cudf.DataFrame({
+        ...     'a': [1, 2, 3],
+        ...     'b': [0.5, cudf.NA, 3.14]
+        ... })
+        >>> df.apply(lambda row: f(row['a'], row['b']))
+        0     1.5
+        1    <NA>
+        2    6.14
+        dtype: float64
+
+        Functions may also return scalar values, however the 
+        result will be promoted to a safe type regardless of
+        the data
+
+        >>> @nulludf
+        ... def f(x):
+        ...     if x > 3:
+        ...             return x
+        ...     else:
+        ...             return 1.5
+        ... 
+        >>> df = cudf.DataFrame({
+        ...     'a': [1, 3, 5]
+        ... })
+        >>> df.apply(lambda row: f(row['a']))
+        0    1.5
+        1    1.5
+        2    5.0
+        dtype: float64
+
+        Ops against N columns are supported generally
+
+        >>> @nulludf
+        ... def f(v, w, x, y, z):
+        ...     return x + (y - (z / w)) % v
+        ... 
+        >>> df = cudf.DataFrame({
+        ...     'a': [1, 2, 3],
+        ...     'b': [4, 5, 6],
+        ...     'c': [cudf.NA, 4, 4],
+        ...     'd': [8, 7, 8],
+        ...     'e': [7, 1, 6]
+        ... })
+        >>> df.apply(
+        ...     lambda row: f(
+        ...             row['a'],
+        ...             row['b'],
+        ...             row['c'],
+        ...             row['d'],
+        ...             row['e']
+        ...     )
+        ... )
+        0    <NA>
+        1     4.8
+        2     5.0
+        dtype: float64
+
         """
+
+
         if axis != 1:
             raise ValueError(
                 "DataFrame.apply currently only supports row wise ops"
             )
 
-        return func(self)
+        return cudf.Series(func(self))
 
     @applyutils.doc_apply()
     def apply_rows(

From 699239dc66249cc46bb1b4b59ce454ba081bae61 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 1 Jul 2021 12:23:36 -0700
Subject: [PATCH 110/132] add error checking

---
 python/cudf/cudf/core/dataframe.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 108129f2221..11644aa026d 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4730,7 +4730,7 @@ def apply(self, func, axis=1):
         ...             return 0
         ...     else:
         ...             return x + 1
-        ... 
+        ...
         >>> df = cudf.DataFrame({'a': [1, cudf.NA, 3]})
         >>> df.apply(lambda row: f(row['a']))
         0    2
@@ -4738,13 +4738,13 @@ def apply(self, func, axis=1):
         2    4
         dtype: int64
 
-        Function of multiple variables will operate in 
+        Function of multiple variables will operate in
         a null aware manner
 
         >>> @nulludf
         ... def f(x, y):
         ...     return x - y
-        ... 
+        ...
         >>> df = cudf.DataFrame({
         ...     'a': [1, cudf.NA, 3, cudf.NA],
         ...     'b': [5, 6, cudf.NA, cudf.NA]
@@ -4773,7 +4773,7 @@ def apply(self, func, axis=1):
         0       3
         1       3
         2    <NA>
-        dtype: int64        
+        dtype: int64
 
         Mixed types are allowed, but will return the common
         type, rather than object as in pandas
@@ -4781,7 +4781,7 @@ def apply(self, func, axis=1):
         >>> @nulludf
         ... def f(x, y):
         ...     return x + y
-        ... 
+        ...
         >>> df = cudf.DataFrame({
         ...     'a': [1, 2, 3],
         ...     'b': [0.5, cudf.NA, 3.14]
@@ -4792,7 +4792,7 @@ def apply(self, func, axis=1):
         2    6.14
         dtype: float64
 
-        Functions may also return scalar values, however the 
+        Functions may also return scalar values, however the
         result will be promoted to a safe type regardless of
         the data
 
@@ -4802,7 +4802,7 @@ def apply(self, func, axis=1):
         ...             return x
         ...     else:
         ...             return 1.5
-        ... 
+        ...
         >>> df = cudf.DataFrame({
         ...     'a': [1, 3, 5]
         ... })
@@ -4817,7 +4817,7 @@ def apply(self, func, axis=1):
         >>> @nulludf
         ... def f(v, w, x, y, z):
         ...     return x + (y - (z / w)) % v
-        ... 
+        ...
         >>> df = cudf.DataFrame({
         ...     'a': [1, 2, 3],
         ...     'b': [4, 5, 6],
@@ -4841,6 +4841,15 @@ def apply(self, func, axis=1):
 
         """
 
+        for dtype in self.dtypes:
+            if (
+                isinstance(dtype, cudf.core.dtypes._BaseDtype)
+                or dtype == "object"
+            ):
+                raise TypeError(
+                    "DataFrame.apply currently only "
+                    "supports non decimal numeric types"
+                )
 
         if axis != 1:
             raise ValueError(

From 2d071522d9c0b4d0abe5ebe09e4aeead7eb3704c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 1 Jul 2021 14:09:32 -0700
Subject: [PATCH 111/132] Address reviews

---
 python/cudf/cudf/core/udf/classes.py  | 11 +++++++++++
 python/cudf/cudf/core/udf/lowering.py | 27 ++++++++++++++++++---------
 python/cudf/cudf/core/udf/typing.py   | 11 +++++------
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/udf/classes.py b/python/cudf/cudf/core/udf/classes.py
index 1e979253fba..b3b6f2cbbdf 100644
--- a/python/cudf/cudf/core/udf/classes.py
+++ b/python/cudf/cudf/core/udf/classes.py
@@ -1,4 +1,15 @@
 class Masked:
+    """
+    Most of the time, MaskedType as defined in typing.py
+    combined with the ops defined to operate on them are
+    enough to fulfill the obligations of DataFrame.apply
+    However sometimes we need to refer to an instance of
+    a masked scalar outside the context of a UDF like as
+    a global variable. To get numba to identify that var
+    a of type MaskedType and treat it as such we need to
+    have an actual python class we can tie to MaskedType
+    This is that class
+    """
     def __init__(self, value, valid):
         self.value = value
         self.valid = valid
diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index e18ee37e04a..1cda3903fe4 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -5,7 +5,6 @@
 from numba.core.typing import signature as nb_signature
 from numba.cuda.cudaimpl import (
     lower as cuda_lower,
-    registry as cuda_impl_registry,
     registry as cuda_lowering_registry,
 )
 from numba.extending import lower_builtin, types
@@ -120,7 +119,7 @@ def masked_scalar_const_op_impl(context, builder, sig, args):
         Implement `MaskedType` + constant
         """
         masked_type, const_type = sig.args
-        masked_value, numeric_value = args
+        masked_value, const_value = args
 
         return_type = sig.return_type
         masked_input_type = MaskedType(masked_type.value_type)
@@ -139,7 +138,7 @@ def masked_scalar_const_op_impl(context, builder, sig, args):
                 nb_signature(
                     return_type.value_type, masked_type.value_type, const_type
                 ),
-                (indata.value, numeric_value),
+                (indata.value, const_value),
             )
             result.valid = context.get_constant(types.boolean, 1)
 
@@ -151,7 +150,7 @@ def masked_scalar_const_op_impl(context, builder, sig, args):
 def make_reflected_const_op(op):
     def masked_scalar_reflected_const_op_impl(context, builder, sig, args):
         const_type, masked_type = sig.args
-        numeric_value, masked_value = args
+        const_value, masked_value = args
 
         return_type = sig.return_type
         masked_input_type = MaskedType(masked_type.value_type)
@@ -170,7 +169,7 @@ def masked_scalar_reflected_const_op_impl(context, builder, sig, args):
                 nb_signature(
                     return_type.value_type, const_type, masked_type.value_type
                 ),
-                (numeric_value, indata.value),
+                (const_value, indata.value),
             )
             result.valid = context.get_constant(types.boolean, 1)
 
@@ -241,7 +240,7 @@ def masked_scalar_bool_impl(context, builder, sig, args):
 # To handle the unification, we need to support casting from any type to an
 # extension type. The cast implementation takes the value passed in and returns
 # an extension struct wrapping that value.
-@cuda_impl_registry.lower_cast(types.Any, MaskedType)
+@cuda_lowering_registry.lower_cast(types.Any, MaskedType)
 def cast_primitive_to_masked(context, builder, fromty, toty, val):
     casted = context.cast(builder, val, fromty, toty.value_type)
     ext = cgutils.create_struct_proxy(toty)(context, builder)
@@ -250,7 +249,7 @@ def cast_primitive_to_masked(context, builder, fromty, toty, val):
     return ext._getvalue()
 
 
-@cuda_impl_registry.lower_cast(NAType, MaskedType)
+@cuda_lowering_registry.lower_cast(NAType, MaskedType)
 def cast_na_to_masked(context, builder, fromty, toty, val):
     result = cgutils.create_struct_proxy(toty)(context, builder)
     result.valid = context.get_constant(types.boolean, 0)
@@ -258,8 +257,18 @@ def cast_na_to_masked(context, builder, fromty, toty, val):
     return result._getvalue()
 
 
-@cuda_impl_registry.lower_cast(MaskedType, MaskedType)
+@cuda_lowering_registry.lower_cast(MaskedType, MaskedType)
 def cast_masked_to_masked(context, builder, fromty, toty, val):
+    """
+    When numba encounters an op that expects a certain type and
+    the input to the op is not of the expected type it will try
+    to cast the input to the appropriate type. But, in our case
+    the input may be a MaskedType, which numba doesn't natively
+    know how to cast to a different MaskedType with a different
+    `value_type`. This implements and registers that cast.
+    """
+
+    # We will 
     operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
     casted = context.cast(
         builder, operand.value, fromty.value_type, toty.value_type
@@ -281,7 +290,7 @@ def masked_constructor(context, builder, sig, args):
     return masked._getvalue()
 
 
-@cuda_impl_registry.lower_constant(MaskedType)
+@cuda_lowering_registry.lower_constant(MaskedType)
 def lower_constant_masked(context, builder, ty, val):
     masked = cgutils.create_struct_proxy(ty)(context, builder)
     masked.value = context.get_constant(ty.value_type, val.value)
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 9e403185a16..2ed86d635de 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -30,13 +30,12 @@ class MaskedType(types.Type):
     def __init__(self, value):
         # MaskedType in Numba shall be parameterized
         # with a value type
-        super().__init__(name="Masked")
-        if isinstance(value, MaskedType):
-            breakpoint()
+        if not isinstance(value, (types.Number, types.Boolean)):
+            raise TypeError(
+                "value_type must be a numeric scalar type"
+            )
         self.value_type = value
-
-    def __repr__(self):
-        return f"MaskedType({self.value_type})"
+        super().__init__(name=f"Masked{self.value_type}")
 
     def __hash__(self):
         """

From 95098e6f1f40826100560ea0bc212b213342c170 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 1 Jul 2021 16:17:59 -0500
Subject: [PATCH 112/132] Apply suggestions from code review

Co-authored-by: Graham Markall <535640+gmarkall@users.noreply.github.com>
---
 python/cudf/cudf/core/udf/lowering.py | 17 +++++++++--------
 python/cudf/cudf/core/udf/typing.py   | 22 ++++++++++++----------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 1cda3903fe4..e5684a69f80 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -36,7 +36,7 @@ def make_arithmetic_op(op):
 
     def masked_scalar_op_impl(context, builder, sig, args):
         """
-        Implement `MaskedType` + `MaskedType`
+        Implement `MaskedType` <op> `MaskedType`
         """
         # MaskedType(...), MaskedType(...)
         masked_type_1, masked_type_2 = sig.args
@@ -61,7 +61,7 @@ def masked_scalar_op_impl(context, builder, sig, args):
         valid = builder.and_(m1.valid, m2.valid)
         result.valid = valid
         with builder.if_then(valid):
-            # Let numba handle generating the extra LLVM needed to perform
+            # Let numba handle generating the extra IR needed to perform
             # operations on mixed types, by compiling the final core op between
             # the two primitive values as a separate function and calling it
             result.value = context.compile_internal(
@@ -98,8 +98,9 @@ def register_arithmetic_op(op):
 
 def masked_scalar_null_op_impl(context, builder, sig, args):
     """
-    Implement `MaskedType` + `NAType`
-    The answer to this is known up front so no actual addition
+    Implement `MaskedType` <op> `NAType`
+    or `NAType` <op> `MaskedType`
+    The answer to this is known up front so no actual operation
     needs to take place
     """
 
@@ -116,7 +117,7 @@ def masked_scalar_null_op_impl(context, builder, sig, args):
 def make_const_op(op):
     def masked_scalar_const_op_impl(context, builder, sig, args):
         """
-        Implement `MaskedType` + constant
+        Implement `MaskedType` <op> constant
         """
         masked_type, const_type = sig.args
         masked_value, const_value = args
@@ -237,9 +238,9 @@ def masked_scalar_bool_impl(context, builder, sig, args):
     return indata.value
 
 
-# To handle the unification, we need to support casting from any type to an
-# extension type. The cast implementation takes the value passed in and returns
-# an extension struct wrapping that value.
+# To handle the unification, we need to support casting from any type to a
+# masked type. The cast implementation takes the value passed in and returns
+# a masked type struct wrapping that value.
 @cuda_lowering_registry.lower_cast(types.Any, MaskedType)
 def cast_primitive_to_masked(context, builder, fromty, toty, val):
     casted = context.cast(builder, val, fromty, toty.value_type)
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 2ed86d635de..53c34abf1af 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -173,7 +173,7 @@ def __init__(self):
 
     def unify(self, context, other):
         """
-        Masked  <-> NA works from above
+        Masked  <-> NA is deferred to MaskedType.unify()
         Literal <-> NA -> Masked
         """
         if isinstance(other, MaskedType):
@@ -202,30 +202,32 @@ def typeof_na(val, c):
 register_model(NAType)(models.OpaqueModel)
 
 
-# Ultimately, we want numba to produce PTX code that specifies how to add
-# two singular `Masked` structs together, which is defined as producing a
+# Ultimately, we want numba to produce PTX code that specifies how to implement
+# an operation on two singular `Masked` structs together, which is defined as producing a
 # new `Masked` with the right validity and if valid, the correct value.
 # This happens in two phases:
-#   1. Specify that `Masked` + `Masked` exists and what it should return
+#   1. Specify that `Masked` <op> `Masked` exists and what it should return
 #   2. Implement how to actually do (1) at the LLVM level
 # The following code accomplishes (1) - it is really just a way of specifying
-# that the `+` operation has a CUDA overload that accepts two `Masked` that
+# that the <op> has a CUDA overload that accepts two `Masked` that
 # are parameterized with `value_type` and what flavor of `Masked` to return.
 class MaskedScalarArithOp(AbstractTemplate):
     def generic(self, args, kws):
         """
-        Typing for `Masked` + `Masked`
+        Typing for `Masked` <op> `Masked`
         Numba expects a valid numba type to be returned if typing is successful
-        else `None` signifies the error state (this is common across numba)
+        else `None` signifies the error state (this pattern is commonly used
+        in Numba)
         """
         if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType):
             # In the case of op(Masked, Masked), the return type is a Masked
             # such that Masked.value is the primitive type that would have
-            # been resolved if we were just adding the `value_type`s.
+            # been resolved if we were just operating on the
+            # `value_type`s.
             return_type = self.context.resolve_function_type(
                 self.key, (args[0].value_type, args[1].value_type), kws
             ).return_type
-            return nb_signature(MaskedType(return_type), args[0], args[1],)
+            return nb_signature(MaskedType(return_type), args[0], args[1])
 
 
 class MaskedScalarNullOp(AbstractTemplate):
@@ -245,7 +247,7 @@ def generic(self, args, kws):
 class MaskedScalarScalarOp(AbstractTemplate):
     def generic(self, args, kws):
         """
-        Typing for `Masked` + a scalar.
+        Typing for `Masked` <op> a scalar (and vice-versa).
         handles situations like `x + 1`
         """
         if isinstance(args[0], MaskedType) and isinstance(

From b724410ee7c62f54bcc8ec1e20977a7eaec88764 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 1 Jul 2021 14:24:42 -0700
Subject: [PATCH 113/132] address more reviews

---
 python/cudf/cudf/core/udf/lowering.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index e5684a69f80..67ded626788 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -290,7 +290,8 @@ def masked_constructor(context, builder, sig, args):
     masked.valid = valid
     return masked._getvalue()
 
-
+# Allows us to make an instance of MaskedType a global variable
+# and properly use it inside functions we will later compile
 @cuda_lowering_registry.lower_constant(MaskedType)
 def lower_constant_masked(context, builder, ty, val):
     masked = cgutils.create_struct_proxy(ty)(context, builder)

From 593cbd290d504b76fa446624cab7d25b1ea47b84 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 1 Jul 2021 14:43:41 -0700
Subject: [PATCH 114/132] simplify masked/unmasked typing logic

---
 python/cudf/cudf/core/udf/typing.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 53c34abf1af..706b8b2b52c 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -250,22 +250,20 @@ def generic(self, args, kws):
         Typing for `Masked` <op> a scalar (and vice-versa).
         handles situations like `x + 1`
         """
+        # In the case of op(Masked, scalar), we resolve the type between
+        # the Masked value_type and the scalar's type directly
         if isinstance(args[0], MaskedType) and isinstance(
             args[1], types.Number
         ):
-            # In the case of op(Masked, scalar), we resolve the type between
-            # the Masked value_type and the scalar's type directly
-            return_type = self.context.resolve_function_type(
-                self.key, (args[0].value_type, args[1]), kws
-            ).return_type
-            return nb_signature(MaskedType(return_type), args[0], args[1],)
+            to_resolve_types = (args[0].value_type, args[1])
         elif isinstance(args[0], types.Number) and isinstance(
             args[1], MaskedType
         ):
-            return_type = self.context.resolve_function_type(
-                self.key, (args[1].value_type, args[0]), kws
-            ).return_type
-            return nb_signature(MaskedType(return_type), args[0], args[1],)
+            to_resolve_types = (args[1].value_type, args[0])
+        return_type = self.context.resolve_function_type(
+            self.key, to_resolve_types, kws
+        ).return_type
+        return nb_signature(MaskedType(return_type), args[0], args[1],)
 
 
 @cuda_decl_registry.register_global(operator.is_)

From a31c15a39252c2b3549dc54de118f2947c8e3069 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 2 Jul 2021 07:04:34 -0700
Subject: [PATCH 115/132] style fixes

---
 python/cudf/cudf/core/dataframe.py    |  2 +-
 python/cudf/cudf/core/udf/lowering.py |  3 ++-
 python/cudf/cudf/core/udf/typing.py   | 10 ++++------
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 11644aa026d..0842779cf26 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4764,7 +4764,7 @@ def apply(self, func, axis=1):
         ...             return cudf.NA
         ...     else:
         ...             return x + y
-        ... 
+        ...
         >>> df = cudf.DataFrame({
         ...     'a': [1, 2, 3],
         ...     'b': [2, 1, 1]
diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index 67ded626788..c490d216ede 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -269,7 +269,7 @@ def cast_masked_to_masked(context, builder, fromty, toty, val):
     `value_type`. This implements and registers that cast.
     """
 
-    # We will 
+    # We will
     operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
     casted = context.cast(
         builder, operand.value, fromty.value_type, toty.value_type
@@ -290,6 +290,7 @@ def masked_constructor(context, builder, sig, args):
     masked.valid = valid
     return masked._getvalue()
 
+
 # Allows us to make an instance of MaskedType a global variable
 # and properly use it inside functions we will later compile
 @cuda_lowering_registry.lower_constant(MaskedType)
diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index 706b8b2b52c..a1f03b854ed 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -31,9 +31,7 @@ def __init__(self, value):
         # MaskedType in Numba shall be parameterized
         # with a value type
         if not isinstance(value, (types.Number, types.Boolean)):
-            raise TypeError(
-                "value_type must be a numeric scalar type"
-            )
+            raise TypeError("value_type must be a numeric scalar type")
         self.value_type = value
         super().__init__(name=f"Masked{self.value_type}")
 
@@ -203,9 +201,9 @@ def typeof_na(val, c):
 
 
 # Ultimately, we want numba to produce PTX code that specifies how to implement
-# an operation on two singular `Masked` structs together, which is defined as producing a
-# new `Masked` with the right validity and if valid, the correct value.
-# This happens in two phases:
+# an operation on two singular `Masked` structs together, which is defined
+# as producing a new `Masked` with the right validity and if valid,
+# the correct value. This happens in two phases:
 #   1. Specify that `Masked` <op> `Masked` exists and what it should return
 #   2. Implement how to actually do (1) at the LLVM level
 # The following code accomplishes (1) - it is really just a way of specifying

From 448e4ea07469d66f28ac1d90d0b9897d50a3ce5d Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 2 Jul 2021 13:03:23 -0700
Subject: [PATCH 116/132] refactor lowering for reflected const ops

---
 python/cudf/cudf/core/udf/lowering.py | 80 +++++++++------------------
 1 file changed, 26 insertions(+), 54 deletions(-)

diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py
index c490d216ede..1467a61f215 100644
--- a/python/cudf/cudf/core/udf/lowering.py
+++ b/python/cudf/cudf/core/udf/lowering.py
@@ -116,75 +116,47 @@ def masked_scalar_null_op_impl(context, builder, sig, args):
 
 def make_const_op(op):
     def masked_scalar_const_op_impl(context, builder, sig, args):
-        """
-        Implement `MaskedType` <op> constant
-        """
-        masked_type, const_type = sig.args
-        masked_value, const_value = args
-
         return_type = sig.return_type
-        masked_input_type = MaskedType(masked_type.value_type)
-        indata = cgutils.create_struct_proxy(masked_input_type)(
-            context, builder, value=masked_value
-        )
-        masked_return_type = MaskedType(return_type.value_type)
-        result = cgutils.create_struct_proxy(masked_return_type)(
-            context, builder
-        )
+        result = cgutils.create_struct_proxy(return_type)(context, builder)
         result.valid = context.get_constant(types.boolean, 0)
-        with builder.if_then(indata.valid):
-            result.value = context.compile_internal(
-                builder,
-                lambda x, y: op(x, y),
-                nb_signature(
-                    return_type.value_type, masked_type.value_type, const_type
-                ),
-                (indata.value, const_value),
-            )
-            result.valid = context.get_constant(types.boolean, 1)
-
-        return result._getvalue()
-
-    return masked_scalar_const_op_impl
+        if isinstance(sig.args[0], MaskedType):
+            masked_type, const_type = sig.args
+            masked_value, const_value = args
 
-
-def make_reflected_const_op(op):
-    def masked_scalar_reflected_const_op_impl(context, builder, sig, args):
-        const_type, masked_type = sig.args
-        const_value, masked_value = args
-
-        return_type = sig.return_type
-        masked_input_type = MaskedType(masked_type.value_type)
-        indata = cgutils.create_struct_proxy(masked_input_type)(
-            context, builder, value=masked_value
-        )
-        masked_return_type = MaskedType(return_type.value_type)
-        result = cgutils.create_struct_proxy(masked_return_type)(
-            context, builder
-        )
-        result.valid = context.get_constant(types.boolean, 0)
+            indata = cgutils.create_struct_proxy(masked_type)(
+                context, builder, value=masked_value
+            )
+            nb_sig = nb_signature(
+                return_type.value_type, masked_type.value_type, const_type
+            )
+            compile_args = (indata.value, const_value)
+        else:
+            const_type, masked_type = sig.args
+            const_value, masked_value = args
+            indata = cgutils.create_struct_proxy(masked_type)(
+                context, builder, value=masked_value
+            )
+            nb_sig = nb_signature(
+                return_type.value_type, const_type, masked_type.value_type
+            )
+            compile_args = (const_value, indata.value)
         with builder.if_then(indata.valid):
             result.value = context.compile_internal(
-                builder,
-                lambda x, y: op(x, y),
-                nb_signature(
-                    return_type.value_type, const_type, masked_type.value_type
-                ),
-                (const_value, indata.value),
+                builder, lambda x, y: op(x, y), nb_sig, compile_args
             )
             result.valid = context.get_constant(types.boolean, 1)
-
         return result._getvalue()
 
-    return masked_scalar_reflected_const_op_impl
+    return masked_scalar_const_op_impl
 
 
 def register_const_op(op):
     to_lower_op = make_const_op(op)
     cuda_lower(op, MaskedType, types.Number)(to_lower_op)
+    cuda_lower(op, types.Number, MaskedType)(to_lower_op)
 
-    to_lower_op_reflected = make_reflected_const_op(op)
-    cuda_lower(op, types.Number, MaskedType)(to_lower_op_reflected)
+    # to_lower_op_reflected = make_reflected_const_op(op)
+    # cuda_lower(op, types.Number, MaskedType)(to_lower_op_reflected)
 
 
 # register all lowering at init

From 6780814ece25783a26b4b6bd8bac5f4c2cee49ac Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Fri, 2 Jul 2021 13:29:13 -0700
Subject: [PATCH 117/132] cleanup

---
 python/cudf/cudf/core/udf/classes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cudf/cudf/core/udf/classes.py b/python/cudf/cudf/core/udf/classes.py
index b3b6f2cbbdf..fe2fbd9daad 100644
--- a/python/cudf/cudf/core/udf/classes.py
+++ b/python/cudf/cudf/core/udf/classes.py
@@ -10,6 +10,7 @@ class Masked:
     have an actual python class we can tie to MaskedType
     This is that class
     """
+
     def __init__(self, value, valid):
         self.value = value
         self.valid = valid

From 6bf3cf533c33aad4ce659a1f5592a66613ab27c3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 6 Jul 2021 07:22:04 -0700
Subject: [PATCH 118/132] fix import and address reviews

---
 python/cudf/cudf/core/udf/typing.py           | 46 +++++++++----------
 python/cudf/cudf/tests/test_udf_masked_ops.py |  2 +-
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py
index a1f03b854ed..6e026412f24 100644
--- a/python/cudf/cudf/core/udf/typing.py
+++ b/python/cudf/cudf/core/udf/typing.py
@@ -44,39 +44,35 @@ def __hash__(self):
 
     def unify(self, context, other):
         """
-        Logic for sorting out what to do when the UDF conditionally
-        returns a `MaskedType`, an `NAType`, or a literal based off
-        the data at runtime.
-
-        In this framework, every input column is treated as having
-        type `MaskedType`. Operations like `x + y` are understood
-        as translating to:
-
-        `Masked(value=x, valid=True) + Masked(value=y, valid=True)`
-
-        This means if the user writes a function such as
-        def f(x, y):
-            return x + y
-
-        numba sees this function as:
-        f(x: MaskedType, y: MaskedType) -> MaskedType
-
-        However if the user writes something like:
-        def f(x, y):
-            if x > 5:
-                return 42
+        Often within a UDF an instance arises where a variable could
+        be a `MaskedType`, an `NAType`, or a literal based off
+        the data at runtime, for example, the variable `ret` here:
+
+        def f(x):
+            if x == 1:
+                ret = x
+            elif x > 2:
+                ret = 1
             else:
-                return x + y
+                ret = cudf.NA
+            return ret
+
+        When numba analyzes this function it will eventually figure
+        out that the variable `ret` could be any of the three types
+        from above. This scenario will only work if numba knows how
+        to find some kind of common type between the possibilities,
+        and this function implements that - the goal is to return a
+        common type when comparing `self` to other.
 
-        numba now sees this as
-        f(x: MaskedType(dtype_1), y: MaskedType(dtype_2))
-          -> MaskedType(dtype_unified)
         """
 
         # If we have Masked and NA, the output should be a
         # MaskedType with the original type as its value_type
         if isinstance(other, NAType):
             return self
+
+        # two MaskedType unify to a new MaskedType whose value_type
+        # is the result of unifying `self` and `other` `value_type`
         elif isinstance(other, MaskedType):
             return MaskedType(
                 context.unify_pairs(self.value_type, other.value_type)
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index e1da9c4e73d..5c89203255f 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -5,7 +5,7 @@
 
 import cudf
 from cudf.core.udf.pipeline import nulludf
-from cudf.tests.utils import NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, assert_eq
 
 arith_ops = [
     operator.add,

From 6ed7a49e325e0149c055a55278b8bbb2f9423ea0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 7 Jul 2021 13:35:00 -0700
Subject: [PATCH 119/132] capture libcudacxx version for debugging

---
 ci/gpu/build.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 355b18f4543..7359bebcb25 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -214,6 +214,8 @@ fi
 ################################################################################
 # TEST - Run py.test, notebooks
 ################################################################################
+OUTPUT=$(cat /opt/conda/envs/rapids/include/libcudf/libcudacxx/cuda/std/detail/__config)
+gpuci_logger "${OUTPUT}"
 
 cd "$WORKSPACE/python/cudf"
 gpuci_logger "Python py.test for cuDF"

From 4ab7bd8fd6563fde0b16a1b6fa696b4bea257028 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 12 Jul 2021 07:08:04 -0700
Subject: [PATCH 120/132] error for cuda<11.1

---
 python/cudf/cudf/core/dataframe.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 5c193fe078c..b7dc905fd2a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4839,8 +4839,19 @@ def apply(self, func, axis=1):
         2     5.0
         dtype: float64
 
+        Notes
+        ----------
+        Available only using cuda 11.1+ due to particular required
+        runtime compilation features
+
         """
 
+        # libcudacxx tuples are not compatible with nvrtc 11.0
+        runtime = cuda.cudadrv.runtime.Runtime()
+        mjr, mnr = runtime.get_version()
+        if mjr < 11 or (mjr == 11 and mnr < 1):
+            raise RuntimeError("DataFrame.apply requires CUDA 11.1+")
+
         for dtype in self.dtypes:
             if (
                 isinstance(dtype, cudf.core.dtypes._BaseDtype)

From 1ffce5b01fb6a68e246616644120c642b60726ff Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 12 Jul 2021 07:09:31 -0700
Subject: [PATCH 121/132] remove CI debugging

---
 ci/gpu/build.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 7359bebcb25..355b18f4543 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -214,8 +214,6 @@ fi
 ################################################################################
 # TEST - Run py.test, notebooks
 ################################################################################
-OUTPUT=$(cat /opt/conda/envs/rapids/include/libcudf/libcudacxx/cuda/std/detail/__config)
-gpuci_logger "${OUTPUT}"
 
 cd "$WORKSPACE/python/cudf"
 gpuci_logger "Python py.test for cuDF"

From 169bcf2d2205ba6165dad8cc2f24d1dda6066f05 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Mon, 12 Jul 2021 11:26:00 -0700
Subject: [PATCH 122/132] skip testing cuda 11.0

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 5c89203255f..3ecdc15a4a8 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -2,6 +2,7 @@
 
 import pandas as pd
 import pytest
+from numba import cuda
 
 import cudf
 from cudf.core.udf.pipeline import nulludf
@@ -33,6 +34,13 @@
 
 
 def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
+
+    # Skip testing CUDA 11.0
+    runtime = cuda.cudadrv.runtime.Runtime()
+    mjr, mnr = runtime.get_version()
+    if mjr < 11 or (mjr == 11 and mnr < 1):
+        pytest.skip(reason="Skip testing for CUDA 11.0")
+
     gdf = data
     pdf = data.to_pandas(nullable=True)
 

From aec243d5a602b5dad36200a137971206db5dd633 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 13 Jul 2021 06:29:45 -0700
Subject: [PATCH 123/132] fix pytest

---
 python/cudf/cudf/tests/test_udf_masked_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 3ecdc15a4a8..f73f1526c7f 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -39,7 +39,7 @@ def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs):
     runtime = cuda.cudadrv.runtime.Runtime()
     mjr, mnr = runtime.get_version()
     if mjr < 11 or (mjr == 11 and mnr < 1):
-        pytest.skip(reason="Skip testing for CUDA 11.0")
+        pytest.skip("Skip testing for CUDA 11.0")
 
     gdf = data
     pdf = data.to_pandas(nullable=True)

From 993d84188cf57b665df9c7db633f8ad10ffadf3a Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 13 Jul 2021 17:29:28 -0500
Subject: [PATCH 124/132] Apply suggestions from code review

Co-authored-by: Nghia Truong <ttnghia@users.noreply.github.com>
---
 cpp/src/transform/transform.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index a5169dbd94d..dad2b33d91e 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -117,9 +117,8 @@ void generalized_operation(table_view data_view,
   mask_ptrs.reserve(data_view.num_columns());
   offsets.reserve(data_view.num_columns());
 
-  column_view col;
   for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
-    col = data_view.column(col_idx);
+    auto const& col = data_view.column(col_idx);
 
     data_ptrs.push_back(cudf::jit::get_data_ptr(col));
     mask_ptrs.push_back(col.null_mask());
@@ -179,8 +178,8 @@ std::unique_ptr<column> generalized_masked_op(table_view data_view,
   transformation::jit::generalized_operation(
     data_view, udf, output_type, *output, *output_mask, mr);
 
-  auto final_output_mask = cudf::bools_to_mask(*output_mask);
-  output.get()->set_null_mask(std::move(*(final_output_mask.first)));
+  auto [final_output_mask, out_something] = cudf::bools_to_mask(*output_mask);
+  output.get()->set_null_mask(std::move(final_output_mask));
   return output;
 }
 

From 512555bea322e1f67487baa920e7afe5075f64cc Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Tue, 13 Jul 2021 15:30:06 -0700
Subject: [PATCH 125/132] partially address reviews

---
 cpp/src/transform/jit/masked_udf_kernel.cu | 16 ++++++------
 cpp/src/transform/transform.cpp            | 29 ++++++++++++----------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index 603802282b9..13afa20996b 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -14,23 +14,21 @@
  * limitations under the License.
  */
 
-// Include Jitify's cstddef header first
-#include <cstddef>
-
-#include <cuda/std/climits>
-#include <cuda/std/cstddef>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
 
 #include <cstdint>
-
+#include <cstddef>
 #include <transform/jit/operation-udf.hpp>
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 
+#include <cuda/std/climits>
+#include <cuda/std/cstddef>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
 #include <cuda/std/tuple>
-#include <tuple>
+
+
 
 namespace cudf {
 namespace transformation {
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index a5169dbd94d..1ae0c931a4f 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -14,12 +14,6 @@
  * limitations under the License.
  */
 
-#include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
-#include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>
-
-#include <jit/cache.hpp>
-#include <jit/parser.hpp>
-#include <jit/type.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
@@ -30,6 +24,14 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
+#include <jit_preprocessed_files/transform/jit/masked_udf_kernel.cu.jit.hpp>
+
+#include <jit/cache.hpp>
+#include <jit/parser.hpp>
+#include <jit/type.hpp>
+
+
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
@@ -89,9 +91,10 @@ void generalized_operation(table_view data_view,
                            data_type output_type,
                            mutable_column_view outcol_view,
                            mutable_column_view outmsk_view,
+                           rmm::cuda_stream_view stream,
                            rmm::mr::device_memory_resource* mr)
 {
-  std::vector<std::string> template_types = make_template_types(outcol_view, data_view);
+  auto const template_types = make_template_types(outcol_view, data_view);
 
   std::string generic_kernel_name =
     jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel")
@@ -130,13 +133,12 @@ void generalized_operation(table_view data_view,
     kernel_args.push_back(&offsets[col_idx]);
   }
 
-  rmm::cuda_stream_view generic_stream;
   cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
     .get_kernel(generic_kernel_name,
                 {},
                 {{"transform/jit/operation-udf.hpp", generic_cuda_source}},
-                {"-arch=sm_."})                                    //
-    ->configure_1d_max_occupancy(0, 0, 0, generic_stream.value())  //
+                {"-arch=sm_."})
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
     ->launch(kernel_args.data());
 }
 
@@ -169,15 +171,16 @@ std::unique_ptr<column> transform(column_view const& input,
 std::unique_ptr<column> generalized_masked_op(table_view data_view,
                                               std::string const& udf,
                                               data_type output_type,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
-  rmm::cuda_stream_view stream   = rmm::cuda_stream_default;
+
   std::unique_ptr<column> output = make_fixed_width_column(output_type, data_view.num_rows());
   std::unique_ptr<column> output_mask =
     make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows());
 
   transformation::jit::generalized_operation(
-    data_view, udf, output_type, *output, *output_mask, mr);
+    data_view, udf, output_type, *output, *output_mask, stream, mr);
 
   auto final_output_mask = cudf::bools_to_mask(*output_mask);
   output.get()->set_null_mask(std::move(*(final_output_mask.first)));
@@ -201,7 +204,7 @@ std::unique_ptr<column> generalized_masked_op(table_view data_view,
                                               data_type output_type,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return detail::generalized_masked_op(data_view, udf, output_type, mr);
+  return detail::generalized_masked_op(data_view, udf, output_type, rmm::cuda_stream_default, mr);
 }
 
 }  // namespace cudf

From 8f1add4736a0001ea71dae6f9febb7475f105c4f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 13 Jul 2021 17:31:12 -0500
Subject: [PATCH 126/132] Apply suggestions from code review

Co-authored-by: GALI PREM SAGAR <sagarprem75@gmail.com>
---
 python/cudf/cudf/core/dataframe.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b7dc905fd2a..780466458cc 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4719,7 +4719,7 @@ def apply(self, func, axis=1):
             * 1 or 'columns': apply function to each row.
 
         Examples
-        ----------
+        --------
 
         Simple function of a single variable which could be NA
 
@@ -4840,10 +4840,9 @@ def apply(self, func, axis=1):
         dtype: float64
 
         Notes
-        ----------
+        -----
         Available only using cuda 11.1+ due to particular required
         runtime compilation features
-
         """
 
         # libcudacxx tuples are not compatible with nvrtc 11.0

From b061710b05692f31c6cb7ba717b64fa399c4174b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 14 Jul 2021 11:57:13 -0700
Subject: [PATCH 127/132] updates

---
 cpp/src/transform/transform.cpp | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index b177a6ee660..771be891d61 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -75,7 +75,7 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
     cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::offset_type>()));
 
   std::vector<std::string> template_types;
-  template_types.reserve(data_view.num_columns() + 1);
+  template_types.reserve((3 * data_view.num_columns()) + 1);
 
   template_types.push_back(cudf::jit::get_type_name(outcol_view.type()));
   for (auto const& col : data_view) {
@@ -86,6 +86,14 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
   return template_types;
 }
 
+class Unpacker {
+  public:
+    thrust::tuple<const void*, cudf::bitmask_type const*, cudf::offset_type> operator() (column_view input) {
+      return thrust::make_tuple(cudf::jit::get_data_ptr(input), input.null_mask(), input.offset());
+    }
+};
+
+
 void generalized_operation(table_view data_view,
                            std::string const& udf,
                            data_type output_type,
@@ -103,7 +111,6 @@ void generalized_operation(table_view data_view,
   std::string generic_cuda_source = cudf::jit::parse_single_function_ptx(
     udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0});
 
-  // {size, out_ptr, out_mask_ptr, col0_ptr, col0_mask_ptr, col0_offset, col1_ptr...}
   std::vector<void*> kernel_args;
   kernel_args.reserve((data_view.num_columns() * 3) + 3);
 
@@ -120,13 +127,16 @@ void generalized_operation(table_view data_view,
   mask_ptrs.reserve(data_view.num_columns());
   offsets.reserve(data_view.num_columns());
 
-  for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
-    auto const& col = data_view.column(col_idx);
+  auto zipit_start = thrust::make_zip_iterator(
+    thrust::make_tuple(data_ptrs.begin(), 
+    mask_ptrs.begin(),
+    offsets.begin())
+  );
 
-    data_ptrs.push_back(cudf::jit::get_data_ptr(col));
-    mask_ptrs.push_back(col.null_mask());
-    offsets.push_back(col.offset());
+  Unpacker unpacker;
+  thrust::transform(data_view.begin(), data_view.end(), zipit_start, unpacker);
 
+  for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
     kernel_args.push_back(&data_ptrs[col_idx]);
     kernel_args.push_back(&mask_ptrs[col_idx]);
     kernel_args.push_back(&offsets[col_idx]);
@@ -167,7 +177,7 @@ std::unique_ptr<column> transform(column_view const& input,
   return output;
 }
 
-std::unique_ptr<column> generalized_masked_op(table_view data_view,
+std::unique_ptr<column> generalized_masked_op(table_view const& data_view,
                                               std::string const& udf,
                                               data_type output_type,
                                               rmm::cuda_stream_view stream,
@@ -181,8 +191,8 @@ std::unique_ptr<column> generalized_masked_op(table_view data_view,
   transformation::jit::generalized_operation(
     data_view, udf, output_type, *output, *output_mask, stream, mr);
 
-  auto [final_output_mask, out_something] = cudf::bools_to_mask(*output_mask);
-  output.get()->set_null_mask(std::move(final_output_mask));
+  auto final_output_mask = cudf::bools_to_mask(*output_mask);
+  output.get()->set_null_mask(std::move(*(final_output_mask.first)));
   return output;
 }
 

From 7c722dda1922e4379a7066ca410bf89ec8dac2c0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Wed, 14 Jul 2021 12:36:07 -0700
Subject: [PATCH 128/132] style

---
 cpp/src/transform/jit/masked_udf_kernel.cu |  7 ++-----
 cpp/src/transform/transform.cpp            | 19 +++++++------------
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu
index 13afa20996b..319ad730c53 100644
--- a/cpp/src/transform/jit/masked_udf_kernel.cu
+++ b/cpp/src/transform/jit/masked_udf_kernel.cu
@@ -14,9 +14,8 @@
  * limitations under the License.
  */
 
-
-#include <cstdint>
 #include <cstddef>
+#include <cstdint>
 #include <transform/jit/operation-udf.hpp>
 
 #include <cudf/types.hpp>
@@ -25,10 +24,8 @@
 #include <cuda/std/climits>
 #include <cuda/std/cstddef>
 #include <cuda/std/limits>
-#include <cuda/std/type_traits>
 #include <cuda/std/tuple>
-
-
+#include <cuda/std/type_traits>
 
 namespace cudf {
 namespace transformation {
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 771be891d61..f67ad4bf8a6 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -31,7 +30,6 @@
 #include <jit/parser.hpp>
 #include <jit/type.hpp>
 
-
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf {
@@ -87,13 +85,14 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
 }
 
 class Unpacker {
-  public:
-    thrust::tuple<const void*, cudf::bitmask_type const*, cudf::offset_type> operator() (column_view input) {
-      return thrust::make_tuple(cudf::jit::get_data_ptr(input), input.null_mask(), input.offset());
-    }
+ public:
+  thrust::tuple<const void*, cudf::bitmask_type const*, cudf::offset_type> operator()(
+    column_view input)
+  {
+    return thrust::make_tuple(cudf::jit::get_data_ptr(input), input.null_mask(), input.offset());
+  }
 };
 
-
 void generalized_operation(table_view data_view,
                            std::string const& udf,
                            data_type output_type,
@@ -128,10 +127,7 @@ void generalized_operation(table_view data_view,
   offsets.reserve(data_view.num_columns());
 
   auto zipit_start = thrust::make_zip_iterator(
-    thrust::make_tuple(data_ptrs.begin(), 
-    mask_ptrs.begin(),
-    offsets.begin())
-  );
+    thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin()));
 
   Unpacker unpacker;
   thrust::transform(data_view.begin(), data_view.end(), zipit_start, unpacker);
@@ -183,7 +179,6 @@ std::unique_ptr<column> generalized_masked_op(table_view const& data_view,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
-
   std::unique_ptr<column> output = make_fixed_width_column(output_type, data_view.num_rows());
   std::unique_ptr<column> output_mask =
     make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows());

From 7a7ee8376d42f1fe29b00af22ca186d92cfb1465 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 15 Jul 2021 13:12:09 -0700
Subject: [PATCH 129/132] use table_view const&

---
 cpp/include/cudf/transform.hpp          | 2 +-
 cpp/src/transform/transform.cpp         | 6 +++---
 python/cudf/cudf/_lib/cpp/transform.pxd | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 50efea4e2e5..f5880e9b37f 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -54,7 +54,7 @@ std::unique_ptr<column> transform(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<column> generalized_masked_op(
-  table_view data_view,
+  table_view const& data_view,
   std::string const& binary_udf,
   data_type output_type,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index f67ad4bf8a6..98364217491 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -65,7 +65,7 @@ void unary_operation(mutable_column_view output,
              cudf::jit::get_data_ptr(input));
 }
 
-std::vector<std::string> make_template_types(column_view outcol_view, table_view data_view)
+std::vector<std::string> make_template_types(column_view outcol_view, table_view const& data_view)
 {
   std::string mskptr_type =
     cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id<cudf::bitmask_type>())) + "*";
@@ -93,7 +93,7 @@ class Unpacker {
   }
 };
 
-void generalized_operation(table_view data_view,
+void generalized_operation(table_view const& data_view,
                            std::string const& udf,
                            data_type output_type,
                            mutable_column_view outcol_view,
@@ -203,7 +203,7 @@ std::unique_ptr<column> transform(column_view const& input,
   return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr);
 }
 
-std::unique_ptr<column> generalized_masked_op(table_view data_view,
+std::unique_ptr<column> generalized_masked_op(table_view const& data_view,
                                               std::string const& udf,
                                               data_type output_type,
                                               rmm::mr::device_memory_resource* mr)
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd
index c8e84a3a2e9..9cb5bc10162 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/cpp/transform.pxd
@@ -39,7 +39,7 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
     ) except +
 
     cdef unique_ptr[column] generalized_masked_op(
-        table_view data_view,
+        const table_view& data_view,
         string udf,
         data_type output_type,
     ) except +

From a20d630ff6737e3148f6cc196c401fe87e3424cf Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 15 Jul 2021 14:09:20 -0700
Subject: [PATCH 130/132] switch to a lambda

---
 cpp/src/transform/transform.cpp | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 98364217491..a2269a16d2f 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -84,15 +84,6 @@ std::vector<std::string> make_template_types(column_view outcol_view, table_view
   return template_types;
 }
 
-class Unpacker {
- public:
-  thrust::tuple<const void*, cudf::bitmask_type const*, cudf::offset_type> operator()(
-    column_view input)
-  {
-    return thrust::make_tuple(cudf::jit::get_data_ptr(input), input.null_mask(), input.offset());
-  }
-};
-
 void generalized_operation(table_view const& data_view,
                            std::string const& udf,
                            data_type output_type,
@@ -129,14 +120,14 @@ void generalized_operation(table_view const& data_view,
   auto zipit_start = thrust::make_zip_iterator(
     thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin()));
 
-  Unpacker unpacker;
-  thrust::transform(data_view.begin(), data_view.end(), zipit_start, unpacker);
-
-  for (int col_idx = 0; col_idx < data_view.num_columns(); col_idx++) {
+  int col_idx = 0;
+  std::transform(data_view.begin(), data_view.end(), zipit_start, [&](column_view col) {
     kernel_args.push_back(&data_ptrs[col_idx]);
     kernel_args.push_back(&mask_ptrs[col_idx]);
     kernel_args.push_back(&offsets[col_idx]);
-  }
+    col_idx++;
+    return thrust::make_tuple(cudf::jit::get_data_ptr(col), col.null_mask(), col.offset());
+  });
 
   cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit)
     .get_kernel(generic_kernel_name,

From a13e935abe13763887365b0d0dc755713643c32b Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 15 Jul 2021 21:52:44 -0500
Subject: [PATCH 131/132] Update cpp/src/transform/transform.cpp

Co-authored-by: Nghia Truong <ttnghia@users.noreply.github.com>
---
 cpp/src/transform/transform.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index a2269a16d2f..4a3aa5da3c3 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -117,15 +117,16 @@ void generalized_operation(table_view const& data_view,
   mask_ptrs.reserve(data_view.num_columns());
   offsets.reserve(data_view.num_columns());
 
-  auto zipit_start = thrust::make_zip_iterator(
+  auto const iters = thrust::make_zip_iterator(
     thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin()));
 
-  int col_idx = 0;
-  std::transform(data_view.begin(), data_view.end(), zipit_start, [&](column_view col) {
-    kernel_args.push_back(&data_ptrs[col_idx]);
-    kernel_args.push_back(&mask_ptrs[col_idx]);
-    kernel_args.push_back(&offsets[col_idx]);
-    col_idx++;
+  std::for_each(iters, iters + data_view.num_columns(), [](auto const& tuple_vals) {
+    kernel_args.push_back(thrust::get<0>(tuple_vals));
+    kernel_args.push_back(thrust::get<1>(tuple_vals));
+    kernel_args.push_back(thrust::get<2>(tuple_vals));
+  });
+
+  std::transform(data_view.begin(), data_view.end(), iters, [&](column_view const& col) {
     return thrust::make_tuple(cudf::jit::get_data_ptr(col), col.null_mask(), col.offset());
   });
 

From 9acc7a9f9292140d06d5a539ac27292f881dd1d9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <brmiller@nvidia.com>
Date: Thu, 15 Jul 2021 19:58:41 -0700
Subject: [PATCH 132/132] capture by reference and push tuple element addresses into kernel_args

---
 cpp/src/transform/transform.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 4a3aa5da3c3..5230b853a79 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -120,10 +120,10 @@ void generalized_operation(table_view const& data_view,
   auto const iters = thrust::make_zip_iterator(
     thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin()));
 
-  std::for_each(iters, iters + data_view.num_columns(), [](auto const& tuple_vals) {
-    kernel_args.push_back(thrust::get<0>(tuple_vals));
-    kernel_args.push_back(thrust::get<1>(tuple_vals));
-    kernel_args.push_back(thrust::get<2>(tuple_vals));
+  std::for_each(iters, iters + data_view.num_columns(), [&](auto const& tuple_vals) {
+    kernel_args.push_back(&thrust::get<0>(tuple_vals));
+    kernel_args.push_back(&thrust::get<1>(tuple_vals));
+    kernel_args.push_back(&thrust::get<2>(tuple_vals));
   });
 
   std::transform(data_view.begin(), data_view.end(), iters, [&](column_view const& col) {