diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 03d32959c1a..b0c8c255aef 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -137,8 +137,8 @@ include(cmake/thirdparty/CUDF_GetDLPack.cmake)
 include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake)
 # find or install GoogleTest
 include(cmake/thirdparty/CUDF_GetGTest.cmake)
-# Stringify libcudf and libcudacxx headers used in JIT operations
-include(cmake/Modules/StringifyJITHeaders.cmake)
+# preprocess jitify-able kernels
+include(cmake/Modules/JitifyPreprocessKernels.cmake)
 # find cuFile
 include(cmake/Modules/FindcuFile.cmake)
 
@@ -153,9 +153,6 @@ add_library(cudf
     src/ast/transform.cu
     src/binaryop/binaryop.cpp
     src/binaryop/compiled/binary_ops.cu
-    src/binaryop/jit/code/kernel.cpp
-    src/binaryop/jit/code/operation.cpp
-    src/binaryop/jit/code/traits.cpp
     src/labeling/label_bins.cu
     src/bitmask/null_mask.cu
     src/column/column.cu
@@ -256,7 +253,6 @@ add_library(cudf
     src/io/utilities/parsing_utils.cu
     src/io/utilities/type_conversion.cpp
     src/jit/cache.cpp
-    src/jit/launcher.cpp
     src/jit/parser.cpp
     src/jit/type.cpp
     src/join/cross_join.cu
@@ -302,8 +298,6 @@ add_library(cudf
     src/reshape/interleave_columns.cu
     src/reshape/tile.cu
     src/rolling/grouped_rolling.cu
-    src/rolling/jit/code/kernel.cpp
-    src/rolling/jit/code/operation.cpp
    src/rolling/rolling.cu
    src/round/round.cu
    src/scalar/scalar.cpp
@@ -387,7 +381,6 @@ add_library(cudf
    src/text/tokenize.cu
    src/transform/bools_to_mask.cu
    src/transform/encode.cu
-    src/transform/jit/code/kernel.cpp
    src/transform/mask_to_bools.cu
    src/transform/nans_to_nulls.cu
    src/transform/row_bit_count.cu
@@ -465,7 +458,7 @@ endif()
 target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
 
-# Compile stringified JIT sources first
-add_dependencies(cudf stringify_run)
+# Run jitify preprocessing before compiling cudf
+add_dependencies(cudf jitify_preprocess_run)
 
 # Specify the target module library dependencies
 target_link_libraries(cudf
@@ -476,9 +469,9 @@ target_link_libraries(cudf
                       rmm::rmm)
 
 if(CUDA_STATIC_RUNTIME)
-  target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart_static CUDA::cuda_driver)
+  target_link_libraries(cudf PUBLIC CUDA::cudart_static CUDA::cuda_driver)
 else()
-  target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
+  target_link_libraries(cudf PUBLIC CUDA::cudart CUDA::cuda_driver)
 endif()
 
 # Add cuFile interface if available
diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
new file mode 100644
index 00000000000..7bb5b1d0a14
--- /dev/null
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -0,0 +1,70 @@
+#=============================================================================
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#============================================================================= + +cmake_minimum_required(VERSION 3.18) + +file(MAKE_DIRECTORY "${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files") + +# Create `jitify_preprocess` executable +project(jitify_preprocess VERSION 2.0 LANGUAGES CXX CUDA) +add_executable(jitify_preprocess "${JITIFY_INCLUDE_DIR}/jitify2_preprocess.cpp") + +target_link_libraries(jitify_preprocess CUDA::cudart ${CMAKE_DL_LIBS}) + +function(jit_preprocess_files) + cmake_parse_arguments(ARG + "" + "SOURCE_DIRECTORY" + "FILES" + ${ARGN} + ) + + foreach(ARG_FILE ${ARG_FILES}) + set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit) + list(APPEND JIT_PREPROCESSED_FILES "${ARG_OUTPUT}") + add_custom_command(WORKING_DIRECTORY ${ARG_SOURCE_DIRECTORY} + DEPENDS jitify_preprocess + OUTPUT ${ARG_OUTPUT} + VERBATIM + COMMAND jitify_preprocess ${ARG_FILE} + -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files + -v + -i + -m + -std=c++14 + -remove-unused-globals + -D__CUDACC_RTC__ + -I${CUDF_SOURCE_DIR}/include + -I${CUDF_SOURCE_DIR}/src + -I${LIBCUDACXX_INCLUDE_DIR} + -I${CUDAToolkit_INCLUDE_DIRS} + --no-preinclude-workarounds + --no-replace-pragma-once + ) + endforeach() + set(JIT_PREPROCESSED_FILES "${JIT_PREPROCESSED_FILES}" PARENT_SCOPE) +endfunction() + +jit_preprocess_files(SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src + FILES binaryop/jit/kernel.cu + transform/jit/kernel.cu + rolling/jit/kernel.cu + ) + +add_custom_target(jitify_preprocess_run DEPENDS ${JIT_PREPROCESSED_FILES}) + +file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") +file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") diff --git a/cpp/cmake/Modules/StringifyJITHeaders.cmake b/cpp/cmake/Modules/StringifyJITHeaders.cmake deleted file mode 100644 index d67c546cd30..00000000000 --- a/cpp/cmake/Modules/StringifyJITHeaders.cmake +++ /dev/null @@ -1,172 +0,0 @@ -#============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -file(MAKE_DIRECTORY "${CUDF_GENERATED_INCLUDE_DIR}/include") - -# Create `stringify` executable -add_executable(stringify "${JITIFY_INCLUDE_DIR}/stringify.cpp") - -execute_process(WORKING_DIRECTORY ${CUDF_GENERATED_INCLUDE_DIR} - COMMAND ${CMAKE_COMMAND} -E make_directory - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include - ) - -# Use `stringify` to convert types.h to c-str for use in JIT code -add_custom_command(WORKING_DIRECTORY ${CUDF_SOURCE_DIR}/include - COMMENT "Stringify headers for use in JIT compiled code" - DEPENDS stringify - OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.h.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/bit.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/timestamps.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/fixed_point.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/durations.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/assert.cuh.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/limits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/version.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__undef_macros.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/limits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/version.jit - MAIN_DEPENDENCY ${CUDF_SOURCE_DIR}/include/cudf/types.h - ${CUDF_SOURCE_DIR}/include/cudf/types.hpp - ${CUDF_SOURCE_DIR}/include/cudf/utilities/bit.hpp - ${CUDF_SOURCE_DIR}/include/cudf/wrappers/timestamps.hpp - 
${CUDF_SOURCE_DIR}/include/cudf/fixed_point/fixed_point.hpp - ${CUDF_SOURCE_DIR}/include/cudf/wrappers/durations.hpp - ${CUDF_SOURCE_DIR}/include/cudf/detail/utilities/assert.cuh - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/chrono - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/climits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstddef - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstdint - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ctime - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/limits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ratio - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/type_traits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/version - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__config - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_pop - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_push - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__config - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_pop - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_push - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__undef_macros - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/chrono - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/climits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstddef - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstdint - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ctime - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/limits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ratio - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/type_traits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/version - - # stringified headers are placed underneath the bin include jit directory and end in ".jit" - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/types.h > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.h.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/types.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/utilities/bit.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/bit.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ../src/rolling/rolling_jit_detail.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/rolling_jit_detail.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/wrappers/timestamps.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/timestamps.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/fixed_point/fixed_point.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/fixed_point.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/wrappers/durations.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/durations.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/detail/utilities/assert.cuh > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/assert.cuh.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/chrono cuda_std_chrono > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/chrono.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/climits cuda_std_climits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/climits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstddef cuda_std_cstddef > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstddef.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstdint cuda_std_cstdint > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstdint.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ctime cuda_std_ctime > 
${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ctime.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/limits cuda_std_limits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/limits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ratio cuda_std_ratio > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ratio.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/type_traits cuda_std_type_traits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/type_traits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/version cuda_std_version > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/version.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__config cuda_std_detail___config > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__config.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_pop cuda_std_detail___pragma_pop > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_pop.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_push cuda_std_detail___pragma_push > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_push.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__config cuda_std_detail_libcxx_include___config > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__config.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_pop cuda_std_detail_libcxx_include___pragma_pop > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_pop.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_push cuda_std_detail_libcxx_include___pragma_push > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_push.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__undef_macros cuda_std_detail_libcxx_include___undef_macros > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__undef_macros.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/chrono cuda_std_detail_libcxx_include_chrono > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/chrono.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/climits cuda_std_detail_libcxx_include_climits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/climits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstddef cuda_std_detail_libcxx_include_cstddef > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstddef.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstdint cuda_std_detail_libcxx_include_cstdint > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstdint.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ctime cuda_std_detail_libcxx_include_ctime > 
${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ctime.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/limits cuda_std_detail_libcxx_include_limits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/limits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ratio cuda_std_detail_libcxx_include_ratio > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ratio.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/type_traits cuda_std_detail_libcxx_include_type_traits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/type_traits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/version cuda_std_detail_libcxx_include_version > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/version.jit - ) - -add_custom_target(stringify_run DEPENDS - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.h.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/bit.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/timestamps.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/fixed_point.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/durations.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/assert.cuh.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/limits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/version.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__undef_macros.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/limits.jit - 
${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/version.jit - ) - -################################################################################################### -# - copy libcu++ ---------------------------------------------------------------------------------- - -# `${LIBCUDACXX_INCLUDE_DIR}/` specifies that the contents of this directory will be installed (not the directory itself) -file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") -file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") diff --git a/cpp/cmake/thirdparty/CUDF_GetJitify.cmake b/cpp/cmake/thirdparty/CUDF_GetJitify.cmake index e041be26d64..6e853816ec5 100644 --- a/cpp/cmake/thirdparty/CUDF_GetJitify.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetJitify.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,9 +18,9 @@ function(find_and_configure_jitify) CPMFindPackage(NAME jitify - VERSION 1.0.0 + VERSION 2.0.0 GIT_REPOSITORY https://github.com/rapidsai/jitify.git - GIT_TAG cudf_0.16 + GIT_TAG cudf_0.19 GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE) set(JITIFY_INCLUDE_DIR "${jitify_SOURCE_DIR}" PARENT_SCOPE) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 9557670e967..55a7f7a9f1b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -18,20 +18,13 @@ */ #include "compiled/binary_ops.hpp" -#include "jit/code/code.h" #include "jit/util.hpp" -#include -#include -#include +#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include #include @@ -57,6 +50,7 @@ namespace cudf { namespace binops { namespace detail { + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -79,70 +73,47 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, namespace jit { -const std::string hash = "prog_binop"; - -const std::vector header_names{"operation.h", - "traits.h", - cudf_types_hpp, - cudf_utilities_bit_hpp, - cudf_wrappers_timestamps_hpp, - cudf_wrappers_durations_hpp, - cudf_fixed_point_fixed_point_hpp, - cudf_detail_utilities_assert_cuh}; - -std::istream* headers_code(std::string filename, std::iostream& stream) -{ - if (filename == "operation.h") { - stream << code::operation; - return &stream; - } - if (filename == "traits.h") { - stream << code::traits; - return &stream; - } - auto it = cudf::jit::stringified_headers.find(filename); - if (it != cudf::jit::stringified_headers.end()) { - return cudf::jit::send_stringified_header(stream, it->second); - } - return nullptr; -} - void binary_operation(mutable_column_view& out, - scalar const& lhs, - column_view const& rhs, + column_view const& lhs, + scalar const& rhs, binary_operator op, + OperatorType op_type, rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s_with_validity", // name of the 
kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(rhs.type()), - cudf::jit::get_type_name(lhs.type()), - get_operator_name(op, OperatorType::Reverse)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(rhs), - cudf::jit::get_data_ptr(lhs), - out.null_mask(), - rhs.null_mask(), - rhs.offset(), - lhs.is_valid()); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, op_type)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs), + out.null_mask(), + lhs.null_mask(), + lhs.offset(), + rhs.is_valid()); } else { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(rhs.type()), - cudf::jit::get_type_name(lhs.type()), - get_operator_name(op, OperatorType::Reverse)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(rhs), - cudf::jit::get_data_ptr(lhs)); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_s") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, op_type)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs)); } } @@ -152,37 +123,16 @@ void binary_operation(mutable_column_view& out, binary_operator op, rmm::cuda_stream_view stream) { - if (is_null_dependent(op)) { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s_with_validity", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs), - out.null_mask(), - lhs.null_mask(), - lhs.offset(), - rhs.is_valid()); - } else { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); - } + return binary_operation(out, lhs, rhs, op, OperatorType::Direct, stream); +} + +void binary_operation(mutable_column_view& out, + scalar 
const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + return binary_operation(out, rhs, lhs, op, OperatorType::Reverse, stream); } void binary_operation(mutable_column_view& out, @@ -192,36 +142,40 @@ void binary_operation(mutable_column_view& out, rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_v_with_validity", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs), - out.null_mask(), - lhs.null_mask(), - rhs.offset(), - rhs.null_mask(), - rhs.offset()); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, OperatorType::Direct)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs), + out.null_mask(), + lhs.null_mask(), + rhs.offset(), + rhs.null_mask(), + rhs.offset()); } else { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_v", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, OperatorType::Direct)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs)); } } @@ -234,23 +188,24 @@ void binary_operation(mutable_column_view& out, std::string const output_type_name = cudf::jit::get_type_name(out.type()); std::string ptx_hash = - hash + "." + std::to_string(std::hash{}(ptx + output_type_name)); + "prog_binop." 
+ std::to_string(std::hash{}(ptx + output_type_name)); std::string cuda_source = - "\n#include \n" + - cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name) + code::kernel; - - cudf::jit::launcher( - ptx_hash, cuda_source, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_v", // name of the kernel - // we are launching - {output_type_name, // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); + cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name); + + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // + .instantiate(output_type_name, // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs)); } } // namespace jit diff --git a/cpp/src/binaryop/jit/code/kernel.cpp b/cpp/src/binaryop/jit/code/kernel.cpp deleted file mode 100644 index cfa1f1f82d2..00000000000 --- a/cpp/src/binaryop/jit/code/kernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * Copyright 2018 Rommel Quintanilla - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace cudf { -namespace binops { -namespace jit { -namespace code { - -// clang-format off -const char* kernel = - R"***( - #include "operation.h" - - #include - #include - #include - #include - #include - - template - __global__ - void kernel_v_s_with_validity(cudf::size_type size, TypeOut* out_data, TypeLhs* lhs_data, - TypeRhs* rhs_data, cudf::bitmask_type* output_mask, - cudf::bitmask_type const* mask, - cudf::size_type offset, bool scalar_valid) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i( - lhs_data[i], rhs_data[0], - mask ? 
cudf::bit_is_set(mask, offset + i) : true, scalar_valid, output_valid); - if (output_mask && !output_valid) cudf::clear_bit(output_mask, i); - } - } - - template - __global__ - void kernel_v_s(cudf::size_type size, - TypeOut* out_data, TypeLhs* lhs_data, TypeRhs* rhs_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i(lhs_data[i], rhs_data[0]); - } - } - - template - __global__ - void kernel_v_v(cudf::size_type size, - TypeOut* out_data, TypeLhs* lhs_data, TypeRhs* rhs_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i(lhs_data[i], rhs_data[i]); - } - } - - template - __global__ - void kernel_v_v_with_validity(cudf::size_type size, TypeOut* out_data, TypeLhs* lhs_data, - TypeRhs* rhs_data, cudf::bitmask_type* output_mask, - cudf::bitmask_type const* lhs_mask, cudf::size_type lhs_offset, - cudf::bitmask_type const* rhs_mask, cudf::size_type rhs_offset) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i( - lhs_data[i], rhs_data[i], - lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true, - rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true, - output_valid); - if (output_mask && !output_valid) cudf::clear_bit(output_mask, i); - } - } -)***"; -// clang-format on - -} // namespace code -} // namespace jit -} // namespace binops -} // namespace cudf diff --git a/cpp/src/binaryop/jit/code/operation.cpp b/cpp/src/binaryop/jit/code/operation.cpp deleted file mode 100644 index 938ab0614d4..00000000000 --- a/cpp/src/binaryop/jit/code/operation.cpp +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace cudf { -namespace binops { -namespace jit { -namespace code { - -const char* operation = - R"***( - #pragma once - - #include "traits.h" - - #include - - #include - - using namespace cuda::std; - - struct Add { - // Allow sum between chronos only when both input and output types - // are chronos. Unsupported combinations will fail to compile - template && - is_chrono_v && - is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return x + y; - } - - template || - !is_chrono_v || - !is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) + static_cast(y)); - } - }; - - using RAdd = Add; - - struct Sub { - // Allow difference between chronos only when both input and output types - // are chronos. 
Unsupported combinations will fail to compile - template && - is_chrono_v && - is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return x - y; - } - - template || - !is_chrono_v || - !is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) - static_cast(y)); - } - }; - - struct RSub { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return Sub::operate(y, x); - } - }; - - struct Mul { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) * static_cast(y)); - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return DurationProduct(x, y); - } - - template && is_integral_v) || - (is_integral_v && is_duration_v)>* = nullptr> - static TypeOut DurationProduct(TypeLhs x, TypeRhs y) { - return x * y; - } - }; - - using RMul = Mul; - - struct Div { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) / static_cast(y)); - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return DurationDivide(x, y); - } - - template || is_duration_v)>* = nullptr> - static TypeOut DurationDivide(TypeLhs x, TypeRhs y) { - return x / y; - } - }; - - struct RDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return Div::operate(y, x); - } - }; - - struct TrueDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) / static_cast(y)); - } - }; - - struct RTrueDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return TrueDiv::operate(y, x); - } - }; - - struct FloorDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return floor(static_cast(x) / static_cast(y)); - } - }; - - struct RFloorDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return FloorDiv::operate(y, x); - } - }; - - struct Mod { - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) % static_cast(y)); - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return static_cast(fmodf(static_cast(x), static_cast(y))); - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return static_cast(fmod(static_cast(x), static_cast(y))); - } - - template && is_duration_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return x % y; - } - }; - - struct RMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return Mod::operate(y, x); - } - }; - - struct PyMod { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return ((x % y) + y) % y; - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - double x1 = static_cast(x); - double y1 = static_cast(y); - return fmod(fmod(x1, y1) + y1, y1); - } - - template && is_duration_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return ((x % y) + y) % y; - } - }; - - struct RPyMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return PyMod::operate(y, x); - } - }; - - struct Pow { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return pow(static_cast(x), static_cast(y)); - } - }; - - struct RPow { - template - static 
TypeOut operate(TypeLhs x, TypeRhs y) { - return Pow::operate(y, x); - } - }; - - struct Equal { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x == y); - } - }; - - using REqual = Equal; - - struct NotEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x != y); - } - }; - - using RNotEqual = NotEqual; - - struct Less { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x < y); - } - }; - - struct RLess { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y < x); - } - }; - - struct Greater { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x > y); - } - }; - - struct RGreater { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y > x); - } - }; - - struct LessEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x <= y); - } - }; - - struct RLessEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y <= x); - } - }; - - struct GreaterEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x >= y); - } - }; - - struct RGreaterEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y >= x); - } - }; - - struct BitwiseAnd { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) & static_cast(y)); - } - }; - - using RBitwiseAnd = BitwiseAnd; - - struct BitwiseOr { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) | static_cast(y)); - } - }; - - using RBitwiseOr = BitwiseOr; - - struct BitwiseXor { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) ^ static_cast(y)); - } - }; - - using RBitwiseXor = BitwiseXor; - - struct LogicalAnd { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x && y); - } - }; - - using RLogicalAnd = LogicalAnd; - - struct LogicalOr { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x || y); - } - }; - - using RLogicalOr = LogicalOr; - - struct UserDefinedOp { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - TypeOut output; - using TypeCommon = typename common_type::type; - GENERIC_BINARY_OP(&output, static_cast(x), static_cast(y)); - return output; - } - }; - - struct ShiftLeft { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x << y); - } - }; - - struct RShiftLeft { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y << x); - } - }; - - struct ShiftRight { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x >> y); - } - }; - - struct RShiftRight { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y >> x); - } - }; - - struct ShiftRightUnsigned { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast>(x) >> y); - } - }; - - struct RShiftRightUnsigned { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast>(y) >> x); - } - }; - - struct LogBase { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (std::log(static_cast(x)) / std::log(static_cast(y))); - } - }; - - struct RLogBase { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return LogBase::operate(y, x); - } - }; - - struct NullEquals { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - if (!lhs_valid && !rhs_valid) return true; - if (lhs_valid && rhs_valid) return x == y; - return false; - } - }; 
- - struct RNullEquals { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - return NullEquals::operate(y, x, rhs_valid, lhs_valid, - output_valid); - } - }; - - struct NullMax { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - if (!lhs_valid && !rhs_valid) { - output_valid = false; - return TypeOut{}; - } else if (lhs_valid && rhs_valid) { - return (TypeOut{x} > TypeOut{y}) ? TypeOut{x} : TypeOut{y}; - } else if (lhs_valid) return TypeOut{x}; - else return TypeOut{y}; - } - }; - - struct RNullMax { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - return NullMax::operate(y, x, rhs_valid, lhs_valid, - output_valid); - } - }; - - struct NullMin { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - if (!lhs_valid && !rhs_valid) { - output_valid = false; - return TypeOut{}; - } else if (lhs_valid && rhs_valid) { - return (TypeOut{x} < TypeOut{y}) ? TypeOut{x} : TypeOut{y}; - } else if (lhs_valid) return TypeOut{x}; - else return TypeOut{y}; - } - }; - - struct RNullMin { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - return NullMin::operate(y, x, rhs_valid, lhs_valid, - output_valid); - } - }; - - struct PMod { - // Ideally, these two specializations - one for integral types and one for non integral - // types shouldn't be required, as std::fmod should promote integral types automatically - // to double and call the std::fmod overload for doubles. Sadly, doing this in jitified - // code does not work - it is having trouble deciding between float/double overloads - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using common_t = typename cuda::std::common_type::type; - common_t xconv{x}; - common_t yconv{y}; - auto rem = xconv % yconv; - if (rem < 0) rem = (rem + yconv) % yconv; - return TypeOut{rem}; - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using common_t = typename cuda::std::common_type::type; - common_t xconv{x}; - common_t yconv{y}; - auto rem = std::fmod(xconv, yconv); - if (rem < 0) rem = std::fmod(rem + yconv, yconv); - return TypeOut{rem}; - } - }; - - struct RPMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return PMod::operate(y, x); - } - }; - - struct ATan2 { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return TypeOut{std::atan2(double{x}, double{y})}; - } - }; - - struct RATan2 { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return TypeOut{ATan2::operate(y, x)}; - } - }; -)***"; - -} // namespace code -} // namespace jit -} // namespace binops -} // namespace cudf diff --git a/cpp/src/binaryop/jit/code/traits.cpp b/cpp/src/binaryop/jit/code/traits.cpp deleted file mode 100644 index 53b980b1a02..00000000000 --- a/cpp/src/binaryop/jit/code/traits.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cudf {
-namespace binops {
-namespace jit {
-namespace code {
-const char* traits =
-  R"***(
-    #pragma once
-
-    // Include Jitify's cstddef header first
-    #include
-
-    #include
-    #include
-    #include
-    #include
-
-    #include
-    #include
-
-    // -------------------------------------------------------------------------
-    // type_traits cannot tell the difference between float and double
-    template <typename T>
-    constexpr bool isFloat = false;
-
-    template <typename T>
-    constexpr bool is_timestamp_v =
-      cuda::std::is_same<cudf::timestamp_D, T>::value ||
-      cuda::std::is_same<cudf::timestamp_s, T>::value ||
-      cuda::std::is_same<cudf::timestamp_ms, T>::value ||
-      cuda::std::is_same<cudf::timestamp_us, T>::value ||
-      cuda::std::is_same<cudf::timestamp_ns, T>::value;
-
-    template <typename T>
-    constexpr bool is_duration_v =
-      cuda::std::is_same<cudf::duration_D, T>::value ||
-      cuda::std::is_same<cudf::duration_s, T>::value ||
-      cuda::std::is_same<cudf::duration_ms, T>::value ||
-      cuda::std::is_same<cudf::duration_us, T>::value ||
-      cuda::std::is_same<cudf::duration_ns, T>::value;
-
-    template <typename T>
-    constexpr bool is_chrono_v = is_timestamp_v<T> || is_duration_v<T>;
-
-    template <>
-    constexpr bool isFloat<float> = true;
-
-    template <typename T>
-    constexpr bool isDouble = false;
-
-    template <>
-    constexpr bool isDouble<double> = true;
-)***";
-
-} // namespace code
-} // namespace jit
-} // namespace binops
-} // namespace cudf
diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
new file mode 100644
index 00000000000..fcfe16f979d
--- /dev/null
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ * Copyright 2018 Rommel Quintanilla <rommel@blazingdb.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace cudf {
+namespace binops {
+namespace jit {
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_s_with_validity(cudf::size_type size,
+                                         TypeOut* out_data,
+                                         TypeLhs* lhs_data,
+                                         TypeRhs* rhs_data,
+                                         cudf::bitmask_type* output_mask,
+                                         cudf::bitmask_type const* mask,
+                                         cudf::size_type offset,
+                                         bool scalar_valid)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    bool output_valid = false;
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(
+      lhs_data[i],
+      rhs_data[0],
+      mask ? cudf::bit_is_set(mask, offset + i) : true,
+      scalar_valid,
+      output_valid);
+    if (output_mask && !output_valid) cudf::clear_bit(output_mask, i);
+  }
+}
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_s(cudf::size_type size,
+                           TypeOut* out_data,
+                           TypeLhs* lhs_data,
+                           TypeRhs* rhs_data)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(lhs_data[i], rhs_data[0]);
+  }
+}
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_v(cudf::size_type size,
+                           TypeOut* out_data,
+                           TypeLhs* lhs_data,
+                           TypeRhs* rhs_data)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(lhs_data[i], rhs_data[i]);
+  }
+}
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_v_with_validity(cudf::size_type size,
+                                         TypeOut* out_data,
+                                         TypeLhs* lhs_data,
+                                         TypeRhs* rhs_data,
+                                         cudf::bitmask_type* output_mask,
+                                         cudf::bitmask_type const* lhs_mask,
+                                         cudf::size_type lhs_offset,
+                                         cudf::bitmask_type const* rhs_mask,
+                                         cudf::size_type rhs_offset)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    bool output_valid = false;
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(
+      lhs_data[i],
+      rhs_data[i],
+      lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true,
+      rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true,
+      output_valid);
+    if (output_mask && !output_valid) cudf::clear_bit(output_mask, i);
+  }
+}
+
+} // namespace jit
+} // namespace binops
+} // namespace cudf
diff --git a/cpp/src/binaryop/jit/code/code.h b/cpp/src/binaryop/jit/operation-udf.hpp
similarity index 59%
rename from cpp/src/binaryop/jit/code/code.h
rename to cpp/src/binaryop/jit/operation-udf.hpp
index b8ff9e47c31..eaab2111d98 100644
--- a/cpp/src/binaryop/jit/code/code.h
+++ b/cpp/src/binaryop/jit/operation-udf.hpp
@@ -1,8 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,15 +16,5 @@
 
 #pragma once
 
-namespace cudf {
-namespace binops {
-namespace jit {
-namespace code {
-extern const char* kernel;
-extern const char* traits;
-extern const char* operation;
-
-} // namespace code
-} // namespace jit
-} // namespace binops
-} // namespace cudf
+// This file serves as a placeholder for user-defined functions, so that jitify can override it
+// at runtime.
diff --git a/cpp/src/binaryop/jit/operation.hpp b/cpp/src/binaryop/jit/operation.hpp
new file mode 100644
index 00000000000..d117f2182f9
--- /dev/null
+++ b/cpp/src/binaryop/jit/operation.hpp
@@ -0,0 +1,646 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+#pragma once
+
+using namespace cuda::std;
+
+namespace cudf {
+namespace binops {
+namespace jit {
+
+struct Add {
+  // Allow sum between chronos only when both input and output types
+  // are chronos. Unsupported combinations will fail to compile
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(is_chrono_v<TypeOut> && is_chrono_v<TypeLhs> && is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return x + y;
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_chrono_v<TypeOut> || !is_chrono_v<TypeLhs> ||
+                         !is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) + static_cast<TypeCommon>(y));
+  }
+};
+
+using RAdd = Add;
+
+struct Sub {
+  // Allow difference between chronos only when both input and output types
+  // are chronos. Unsupported combinations will fail to compile
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(is_chrono_v<TypeOut> && is_chrono_v<TypeLhs> && is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return x - y;
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_chrono_v<TypeOut> || !is_chrono_v<TypeLhs> ||
+                         !is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) - static_cast<TypeCommon>(y));
+  }
+};
+
+struct RSub {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Sub::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Mul {
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_duration_v<TypeOut>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) * static_cast<TypeCommon>(y));
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeOut>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return DurationProduct<TypeOut>(x, y);
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs> && is_integral_v<TypeRhs>) ||
+                        (is_integral_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut DurationProduct(TypeLhs x, TypeRhs y)
+  {
+    return x * y;
+  }
+};
+
+using RMul = Mul;
+
+struct Div {
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_duration_v<TypeLhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) / static_cast<TypeCommon>(y));
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return DurationDivide<TypeOut>(x, y);
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_integral_v<TypeRhs> || is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut DurationDivide(TypeLhs x, TypeRhs y)
+  {
+    return x / y;
+  }
+};
+
+struct RDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Div::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct TrueDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<double>(x) / static_cast<double>(y));
+  }
+};
+
+struct RTrueDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return TrueDiv::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct FloorDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return floor(static_cast<double>(x) / static_cast<double>(y));
+  }
+};
+
+struct RFloorDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return FloorDiv::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Mod {
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(is_integral_v<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) % static_cast<TypeCommon>(y));
+  }
+
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(isFloat<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return static_cast<TypeOut>(fmodf(static_cast<float>(x), static_cast<float>(y)));
+  }
+
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(isDouble<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return static_cast<TypeOut>(fmod(static_cast<double>(x), static_cast<double>(y)));
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return x % y;
+  }
+};
+
+struct RMod {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Mod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct PyMod {
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_integral_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return ((x % y) + y) % y;
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_floating_point_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    double x1 = static_cast<double>(x);
+    double y1 = static_cast<double>(y);
+    return fmod(fmod(x1, y1) + y1, y1);
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return ((x % y) + y) % y;
+  }
+};
+
+struct RPyMod {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return PyMod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Pow {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return pow(static_cast<double>(x), static_cast<double>(y));
+  }
+};
+
+struct RPow {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Pow::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Equal {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x == y);
+  }
+};
+
+using REqual = Equal;
+
+struct NotEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x != y);
+  }
+};
+
+using RNotEqual = NotEqual;
+
+struct Less {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x < y);
+  }
+};
+
+struct RLess {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y < x);
+  }
+};
+
+struct Greater {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x > y);
+  }
+};
+
+struct RGreater {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y > x);
+  }
+};
+
+struct LessEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x <= y);
+  }
+};
+
+struct RLessEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y <= x);
+  }
+};
+
+struct GreaterEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x >= y);
+  }
+};
+
+struct RGreaterEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y >= x);
+  }
+};
+
+struct BitwiseAnd {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<TypeOut>(x) & static_cast<TypeOut>(y));
+  }
+};
+
+using RBitwiseAnd = BitwiseAnd;
+
+struct BitwiseOr {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<TypeOut>(x) | static_cast<TypeOut>(y));
+  }
+};
+
+using RBitwiseOr = BitwiseOr;
+
+struct BitwiseXor {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<TypeOut>(x) ^ static_cast<TypeOut>(y));
+  }
+};
+
+using RBitwiseXor = BitwiseXor;
+
+struct LogicalAnd {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x && y);
+  }
+};
+
+using RLogicalAnd = LogicalAnd;
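
> Aside: `Mod` above follows C++'s truncated remainder, while `PyMod` reproduces Python's floored remainder. A minimal standalone host-side sketch (plain C++, not part of this PR) showing the difference:

```cpp
// Standalone sketch contrasting the two remainder conventions.
#include <iostream>

int c_mod(int x, int y) { return x % y; }               // like Mod: truncates toward zero
int py_mod(int x, int y) { return ((x % y) + y) % y; }  // like PyMod: follows the divisor's sign

int main()
{
  std::cout << c_mod(-7, 3) << '\n';   // prints -1
  std::cout << py_mod(-7, 3) << '\n';  // prints 2
}
```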
+
+struct LogicalOr {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x || y);
+  }
+};
+
+using RLogicalOr = LogicalOr;
+
+struct UserDefinedOp {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    TypeOut output;
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    GENERIC_BINARY_OP(&output, static_cast<TypeCommon>(x), static_cast<TypeCommon>(y));
+    return output;
+  }
+};
+
+struct ShiftLeft {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x << y);
+  }
+};
+
+struct RShiftLeft {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y << x);
+  }
+};
+
+struct ShiftRight {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x >> y);
+  }
+};
+
+struct RShiftRight {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y >> x);
+  }
+};
+
+struct ShiftRightUnsigned {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<make_unsigned_t<TypeLhs>>(x) >> y);
+  }
+};
+
+struct RShiftRightUnsigned {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<make_unsigned_t<TypeRhs>>(y) >> x);
+  }
+};
+
+struct LogBase {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (std::log(static_cast<double>(x)) / std::log(static_cast<double>(y)));
+  }
+};
+
+struct RLogBase {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return LogBase::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct NullEquals {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    if (!lhs_valid && !rhs_valid) return true;
+    if (lhs_valid && rhs_valid) return x == y;
+    return false;
+  }
+};
+
+struct RNullEquals {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    return NullEquals::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
+  }
+};
+
+struct NullMax {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    if (!lhs_valid && !rhs_valid) {
+      output_valid = false;
+      return TypeOut{};
+    } else if (lhs_valid && rhs_valid) {
+      return (TypeOut{x} > TypeOut{y}) ? TypeOut{x} : TypeOut{y};
+    } else if (lhs_valid)
+      return TypeOut{x};
+    else
+      return TypeOut{y};
+  }
+};
+
+struct RNullMax {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    return NullMax::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
+  }
+};
+
+struct NullMin {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    if (!lhs_valid && !rhs_valid) {
+      output_valid = false;
+      return TypeOut{};
+    } else if (lhs_valid && rhs_valid) {
+      return (TypeOut{x} < TypeOut{y}) ? TypeOut{x} : TypeOut{y};
+    } else if (lhs_valid)
+      return TypeOut{x};
+    else
+      return TypeOut{y};
+  }
+};
+
+struct RNullMin {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    return NullMin::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
+  }
+};
+
+struct PMod {
+  // Ideally, these two specializations - one for integral types and one for non integral
+  // types shouldn't be required, as std::fmod should promote integral types automatically
+  // to double and call the std::fmod overload for doubles. Sadly, doing this in jitified
+  // code does not work - it is having trouble deciding between float/double overloads
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_integral_v<typename common_type<TypeLhs, TypeRhs>::type>)>* =
+              nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using common_t = typename cuda::std::common_type<TypeLhs, TypeRhs>::type;
+    common_t xconv{x};
+    common_t yconv{y};
+    auto rem = xconv % yconv;
+    if (rem < 0) rem = (rem + yconv) % yconv;
+    return TypeOut{rem};
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_floating_point_v<typename common_type<TypeLhs, TypeRhs>::type>)>* =
+              nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using common_t = typename cuda::std::common_type<TypeLhs, TypeRhs>::type;
+    common_t xconv{x};
+    common_t yconv{y};
+    auto rem = std::fmod(xconv, yconv);
+    if (rem < 0) rem = std::fmod(rem + yconv, yconv);
+    return TypeOut{rem};
+  }
+};
+
+struct RPMod {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return PMod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct ATan2 {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return TypeOut{std::atan2(double{x}, double{y})};
+  }
+};
+
+struct RATan2 {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return TypeOut{ATan2::operate<TypeOut, TypeRhs, TypeLhs>(y, x)};
+  }
+};
+
+} // namespace jit
+} // namespace binops
+} // namespace cudf
diff --git a/cpp/src/binaryop/jit/traits.hpp b/cpp/src/binaryop/jit/traits.hpp
new file mode 100644
index 00000000000..1cca2b6e155
--- /dev/null
+++ b/cpp/src/binaryop/jit/traits.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ * Copyright 2018 Christian Noboa Mardini
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// Include Jitify's cstddef header first
+#include <cstddef>
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace binops {
+namespace jit {
+
+// -------------------------------------------------------------------------
+// type_traits cannot tell the difference between float and double
+template <typename T>
+constexpr bool isFloat = false;
+
+template <typename T>
+constexpr bool is_timestamp_v =
+  cuda::std::is_same<cudf::timestamp_D, T>::value ||
+  cuda::std::is_same<cudf::timestamp_s, T>::value ||
+  cuda::std::is_same<cudf::timestamp_ms, T>::value ||
+  cuda::std::is_same<cudf::timestamp_us, T>::value ||
+  cuda::std::is_same<cudf::timestamp_ns, T>::value;
+
+template <typename T>
+constexpr bool is_duration_v =
+  cuda::std::is_same<cudf::duration_D, T>::value ||
+  cuda::std::is_same<cudf::duration_s, T>::value ||
+  cuda::std::is_same<cudf::duration_ms, T>::value ||
+  cuda::std::is_same<cudf::duration_us, T>::value ||
+  cuda::std::is_same<cudf::duration_ns, T>::value;
+
+template <typename T>
+constexpr bool is_chrono_v = is_timestamp_v<T> || is_duration_v<T>;
+
+template <>
+constexpr bool isFloat<float> = true;
+
+template <typename T>
+constexpr bool isDouble = false;
+
+template <>
+constexpr bool isDouble<double> = true;
+
+} // namespace jit
+} // namespace binops
+} // namespace cudf
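
> Aside: as the comment in traits.hpp says, a single `is_floating_point`-style check cannot route `float` and `double` to different `fmod` overloads under NVRTC, hence the bespoke variable templates. A minimal host-side illustration (not part of this PR) of the specialization pattern they use:

```cpp
// The primary template answers false for every type; one explicit
// specialization flips the answer for exactly the type of interest.
template <typename T>
constexpr bool isDouble = false;

template <>
constexpr bool isDouble<double> = true;

static_assert(isDouble<double>, "the explicit specialization matches double");
static_assert(!isDouble<float>, "everything else falls back to the primary template");
```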
diff --git a/cpp/src/binaryop/jit/util.hpp b/cpp/src/binaryop/jit/util.hpp
index 6b4085bf11b..34c42e28a8b 100644
--- a/cpp/src/binaryop/jit/util.hpp
+++ b/cpp/src/binaryop/jit/util.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -72,11 +72,15 @@ std::string inline get_operator_name(binary_operator op, OperatorType type)
     case binary_operator::NULL_EQUALS: return "NullEquals";
     case binary_operator::NULL_MAX: return "NullMax";
     case binary_operator::NULL_MIN: return "NullMin";
-      default: return "None";
+      default: return "";
     }
     // clang-format on
   }();
-  return type == OperatorType::Direct ? operator_name : 'R' + operator_name;
+
+  if (operator_name == "") { return "None"; }
+
+  return "cudf::binops::jit::" +
+         (type == OperatorType::Direct ? operator_name : 'R' + operator_name);
 }
 
 } // namespace jit
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index 38c95da6dc7..0d1bb5a8312 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -17,8 +17,6 @@
 #include
 #include
 
-#include
-
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index c634aa8d06b..f79c82aa0db 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,20 +14,15 @@
  * limitations under the License.
  */
 
-#include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include
+#include
+#include
 
 namespace cudf {
 namespace jit {
+
 // Get the directory in home to use for storing the cache
 boost::filesystem::path get_user_home_cache_dir()
 {
@@ -62,7 +57,7 @@ boost::filesystem::path get_user_home_cache_dir()
  * are used and if $HOME is not defined, returns an empty path and file
  * caching is not used.
  */
-boost::filesystem::path getCacheDir()
+boost::filesystem::path get_cache_dir()
 {
   // The environment variable always overrides the
   // default/compile-time value of `LIBCUDF_KERNEL_CACHE_PATH`
@@ -98,158 +93,33 @@ boost::filesystem::path getCacheDir()
   return kernel_cache_path;
 }
 
-cudfJitCache::cudfJitCache() {}
-
-cudfJitCache::~cudfJitCache() {}
-
-std::mutex cudfJitCache::_kernel_cache_mutex;
-std::mutex cudfJitCache::_program_cache_mutex;
-
-named_prog<jitify::experimental::Program> cudfJitCache::getProgram(
-  std::string const& prog_name,
-  std::string const& cuda_source,
-  std::vector<std::string> const& given_headers,
-  std::vector<std::string> const& given_options,
-  jitify::experimental::file_callback_type file_callback)
-{
-  // Lock for thread safety
-  std::lock_guard<std::mutex> lock(_program_cache_mutex);
-
-  return getCached(prog_name, program_map, [&]() {
-    CUDF_EXPECTS(not cuda_source.empty(), "Program not found in cache, Needs source string.");
-    return jitify::experimental::Program(cuda_source, given_headers, given_options, file_callback);
-  });
-}
-
-named_prog<jitify::experimental::KernelInstantiation> cudfJitCache::getKernelInstantiation(
-  std::string const& kern_name,
-  named_prog<jitify::experimental::Program> const& named_program,
-  std::vector<std::string> const& arguments)
-{
-  // Lock for thread safety
-  std::lock_guard<std::mutex> lock(_kernel_cache_mutex);
-
-  std::string prog_name                  = std::get<0>(named_program);
-  jitify::experimental::Program& program = *std::get<1>(named_program);
-
-  // Make instance name e.g. "prog_binop.kernel_v_v_int_int_long int_Add"
-  std::string kern_inst_name = prog_name + '.' + kern_name;
-  for (auto&& arg : arguments) kern_inst_name += '_' + arg;
-
-  CUcontext c;
-  cuCtxGetCurrent(&c);
-
-  auto& kernel_inst_map = kernel_inst_context_map[c];
-
-  return getCached(kern_inst_name, kernel_inst_map, [&]() {
-    return program.kernel(kern_name).instantiate(arguments);
-  });
-}
-
-// Another overload for getKernelInstantiation which might be useful to get
-// kernel instantiations in one step
-// ------------------------------------------------------------------------
-/*
-jitify::experimental::KernelInstantiation cudfJitCache::getKernelInstantiation(
-  std::string const& kern_name,
-  std::string const& prog_name,
-  std::string const& cuda_source = "",
-  std::vector<std::string> const& given_headers = {},
-  std::vector<std::string> const& given_options = {},
-  file_callback_type file_callback = nullptr)
-{
-  auto program = getProgram(prog_name,
-                            cuda_source,
-                            given_headers,
-                            given_options,
-                            file_callback);
-  return getKernelInstantiation(kern_name, program);
-}
-*/
-
-cudfJitCache::cacheFile::cacheFile(std::string file_name) : _file_name{file_name} {}
-
-cudfJitCache::cacheFile::~cacheFile() {}
-
-std::string cudfJitCache::cacheFile::read()
+std::string get_program_cache_dir()
 {
-  // Open file (duh)
-  int fd = open(_file_name.c_str(), O_RDWR);
-  if (fd == -1) {
-    successful_read = false;
-    return std::string();
-  }
-
-  // Create args for file locking
-  flock fl{};
-  fl.l_type   = F_RDLCK;  // Shared lock for reading
-  fl.l_whence = SEEK_SET;
-
-  // Lock the file descriptor. Only reading is allowed now
-  if (fcntl(fd, F_SETLKW, &fl) == -1) {
-    successful_read = false;
-    return std::string();
-  }
-
-  // Get file descriptor from file pointer
-  FILE* fp = fdopen(fd, "rb");
-
-  // Get file length
-  fseek(fp, 0L, SEEK_END);
-  size_t file_size = ftell(fp);
-  rewind(fp);
-
-  // Allocate memory of file length size
-  std::string content;
-  content.resize(file_size);
-  char* buffer = &content[0];
-
-  // Copy file into buffer
-  if (fread(buffer, file_size, 1, fp) != 1) {
-    successful_read = false;
-    fclose(fp);
-    free(buffer);
-    return std::string();
-  }
-  fclose(fp);
-  successful_read = true;
-
-  return content;
+#if defined(JITIFY_USE_CACHE)
+  return get_cache_dir().string();
+#else
+  return {};
+#endif
 }
 
-void cudfJitCache::cacheFile::write(std::string content)
+jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog)
 {
-  // Open file and create if it doesn't exist, with access 0600
-  int fd = open(_file_name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
-  if (fd == -1) {
-    successful_write = false;
-    return;
-  }
+  static std::mutex caches_mutex{};
+  static std::unordered_map<std::string, std::unique_ptr<jitify2::ProgramCache<>>> caches{};
 
-  // Create args for file locking
-  flock fl{};
-  fl.l_type   = F_WRLCK;  // Exclusive lock for writing
-  fl.l_whence = SEEK_SET;
+  std::lock_guard<std::mutex> caches_lock(caches_mutex);
 
-  // Lock the file descriptor. We're the only ones now
-  if (fcntl(fd, F_SETLKW, &fl) == -1) {
-    successful_write = false;
-    return;
-  }
+  auto existing_cache = caches.find(preprog.name());
 
-  // Get file descriptor from file pointer
-  FILE* fp = fdopen(fd, "wb");
+  if (existing_cache == caches.end()) {
+    auto res = caches.insert(
+      {preprog.name(),
+       std::make_unique<jitify2::ProgramCache<>>(100, preprog, nullptr, get_program_cache_dir())});
 
-  // Copy string into file
-  if (fwrite(content.c_str(), content.length(), 1, fp) != 1) {
-    successful_write = false;
-    fclose(fp);
-    return;
+    existing_cache = res.first;
   }
-  fclose(fp);
-  successful_write = true;
-  return;
+  return *(existing_cache->second);
 }
 
 } // namespace jit
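
> Aside: the intended call pattern for the new cache appears later in this diff (rolling_detail.cuh, transform.cpp). As a hedged sketch, with `example_jit_kernel_cu_jit` standing in for one of the preprocessed-program objects emitted by the jitify_preprocess step (the symbol name is hypothetical, not part of this PR):

```cpp
// Hypothetical caller; mirrors the usage that appears later in this diff.
void launch_example(rmm::cuda_stream_view stream)
{
  std::string kernel_name =
    jitify2::reflection::Template("cudf::example::jit::kernel").instantiate("int");

  cudf::jit::get_program_cache(*example_jit_kernel_cu_jit)
    .get_kernel(kernel_name, {}, {/* optional runtime header overrides */})
    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
    ->launch(/* kernel arguments */);
}
```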
diff --git a/cpp/src/jit/cache.h b/cpp/src/jit/cache.h
deleted file mode 100644
index 071a951023b..00000000000
--- a/cpp/src/jit/cache.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CUDF_JIT_CACHE_H_
-#define CUDF_JIT_CACHE_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace cudf {
-namespace jit {
-template <typename T>
-using named_prog = std::pair<std::string, std::shared_ptr<T>>;
-
-/**
- * @brief Get the string path to the JITIFY kernel cache directory.
- *
- * This path can be overridden at runtime by defining an environment variable
- * named `LIBCUDF_KERNEL_CACHE_PATH`. The value of this variable must be a path
- * under which the process' user has read/write privileges.
- *
- * This function returns a path to the cache directory, creating it if it
- * doesn't exist.
- *
- * The default cache directory is `$HOME/.cudf/$CUDF_VERSION`. If no overrides
- * are used and if $HOME is not defined, returns an empty path and file
- * caching is not used.
- */
-boost::filesystem::path getCacheDir();
-
-class cudfJitCache {
- public:
-  /**
-   * @brief Get a process wide singleton cache object
-   *
-   */
-  static cudfJitCache& Instance()
-  {
-    // Meyers' singleton is thread safe in C++11
-    // Link: https://stackoverflow.com/a/1661564
-    static cudfJitCache cache;
-    return cache;
-  }
-
-  cudfJitCache();
-  ~cudfJitCache();
-
-  /**
-   * @brief Get the Kernel Instantiation object
-   *
-   * Searches an internal in-memory cache and file based cache for the kernel
-   * and if not found, JIT compiles and returns the kernel
-   *
-   * @param kern_name name of kernel to return
-   * @param program Jitify preprocessed program to get the kernel from
-   * @param arguments template arguments for kernel in vector of strings
-   * @return Pair of string kernel identifier and compiled kernel object
-   */
-  named_prog<jitify::experimental::KernelInstantiation> getKernelInstantiation(
-    std::string const& kern_name,
-    named_prog<jitify::experimental::Program> const& program,
-    std::vector<std::string> const& arguments);
-
-  /**
-   * @brief Get the Jitify preprocessed Program object
-   *
-   * Searches an internal in-memory cache and file based cache for the Jitify
-   * pre-processed program and if not found, JIT processes and returns it
-   *
-   * @param prog_file_name name of program to return
-   * @param cuda_source string source code of program to compile
-   * @param given_headers vector of strings representing source or names of each header included in
-   * cuda_source
-   * @param given_options vector of strings options to pass to NVRTC
-   * @param file_callback pointer to callback function to call whenever a header needs to be loaded
-   * @return named_prog<jitify::experimental::Program>
-   */
-  named_prog<jitify::experimental::Program> getProgram(
-    std::string const& prog_file_name,
-    std::string const& cuda_source                         = "",
-    std::vector<std::string> const& given_headers          = {},
-    std::vector<std::string> const& given_options          = {},
-    jitify::experimental::file_callback_type file_callback = nullptr);
-
- private:
-  template <typename T>
-  using umap_str_shptr = std::unordered_map<std::string, std::shared_ptr<T>>;
-
-  std::unordered_map<CUcontext, umap_str_shptr<jitify::experimental::KernelInstantiation>>
-    kernel_inst_context_map;
-  umap_str_shptr<jitify::experimental::Program> program_map;
-
-  /*
-    Even though this class can be used as a non-singleton, the file cache
-    access should remain limited to one thread per process. The lockf locks can
-    prevent multiple processes from accessing the file but are ineffective in
-    preventing multiple threads from doing so as the lock is shared by the
-    entire process.
-    Therefore the mutexes are static.
-  */
-  static std::mutex _kernel_cache_mutex;
-  static std::mutex _program_cache_mutex;
-
- private:
-  /**
-   * @brief Class to allow process wise exclusive access to cache files
-   *
-   */
-  class cacheFile {
-   private:
-    std::string _file_name;
-    bool successful_read  = false;
-    bool successful_write = false;
-
-   public:
-    cacheFile(std::string file_name);
-    ~cacheFile();
-
-    /**
-     * @brief Read this file and return the contents as a std::string
-     *
-     */
-    std::string read();
-
-    /**
-     * @brief Write the passed string to this file
-     *
-     */
-    void write(std::string);
-
-    /**
-     * @brief Check whether the read() operation on the file completed successfully
-     *
-     * @return true Read was successful. String returned by `read()` is valid
-     * @return false Read was unsuccessful. String returned by `read()` is empty
-     */
-    bool is_read_successful() { return successful_read; }
-
-    /**
-     * @brief Check whether the write() operation on the file completed successfully
-     *
-     * @return true Write was successful.
-     * @return false Write was unsuccessful. File state is undefined
-     */
-    bool is_write_successful() { return successful_write; }
-  };
-
- private:
-  template <typename T, typename FallbackFunc>
-  named_prog<T> getCached(std::string const& name, umap_str_shptr<T>& map, FallbackFunc func)
-  {
-    // Find memory cached T object
-    auto it = map.find(name);
-    if (it != map.end()) {
-      return std::make_pair(name, it->second);
-    } else {  // Find file cached T object
-      bool successful_read = false;
-      std::string serialized;
-#if defined(JITIFY_USE_CACHE)
-      boost::filesystem::path cache_dir = getCacheDir();
-      if (not cache_dir.empty()) {
-        boost::filesystem::path file_name = cache_dir / name;
-        cacheFile file{file_name.string()};
-        serialized      = file.read();
-        successful_read = file.is_read_successful();
-      }
-#endif
-      if (not successful_read) {
-        // JIT compile and write to file if possible
-        serialized = func().serialize();
-#if defined(JITIFY_USE_CACHE)
-        if (not cache_dir.empty()) {
-          boost::filesystem::path file_name = cache_dir / name;
-          cacheFile file{file_name.string()};
-          file.write(serialized);
-        }
-#endif
-      }
-      // Add deserialized T to cache and return
-      auto program = std::make_shared<T>(T::deserialize(serialized));
-      map[name]    = program;
-      return std::make_pair(name, program);
-    }
-  }
-};
-
-} // namespace jit
-} // namespace cudf
-
-#endif  // CUDF_JIT_CACHE_H_
diff --git a/cpp/src/transform/jit/code/code.h b/cpp/src/jit/cache.hpp
similarity index 71%
rename from cpp/src/transform/jit/code/code.h
rename to cpp/src/jit/cache.hpp
index cc3d6a8fe89..df8d4278f0f 100644
--- a/cpp/src/transform/jit/code/code.h
+++ b/cpp/src/jit/cache.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,16 +16,13 @@
 
 #pragma once
 
+#include
+#include
+
 namespace cudf {
-namespace transformation {
 namespace jit {
-namespace code {
-extern const char* kernel_header;
-extern const char* kernel;
-extern const char* traits;
-extern const char* operation;
-}  // namespace code
+
+jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog);
+
 } // namespace jit
-} // namespace transformation
 } // namespace cudf
diff --git a/cpp/src/jit/common_headers.hpp b/cpp/src/jit/common_headers.hpp
deleted file mode 100644
index 0f57790afe0..00000000000
--- a/cpp/src/jit/common_headers.hpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-namespace cudf {
-namespace jit {
-
-const std::vector<std::string> compiler_flags
-{
-  "-std=c++14",
-  // Have jitify prune unused global variables
-  "-remove-unused-globals",
-  // suppress all NVRTC warnings
-  "-w",
-  // force libcudacxx to not include system headers
-  "-D__CUDACC_RTC__",
-#if defined(__powerpc64__)
-  "-D__powerpc64__"
-#elif defined(__x86_64__)
-  "-D__x86_64__"
-#endif
-};
-
-const std::unordered_map<std::string, char const*> stringified_headers{
-  {"cuda/std/chrono", cuda_std_chrono},
-  {"cuda/std/climits", cuda_std_climits},
-  {"cuda/std/cstddef", cuda_std_cstddef},
-  {"cuda/std/cstdint", cuda_std_cstdint},
-  {"cuda/std/ctime", cuda_std_ctime},
-  {"cuda/std/limits", cuda_std_limits},
-  {"cuda/std/ratio", cuda_std_ratio},
-  {"cuda/std/type_traits", cuda_std_type_traits},
-  {"cuda/std/type_traits", cuda_std_type_traits},
-  {"cuda/std/version", cuda_std_version},
-  {"cuda/std/detail/__config", cuda_std_detail___config},
-  {"cuda/std/detail/__pragma_pop", cuda_std_detail___pragma_pop},
-  {"cuda/std/detail/__pragma_push", cuda_std_detail___pragma_push},
-  {"cuda/std/detail/libcxx/include/__config", cuda_std_detail_libcxx_include___config},
-  {"cuda/std/detail/libcxx/include/__pragma_pop", cuda_std_detail_libcxx_include___pragma_pop},
-  {"cuda/std/detail/libcxx/include/__pragma_push", cuda_std_detail_libcxx_include___pragma_push},
-  {"cuda/std/detail/libcxx/include/__undef_macros", cuda_std_detail_libcxx_include___undef_macros},
-  {"cuda/std/detail/libcxx/include/chrono", cuda_std_detail_libcxx_include_chrono},
-  {"cuda/std/detail/libcxx/include/climits", cuda_std_detail_libcxx_include_climits},
-  {"cuda/std/detail/libcxx/include/cstddef", cuda_std_detail_libcxx_include_cstddef},
-  {"cuda/std/detail/libcxx/include/cstdint", cuda_std_detail_libcxx_include_cstdint},
-  {"cuda/std/detail/libcxx/include/ctime", cuda_std_detail_libcxx_include_ctime},
-  {"cuda/std/detail/libcxx/include/limits", cuda_std_detail_libcxx_include_limits},
-  {"cuda/std/detail/libcxx/include/ratio", cuda_std_detail_libcxx_include_ratio},
-  {"cuda/std/detail/libcxx/include/type_traits", cuda_std_detail_libcxx_include_type_traits},
-  {"cuda/std/detail/libcxx/include/version", cuda_std_detail_libcxx_include_version},
-};
-
-inline std::istream* send_stringified_header(std::iostream& stream, char const* header)
-{
-  // skip the filename line added by stringify
-  stream << (std::strchr(header, '\n') + 1);
-  return &stream;
-}
-
-} // namespace jit
-} // namespace cudf
diff --git a/cpp/src/jit/launcher.cpp b/cpp/src/jit/launcher.cpp
deleted file mode 100644
index 2ddcac7d5ba..00000000000
--- a/cpp/src/jit/launcher.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-#include
-
-namespace cudf {
-namespace jit {
-
-launcher::launcher(const std::string& hash,
-                   const std::string& cuda_source,
-                   const std::vector<std::string>& header_names,
-                   const std::vector<std::string>& compiler_flags,
-                   jitify::experimental::file_callback_type file_callback,
-                   rmm::cuda_stream_view stream)
-  : cache_instance{cudf::jit::cudfJitCache::Instance()}, stream(stream)
-{
-  program = cache_instance.getProgram(
-    hash, cuda_source.c_str(), header_names, compiler_flags, file_callback);
-}
-
-launcher::launcher(launcher&& launcher)
-  : cache_instance{cudf::jit::cudfJitCache::Instance()},
-    program{std::move(launcher.program)},
-    kernel_inst{std::move(launcher.kernel_inst)},
-    stream{launcher.stream}
-{
-}
-
-} // namespace jit
-} // namespace cudf
diff --git a/cpp/src/jit/launcher.h b/cpp/src/jit/launcher.h
deleted file mode 100644
index 8bcd92149a8..00000000000
--- a/cpp/src/jit/launcher.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-
-#include
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-namespace cudf {
-namespace jit {
-/**
- * @brief Class used to handle compilation and execution of JIT kernels
- */
-class launcher {
- public:
-  launcher() = delete;
-
-  /**
-   * @brief Constructor of the launcher class
-   *
-   * Method to generate vector containing all template types for a JIT kernel.
-   * This vector is used to get the compiled kernel for one set of types and set
-   * it as the kernel to launch using this launcher.
-   *
-   * @param hash The hash to be used as the key for caching
-   * @param cuda_code The CUDA code that contains the kernel to be launched
-   * @param header_names Strings of header_names or strings that contain content
-   * of the header files
-   * @param compiler_flags Strings of compiler flags
-   * @param file_callback a function that returns header file contents given header
-   * file names.
-   * @param stream The non-owned stream to use for execution
-   */
-  launcher(const std::string& hash,
-           const std::string& cuda_source,
-           const std::vector<std::string>& header_names,
-           const std::vector<std::string>& compiler_flags,
-           jitify::experimental::file_callback_type file_callback,
-           rmm::cuda_stream_view stream = rmm::cuda_stream_default);
-  launcher(launcher&&);
-  launcher(const launcher&) = delete;
-  launcher& operator=(launcher&&) = delete;
-  launcher& operator=(const launcher&) = delete;
-
-  /**
-   * @brief Sets the kernel to launch using this launcher
-   *
-   * Method to generate vector containing all template types for a JIT kernel.
-   * This vector is used to get the compiled kernel for one set of types and set
-   * it as the kernel to launch using this launcher.
-   *
-   * @param kernel_name The kernel to be launched
-   * @param arguments The template arguments to be used to instantiate the kernel
-   * @return launcher& ref to this launcher object
-   */
-  launcher& set_kernel_inst(const std::string& kernel_name,
-                            const std::vector<std::string>& arguments)
-  {
-    kernel_inst = cache_instance.getKernelInstantiation(kernel_name, program, arguments);
-    return *this;
-  }
-
-  /**
-   * @brief Handle the Jitify API to launch using information
-   * contained in the members of `this`
-   *
-   * @tparam All parameters to launch the kernel
-   */
-  template <typename... Args>
-  void launch(Args... args)
-  {
-    get_kernel().configure_1d_max_occupancy(0, 0, 0, stream.value()).safe_launch(args...);
-  }
-
- private:
-  cudf::jit::cudfJitCache& cache_instance;
-  cudf::jit::named_prog<jitify::experimental::Program> program;
-  cudf::jit::named_prog<jitify::experimental::KernelInstantiation> kernel_inst;
-  rmm::cuda_stream_view stream;
-
-  jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); }
-};
-
-} // namespace jit
-} // namespace cudf
diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp
index 01fd3aea33a..8929d58be08 100644
--- a/cpp/src/jit/parser.cpp
+++ b/cpp/src/jit/parser.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,15 +14,17 @@
  * limitations under the License.
  */
 
+#include "parser.hpp"
+
+#include
+
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 
-#include "parser.h"
-
 namespace cudf {
 namespace jit {
 constexpr char percent_escape[] = "_";
diff --git a/cpp/src/jit/parser.h b/cpp/src/jit/parser.hpp
similarity index 100%
rename from cpp/src/jit/parser.h
rename to cpp/src/jit/parser.hpp
diff --git a/cpp/src/jit/type.cpp b/cpp/src/jit/type.cpp
index e833a6fa10f..16894168b31 100644
--- a/cpp/src/jit/type.cpp
+++ b/cpp/src/jit/type.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+
 #include
 
 namespace cudf {
diff --git a/cpp/src/jit/type.h b/cpp/src/jit/type.hpp
similarity index 100%
rename from cpp/src/jit/type.h
rename to cpp/src/jit/type.hpp
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 34d6d5fa194..ca4913c1843 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
+#include "rolling_detail.cuh"
+#include "rolling_jit_detail.hpp"
+
 #include
 #include
-#include "rolling_detail.cuh"
 
 namespace cudf {
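
> Aside: the parser utilities renamed above (parser.h → parser.hpp) are what turn a user-supplied UDF into the source that later overrides the operation-udf.hpp placeholders. A hedged sketch of the transformation `parse_single_function_cuda` performs (illustrative strings; exact formatting of the parser's output may differ):

```cpp
// The parser extracts one __device__ function from the user's source and
// renames it, so the JIT kernel can call it under a fixed name.
std::string user_udf =
  "__device__ inline void my_udf(double* out, double in) { *out = in * in; }";

// Conceptually, parse_single_function_cuda(user_udf, "GENERIC_UNARY_OP")
// yields something equivalent to:
std::string renamed =
  "__device__ inline void GENERIC_UNARY_OP(double* out, double in) { *out = in * in; }";
```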
diff --git a/cpp/src/rolling/jit/code/operation.cpp b/cpp/src/rolling/jit/code/operation.cpp
deleted file mode 100644
index 1fdc4080634..00000000000
--- a/cpp/src/rolling/jit/code/operation.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cudf {
-namespace rolling {
-namespace jit {
-namespace code {
-const char* operation_h =
-  R"***(operation.h
-#pragma once
-  struct rolling_udf_ptx {
-    template <typename OutType, typename InType>
-    static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count) {
-      OutType ret;
-      rolling_udf(
-        &ret, 0, 0, 0, 0, &in_col[start], count, sizeof(InType));
-      return ret;
-    }
-  };
-
-  struct rolling_udf_cuda {
-    template <typename OutType, typename InType>
-    static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count) {
-      OutType ret;
-      rolling_udf(
-        &ret, in_col, start, count);
-      return ret;
-    }
-  };
-
-)***";
-
-} // namespace code
-} // namespace jit
-} // namespace rolling
-} // namespace cudf
diff --git a/cpp/src/rolling/jit/code/kernel.cpp b/cpp/src/rolling/jit/kernel.cu
similarity index 61%
rename from cpp/src/rolling/jit/code/kernel.cpp
rename to cpp/src/rolling/jit/kernel.cu
index 2c612162f79..52e397b9351 100644
--- a/cpp/src/rolling/jit/code/kernel.cpp
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,47 +14,50 @@
  * limitations under the License.
  */
 
-namespace cudf {
-namespace rolling {
-namespace jit {
-namespace code {
-const char* kernel_headers =
-  R"***(
-#include <../src/rolling/rolling_jit_detail.hpp>
+#include
+#include
+
 #include
 #include
-)***";
 
-const char* kernel =
-  R"***(
-#include "operation.h"
+namespace cudf {
+namespace rolling {
+namespace jit {
 
 template <typename WindowType>
-cudf::size_type __device__ get_window(WindowType window, cudf::size_type index) { return window[index]; }
+cudf::size_type __device__ get_window(WindowType window, cudf::size_type index)
+{
+  return window[index];
+}
 
 template <>
-cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index) { return window; }
-
-template <typename InType, typename OutType, class agg_op, typename PrecedingWindowType, typename FollowingWindowType>
-__global__
-void gpu_rolling_new(cudf::size_type nrows,
-                     InType const* const __restrict__ in_col,
-                     cudf::bitmask_type const* const __restrict__ in_col_valid,
-                     OutType* __restrict__ out_col,
-                     cudf::bitmask_type* __restrict__ out_col_valid,
-                     cudf::size_type * __restrict__ output_valid_count,
-                     PrecedingWindowType preceding_window_begin,
-                     FollowingWindowType following_window_begin,
-                     cudf::size_type min_periods)
+cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index)
 {
-  cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  return window;
+}
+
+template <typename InType,
+          typename OutType,
+          class agg_op,
+          typename PrecedingWindowType,
+          typename FollowingWindowType>
+__global__ void gpu_rolling_new(cudf::size_type nrows,
+                                InType const* const __restrict__ in_col,
+                                cudf::bitmask_type const* const __restrict__ in_col_valid,
+                                OutType* __restrict__ out_col,
+                                cudf::bitmask_type* __restrict__ out_col_valid,
+                                cudf::size_type* __restrict__ output_valid_count,
+                                PrecedingWindowType preceding_window_begin,
+                                FollowingWindowType following_window_begin,
+                                cudf::size_type min_periods)
+{
+  cudf::size_type i      = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::size_type stride = blockDim.x * gridDim.x;
   cudf::size_type warp_valid_count{0};
 
   auto active_threads = __ballot_sync(0xffffffff, i < nrows);
-  while(i < nrows)
-  {
+  while (i < nrows) {
     // declare this as volatile to avoid some compiler optimizations that lead to incorrect results
     // for CUDA 10.0 and below (fixed in CUDA 10.1)
     volatile cudf::size_type count = 0;
@@ -63,16 +66,16 @@ void gpu_rolling_new(cudf::size_type nrows,
     cudf::size_type following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start = min(nrows, max(0, i - preceding_window + 1));
-    cudf::size_type end = min(nrows, max(0, i + following_window + 1));
+    cudf::size_type start       = min(nrows, max(0, i - preceding_window + 1));
+    cudf::size_type end         = min(nrows, max(0, i + following_window + 1));
     cudf::size_type start_index = min(start, end);
-    cudf::size_type end_index = max(start, end);
+    cudf::size_type end_index   = max(start, end);
 
     // aggregate
     // TODO: We should explore using shared memory to avoid redundant loads.
     //       This might require separating the kernel into a special version
     //       for dynamic and static sizes.
-    count = end_index - start_index;
+    count       = end_index - start_index;
     OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
 
     // check if we have enough input samples
@@ -82,9 +85,7 @@ void gpu_rolling_new(cudf::size_type nrows,
     const unsigned int result_mask = __ballot_sync(active_threads, output_is_valid);
 
     // store the output value, one per thread
-    if (output_is_valid) {
-      out_col[i] = val;
-    }
+    if (output_is_valid) { out_col[i] = val; }
 
     // only one thread writes the mask
     if (0 == cudf::intra_word_index(i)) {
@@ -92,20 +93,16 @@ void gpu_rolling_new(cudf::size_type nrows,
       warp_valid_count += __popc(result_mask);
     }
 
-    // process next element
+    // process next element
     i += stride;
     active_threads = __ballot_sync(active_threads, i < nrows);
   }
 
   // TODO: likely faster to do a single_lane_block_reduce and a single
   // atomic per block but that requires jitifying single_lane_block_reduce...
-  if(0 == cudf::intra_word_index(threadIdx.x)) {
-    atomicAdd(output_valid_count, warp_valid_count);
-  }
+  if (0 == cudf::intra_word_index(threadIdx.x)) { atomicAdd(output_valid_count, warp_valid_count); }
 }
-)***";
 
-} // namespace code
 } // namespace jit
 } // namespace rolling
 } // namespace cudf
diff --git a/cpp/src/rolling/jit/code/code.h b/cpp/src/rolling/jit/operation-udf.hpp
similarity index 51%
rename from cpp/src/rolling/jit/code/code.h
rename to cpp/src/rolling/jit/operation-udf.hpp
index c5577d326c7..eaab2111d98 100644
--- a/cpp/src/rolling/jit/code/code.h
+++ b/cpp/src/rolling/jit/operation-udf.hpp
@@ -1,8 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
+ * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,21 +16,5 @@
 
 #pragma once
 
-namespace cudf {
-namespace rolling {
-namespace jit {
-namespace code {
-extern const char* kernel_headers;
-extern const char* kernel;
-extern const char* operation_h;
-
-extern const char* kernel_headers;
-extern const char* kernel;
-extern const char* operation_h;
-
-extern const char* grouped_window_wrapper;
-
-} // namespace code
-} // namespace jit
-} // namespace rolling
-} // namespace cudf
+// This file serves as a placeholder for user defined functions, so jitify can choose to override it
+// at runtime.
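
> Aside: what replaces this placeholder at runtime is the parsed UDF source; the rolling launch site later in this diff passes it to `get_kernel` as the contents of `rolling/jit/operation-udf.hpp`. A hedged sketch of what the injected header might contain for a CUDA UDF computing a window mean (names and exact signature illustrative; compare `rolling_udf_cuda::operate` in operation.hpp):

```cpp
// Hypothetical runtime contents of rolling/jit/operation-udf.hpp, matching
// the call shape rolling_udf(&ret, in_col, start, count).
__device__ inline void rolling_udf(double* result,
                                   double const* in_col,
                                   int start,
                                   int count)
{
  double sum = 0.0;
  for (int i = 0; i < count; ++i) { sum += in_col[start + i]; }
  *result = sum / count;
}
```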
diff --git a/cpp/src/rolling/jit/operation.hpp b/cpp/src/rolling/jit/operation.hpp
new file mode 100644
index 00000000000..9af8c2ac3fb
--- /dev/null
+++ b/cpp/src/rolling/jit/operation.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#pragma once
+
+struct rolling_udf_ptx {
+  template <typename OutType, typename InType>
+  static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count)
+  {
+    OutType ret;
+    rolling_udf(&ret, 0, 0, 0, 0, &in_col[start], count, sizeof(InType));
+    return ret;
+  }
+};
+
+struct rolling_udf_cuda {
+  template <typename OutType, typename InType>
+  static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count)
+  {
+    OutType ret;
+    rolling_udf(&ret, in_col, start, count);
+    return ret;
+  }
+};
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 42562507fa9..bb431fad537 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -16,9 +16,7 @@
 
 #pragma once
 
-#include
 #include
-#include
 
 #include
 #include
@@ -44,12 +42,11 @@
 #include
 #include
 
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+
+#include
 
 #include
 #include
@@ -1270,19 +1267,15 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
   std::string cuda_source;
   switch (udf_agg->kind) {
     case aggregation::Kind::PTX:
-      cuda_source = cudf::rolling::jit::code::kernel_headers;
       cuda_source +=
         cudf::jit::parse_single_function_ptx(udf_agg->_source,
                                              udf_agg->_function_name,
                                              cudf::jit::get_type_name(udf_agg->_output_type),
                                              {0, 5});  // args 0 and 5 are pointers.
-      cuda_source += cudf::rolling::jit::code::kernel;
       break;
     case aggregation::Kind::CUDA:
-      cuda_source = cudf::rolling::jit::code::kernel_headers;
       cuda_source +=
         cudf::jit::parse_single_function_cuda(udf_agg->_source, udf_agg->_function_name);
-      cuda_source += cudf::rolling::jit::code::kernel;
       break;
     default: CUDF_FAIL("Unsupported UDF type.");
  }
@@ -1293,37 +1286,26 @@
   auto output_view = output->mutable_view();
   rmm::device_scalar<cudf::size_type> device_valid_count{0, stream};
 
-  const std::vector<std::string> compiler_flags{"-std=c++14",
-                                                // Have jitify prune unused global variables
-                                                "-remove-unused-globals",
-                                                // suppress all NVRTC warnings
-                                                "-w"};
-
-  // Launch the jitify kernel
-  cudf::jit::launcher(hash,
-                      cuda_source,
-                      {cudf_types_hpp,
-                       cudf_utilities_bit_hpp,
-                       cudf::rolling::jit::code::operation_h,
-                       ___src_rolling_rolling_jit_detail_hpp},
-                      compiler_flags,
-                      nullptr,
-                      stream)
-    .set_kernel_inst("gpu_rolling_new",                        // name of the kernel we are launching
-                     {cudf::jit::get_type_name(input.type()),  // list of template arguments
-                      cudf::jit::get_type_name(output->type()),
-                      udf_agg->_operator_name,
-                      preceding_window_str.c_str(),
-                      following_window_str.c_str()})
-    .launch(input.size(),
-            cudf::jit::get_data_ptr(input),
-            input.null_mask(),
-            cudf::jit::get_data_ptr(output_view),
-            output_view.null_mask(),
-            device_valid_count.data(),
-            preceding_window,
-            following_window,
-            min_periods);
+  std::string kernel_name =
+    jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new")  //
+      .instantiate(cudf::jit::get_type_name(input.type()),  // list of template arguments
+                   cudf::jit::get_type_name(output->type()),
+                   udf_agg->_operator_name,
+                   preceding_window_str.c_str(),
+                   following_window_str.c_str());
+
+  cudf::jit::get_program_cache(*rolling_jit_kernel_cu_jit)
+    .get_kernel(kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                           //
+    ->launch(input.size(),
+             cudf::jit::get_data_ptr(input),
+             input.null_mask(),
+             cudf::jit::get_data_ptr(output_view),
+             output_view.null_mask(),
+             device_valid_count.data(),
+             preceding_window,
+             following_window,
+             min_periods);
 
   output->set_null_count(output->size() - device_valid_count.value(stream));
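
> Aside: the `jitify2::reflection::Template(...).instantiate(...)` call above only builds the instantiated kernel name as a string, which then serves as the cache lookup key. Roughly (illustrative arguments; exact spacing is jitify2's choice):

```cpp
// Sketch of what the reflection call produces.
std::string name =
  jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new")
    .instantiate("int", "int", "rolling_udf_cuda", "cudf::size_type", "cudf::size_type");
// name is now approximately:
// "cudf::rolling::jit::gpu_rolling_new<int,int,rolling_udf_cuda,cudf::size_type,cudf::size_type>"
```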
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
deleted file mode 100644
index 58fdb945de3..00000000000
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cudf {
-namespace transformation {
-namespace jit {
-namespace code {
-const char* kernel_header =
-  R"***(
-    #pragma once
-
-    // Include Jitify's cstddef header first
-    #include
-
-    #include
-    #include
-    #include
-
-    #include
-    #include
-  )***";
-
-const char* kernel =
-  R"***(
-    template <typename TypeOut, typename TypeIn>
-    __global__
-    void kernel(cudf::size_type size,
-                TypeOut* out_data, TypeIn* in_data) {
-      int tid = threadIdx.x;
-      int blkid = blockIdx.x;
-      int blksz = blockDim.x;
-      int gridsz = gridDim.x;
-
-      int start = tid + blkid * blksz;
-      int step = blksz * gridsz;
-
-      for (cudf::size_type i=start; i<size; i+=step) {
-        GENERIC_UNARY_OP(&out_data[i], in_data[i]);
-      }
-    }
-  )***";
-
-} // namespace code
-} // namespace jit
-} // namespace transformation
-} // namespace cudf
diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu
new file mode 100644
--- /dev/null
+++ b/cpp/src/transform/jit/kernel.cu
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace transformation {
+namespace jit {
+
+template <typename TypeOut, typename TypeIn>
+__global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    GENERIC_UNARY_OP(&out_data[i], in_data[i]);
+  }
+}
+
+} // namespace jit
+} // namespace transformation
+} // namespace cudf
diff --git a/cpp/src/transform/jit/operation-udf.hpp b/cpp/src/transform/jit/operation-udf.hpp
new file mode 100644
index 00000000000..eaab2111d98
--- /dev/null
+++ b/cpp/src/transform/jit/operation-udf.hpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// This file serves as a placeholder for user defined functions, so jitify can choose to override it
+// at runtime.
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 6da0f78687b..8f176d035d2 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#include "jit/code/code.h"
+#include
 
-#include
-#include
-#include
-#include
+#include
+#include
+#include
 
 #include
 #include
@@ -29,27 +28,12 @@
 #include
 #include
 
-#include
-#include
-
 #include
 
 namespace cudf {
 namespace transformation {
-//! Jit functions
 namespace jit {
-const std::vector<std::string> header_names{cudf_types_hpp, cudf_wrappers_timestamps_hpp};
-
-std::istream* headers_code(std::string filename, std::iostream& stream)
-{
-  auto it = cudf::jit::stringified_headers.find(filename);
-  if (it != cudf::jit::stringified_headers.end()) {
-    return cudf::jit::send_stringified_header(stream, it->second);
-  }
-  return nullptr;
-}
-
 void unary_operation(mutable_column_view output,
                      column_view input,
                      const std::string& udf,
@@ -57,28 +41,25 @@ void unary_operation(mutable_column_view output,
                      bool is_ptx,
                      rmm::cuda_stream_view stream)
 {
-  std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(udf));
-
-  std::string cuda_source = code::kernel_header;
-  if (is_ptx) {
-    cuda_source += cudf::jit::parse_single_function_ptx(
-                     udf, "GENERIC_UNARY_OP", cudf::jit::get_type_name(output_type), {0}) +
-                   code::kernel;
-  } else {
-    cuda_source += cudf::jit::parse_single_function_cuda(udf, "GENERIC_UNARY_OP") + code::kernel;
-  }
-
-  // Launch the jitify kernel
-  cudf::jit::launcher(hash,
-                      cuda_source,
-                      header_names,
-                      cudf::jit::compiler_flags,
-                      headers_code,
-                      stream)
-    .set_kernel_inst("kernel",                                 // name of the kernel we are launching
-                     {cudf::jit::get_type_name(output.type()), // list of template arguments
-                      cudf::jit::get_type_name(input.type())})
-    .launch(output.size(), cudf::jit::get_data_ptr(output), cudf::jit::get_data_ptr(input));
+  std::string kernel_name =
+    jitify2::reflection::Template("cudf::transformation::jit::kernel")  //
+      .instantiate(cudf::jit::get_type_name(output.type()),  // list of template arguments
+                   cudf::jit::get_type_name(input.type()));
+
+  std::string cuda_source =
+    is_ptx ? cudf::jit::parse_single_function_ptx(udf,  //
+                                                  "GENERIC_UNARY_OP",
+                                                  cudf::jit::get_type_name(output_type),
+                                                  {0})
+           : cudf::jit::parse_single_function_cuda(udf,  //
+                                                   "GENERIC_UNARY_OP");
+
+  cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit)
+    .get_kernel(kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+    ->launch(output.size(),  //
+             cudf::jit::get_data_ptr(output),
+             cudf::jit::get_data_ptr(input));
 }
 
 } // namespace jit
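
> Aside: a hedged end-to-end sketch of how this path is reached through the public API (assuming `cudf::transform`'s column_view / UDF-string / output-type / is_ptx signature, which forwards to `jit::unary_operation`):

```cpp
// Hedged usage sketch; the UDF below is renamed to GENERIC_UNARY_OP by
// parse_single_function_cuda and injected as transform/jit/operation-udf.hpp.
#include <cudf/column/column.hpp>
#include <cudf/transform.hpp>

std::unique_ptr<cudf::column> triple(cudf::column_view const& input)
{
  std::string udf =
    "__device__ inline void triple(int* out, int in) { *out = 3 * in; }";
  return cudf::transform(input, udf, cudf::data_type{cudf::type_id::INT32}, /*is_ptx=*/false);
}
```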
io/comp/decomp_test.cu) @@ -277,7 +268,8 @@ ConfigureTest(ROLLING_TEST rolling/rolling_test.cpp rolling/grouped_rolling_test.cpp rolling/lead_lag_test.cpp - rolling/collect_list_test.cpp) + rolling/collect_list_test.cpp + ) ################################################################################################### # - filling test ---------------------------------------------------------------------------------- diff --git a/cpp/tests/jit/jit-cache-multiprocess-test.cpp b/cpp/tests/jit/jit-cache-multiprocess-test.cpp deleted file mode 100644 index 2f0b353673e..00000000000 --- a/cpp/tests/jit/jit-cache-multiprocess-test.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include "jit-cache-test.hpp" -#include "rmm/mr/device/per_device_resource.hpp" - -#if defined(JITIFY_USE_CACHE) - -/** - * @brief This test runs two processes that try to access the same kernel - * - * This is a stress test. - * - * A single test process is forked before invocation of CUDA and then both the - * parent and child processes try to get and run a kernel. The child process - * clears the cache before each iteration of the test so that the cache has to - * be re-written by it. The parent process runs on a changing time offset so - * that it sometimes gets the kernel from cache and sometimes it doesn't. - * - * The aim of this test is to check that the file cache doesn't get corrupted - * when multiple processes are reading/writing to it at the same time. Since - * the public API of JitCache doesn't return the serialized string of the - * cached kernel, the way to test its validity is to run it on test data. - */ -TEST_F(JitCacheMultiProcessTest, MultiProcessTest) -{ - int num_tests = 20; - // Cannot initialize scalars before forking - rmm::device_scalar *input; - rmm::device_scalar *output; - int expect = 64; - - auto tester = [&](int pid, int test_no) { - // Brand new cache object that has nothing in in-memory cache - cudf::jit::cudfJitCache cache; - - auto const in{4}; - auto const out{1}; - input->set_value(in); - output->set_value(out); - - // make program - auto program = cache.getProgram("FileCacheTestProg3", program3_source); - // make kernel - auto kernel = cache.getKernelInstantiation("my_kernel", program, {"3", "int"}); - (*std::get<1>(kernel)).configure(grid, block).launch(input->data(), output->data()); - CUDA_TRY(cudaDeviceSynchronize()); - - ASSERT_TRUE(expect == output->value()) << "Expected val: " << expect << '\n' - << " Actual val: " << output->value(); - }; - - // This pipe is how the child process will send output to parent - int pipefd[2]; - ASSERT_NE(pipe(pipefd), -1) << "Unable to create pipe"; - - pid_t cpid = fork(); - ASSERT_TRUE(cpid >= 0) << "Fork failed"; - - if (cpid > 0) { // Parent - close(pipefd[1]); // Close write end of pipe. Parent doesn't write. - usleep(100000); - } else { // Child - close(pipefd[0]); // Close read end of pipe. Child doesn't read. 
-    dup2(pipefd[1], STDOUT_FILENO);  // redirect stdout to pipe
-  }
-
-  input  = new rmm::device_scalar<int>();
-  output = new rmm::device_scalar<int>();
-
-  for (int i = 0; i < num_tests; i++) {
-    if (cpid > 0)
-      usleep(10000);
-    else
-      purgeFileCache();
-
-    tester(cpid, i);
-  }
-
-  // Child ends here --------------------------------------------------------
-
-  if (cpid > 0) {
-    int status;
-    wait(&status);
-
-    std::cout << "Child output begin:" << std::endl;
-    char buf;
-    while (read(pipefd[0], &buf, 1) > 0) ASSERT_EQ(write(STDOUT_FILENO, &buf, 1), 1);
-    ASSERT_EQ(write(STDOUT_FILENO, "\n", 1), 1);
-    std::cout << "Child output end" << std::endl;
-
-    ASSERT_TRUE(WIFEXITED(status)) << "Child did not exit normally.";
-    ASSERT_EQ(WEXITSTATUS(status), 0) << "Error in child.";
-  }
-}
-#endif
-
-int main(int argc, char** argv)
-{
-  ::testing::InitGoogleTest(&argc, argv);
-
-  // This test relies on the fact that the CUDA context will be created in
-  // each process separately after the fork. With the default CUDF_TEST_MAIN,
-  // using rmm_mode=pool would cause the CUDA context to be created at startup,
-  // before the fork. So we hardcode the rmm_mode to "cuda" for this test
-  // and explicitly set the device 0 resource to it. Note that using
-  // `set_current_device_resource` would result in a call to `cudaGetDevice()`,
-  // which would also initialize the CUDA context before the fork.
-  auto const rmm_mode = "cuda";
-  auto resource       = cudf::test::create_memory_resource(rmm_mode);
-  rmm::mr::set_per_device_resource(rmm::cuda_device_id{0}, resource.get());
-  return RUN_ALL_TESTS();
-}
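Editorial aside: the deleted test leaned on a standard POSIX fork/pipe harness that is easy to miss inside the GoogleTest scaffolding. A minimal standalone sketch of just that harness follows; no CUDA, cudf, or GoogleTest is involved, and the output string is a placeholder.

```cpp
// Minimal fork/pipe harness, in the style of the deleted multiprocess test:
// the child's stdout is redirected into a pipe, and the parent reaps the
// child and then replays the child's output on its own stdout.
#include <cstdio>
#include <sys/wait.h>
#include <unistd.h>

int main()
{
  int pipefd[2];
  if (pipe(pipefd) == -1) return 1;

  pid_t const cpid = fork();
  if (cpid < 0) return 1;

  if (cpid == 0) {                   // Child: writes only.
    close(pipefd[0]);
    dup2(pipefd[1], STDOUT_FILENO);  // Redirect stdout into the pipe.
    std::printf("work done in child\n");
    return 0;                        // Exit status is checked by the parent.
  }

  close(pipefd[1]);                  // Parent: reads only.
  int status = 0;
  wait(&status);                     // Reap the child before draining the pipe.

  char buf;
  while (read(pipefd[0], &buf, 1) > 0) std::fwrite(&buf, 1, 1, stdout);

  return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : 1;
}
```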
diff --git a/cpp/tests/jit/jit-cache-test.cpp b/cpp/tests/jit/jit-cache-test.cpp
deleted file mode 100644
index 43cd5911ae7..00000000000
--- a/cpp/tests/jit/jit-cache-test.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "jit-cache-test.hpp"
-
-namespace cudf {
-namespace test {
-TEST_F(JitCacheTest, CacheExceptionTest)
-{
-  EXPECT_NO_THROW(auto program = getProgram("MemoryCacheTestProg"));
-  EXPECT_ANY_THROW(auto program1 = getProgram("MemoryCacheTestProg1"));
-}
-
-// Test the in-memory caching ability
-TEST_F(JitCacheTest, MemoryCacheKernelTest)
-{
-  // Check the kernel caching
-
-  // Single value column
-  // TODO (dm): should be a scalar tho
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{125, 0}};
-
-  // make new program and rename it to match the old program
-  auto program = getProgram("MemoryCacheTestProg1", program2_source);
-  // TODO: when I convert this pair to a class, make an inherited test class that can edit names
-  std::get<0>(program) = "MemoryCacheTestProg";
-
-  // remove any file cache so the kernel below cannot be obtained from file
-  purgeFileCache();
-
-  // make kernel; if the cache tried to compile, it would use a different
-  // program than intended and give a wrong result.
-  auto kernel = getKernelInstantiation("my_kernel", program, {"3", "int"});
-
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-
-TEST_F(JitCacheTest, MemoryCacheProgramTest)
-{
-  // Check program source caching
-
-  // Single value column
-  // TODO (dm): should be a scalar tho
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{625, 0}};
-
-  // remove any file cache so the program below cannot be obtained from file
-  purgeFileCache();
-
-  auto program = getProgram("MemoryCacheTestProg");
-  // make kernel that HAS to be compiled
-  auto kernel = getKernelInstantiation("my_kernel", program, {"4", "int"});
-
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-
-// Test the file caching ability
-#if defined(JITIFY_USE_CACHE)
-TEST_F(JitCacheTest, FileCacheProgramTest)
-{
-  // Brand new cache object that has nothing in the in-memory cache
-  cudf::jit::cudfJitCache cache;
-
-  // Single value column
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{625, 0}};
-
-  // make program
-  auto program = cache.getProgram("FileCacheTestProg", program_source);
-  // make kernel that HAS to be compiled
-  auto kernel = cache.getKernelInstantiation("my_kernel", program, {"4", "int"});
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-
-TEST_F(JitCacheTest, FileCacheKernelTest)
-{
-  // Brand new cache object that has nothing in the in-memory cache
-  cudf::jit::cudfJitCache cache;
-
-  // Single value column
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{125, 0}};
-
-  // make program
-  auto program = cache.getProgram("FileCacheTestProg", program_source);
-  // make kernel that should NOT need to be compiled
-  auto kernel = cache.getKernelInstantiation("my_kernel", program, {"3", "int"});
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-#endif
-
-}  // namespace test
-}  // namespace cudf
-
-CUDF_TEST_PROGRAM_MAIN()
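The key property these deleted tests checked is worth stating plainly: a lookup under an existing key must return the cached artifact and never invoke the new builder. Reduced to a toy std::map cache (a hypothetical standalone sketch, unrelated to the real cudfJitCache API), the MemoryCacheKernelTest logic looks like this:

```cpp
// Sketch of the "rename to force a cache hit" trick used above, reduced to a
// plain map-based cache. If the lookup ignores the new builder and returns
// the old entry, the cached value (125), not the recompiled one (625), is
// observed, which is exactly what the deleted test asserted.
#include <cassert>
#include <functional>
#include <map>
#include <string>

struct Cache {
  std::map<std::string, int> entries;  // name -> "compiled" result
  int get_or_build(std::string const& name, std::function<int()> build)
  {
    auto it = entries.find(name);
    if (it != entries.end()) return it->second;  // cache hit: builder ignored
    return entries[name] = build();
  }
};

int main()
{
  Cache cache;
  cache.get_or_build("prog", [] { return 125; });  // warm the cache
  // Re-request the same key with a *different* builder; a correct cache
  // returns the original entry and never runs the new builder.
  int const v = cache.get_or_build("prog", [] { return 625; });
  assert(v == 125);
  return 0;
}
```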
diff --git a/cpp/tests/jit/jit-cache-test.hpp b/cpp/tests/jit/jit-cache-test.hpp
deleted file mode 100644
index 261cc0fd3b4..00000000000
--- a/cpp/tests/jit/jit-cache-test.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-
-#include
-#include
-#include
-
-#include
-
-// Note that this test does not inherit from cudf::test::BaseFixture because
-// doing so would cause the CUDA context to be created before the fork in
-// the JitCacheMultiProcessTest, where we need it to be created after the fork
-// to ensure the forked child has a context. These tests do not need the
-// memory_resource member of BaseFixture.
-struct JitCacheTest : public ::testing::Test, public cudf::jit::cudfJitCache {
-  JitCacheTest() : grid(1), block(1) {}
-
-  virtual ~JitCacheTest() {}
-
-  virtual void SetUp()
-  {
-    purgeFileCache();
-    warmUp();
-  }
-
-  virtual void TearDown() { purgeFileCache(); }
-
-  void purgeFileCache()
-  {
-#if defined(JITIFY_USE_CACHE)
-    // In the multi-process test there are two processes repeatedly creating and deleting the
-    // cache. While deleting it, we cannot use `filesystem::remove_all(cudf::jit::getCacheDir())`
-    // because that would recursively remove all files within the cache directory and then finally
-    // remove the directory itself; a non-empty directory cannot be removed and throws an
-    // exception. On slower disks there are times when one process is deleting the cache while the
-    // other is creating it: just as the deleting process finishes removing the directory's
-    // contents and is about to remove the directory itself, the other process can create a new
-    // cache file in it, and the now non-empty directory makes the removal throw.
-
-    // By walking the cache directory and deleting only the cache files, we leave the directory
-    // alone, so the scenario above cannot occur.
-    std::vector<boost::filesystem::path> file_paths;
-    for (auto& path : boost::filesystem::recursive_directory_iterator(cudf::jit::getCacheDir())) {
-      if (boost::filesystem::is_regular_file(path)) { file_paths.push_back(path); }
-    }
-    for (auto& file_path : file_paths) { boost::filesystem::remove(file_path); }
-#endif
-  }
-
-  void warmUp()
-  {
-    // Prime the cache so that both the in-memory and file caches are populated
-
-    // Single value column
-    auto column = cudf::test::fixed_width_column_wrapper<int>({4, 0});
-    auto expect = cudf::test::fixed_width_column_wrapper<int>({64, 0});
-
-    // make program
-    auto program = getProgram("MemoryCacheTestProg", program_source);
-    // make kernel
-    auto kernel = getKernelInstantiation("my_kernel", program, {"3", "int"});
-    (*std::get<1>(kernel))
-      .configure(grid, block)
-      .launch(column.operator cudf::mutable_column_view().data<int>());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-  }
-
-  const char* program_source =
-    "my_program\n"
-    "template<int N, typename T>\n"
-    "__global__\n"
-    "void my_kernel(T* data) {\n"
-    "    T data0 = data[0];\n"
-    "    for( int i=0; i<N-1; ++i ) {\n"
-    "        data[0] *= data0;\n"
-    "    }\n"
-    "}\n";
-
-  const char* program2_source =
-    "my_program\n"
-    "template<int N, typename T>\n"
-    "__global__\n"
-    "void my_kernel(T* data) {\n"
-    "    T data0 = data[0];\n"
-    "    for( int i=0; i<N; ++i ) {\n"
-    "        data[0] *= data0;\n"
-    "    }\n"
-    "}\n";
-
-  const char* program3_source =
-    "my_program\n"
-    "template<int N, typename T>\n"
-    "__global__\n"
-    "void my_kernel(T* data, T* out) {\n"
-    "    T data0 = data[0];\n"
-    "    for( int i=0; i<N; ++i ) {\n"
-    "        out[0] *= data0;\n"
-    "    }\n"
-    "}\n";
-
-  dim3 grid;
-  dim3 block;
-};
-
-// The multiprocess fixture must not warm up the cache in SetUp: doing so
-// would launch a kernel and create the CUDA context before the fork.
-struct JitCacheMultiProcessTest : public JitCacheTest {
-  virtual void SetUp() { purgeFileCache(); }
-};
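One detail of the deleted fixture deserves a note: purgeFileCache removes only regular files and deliberately leaves the cache directory in place, so a concurrent writer can never race a directory removal. The same strategy in a standalone C++17 sketch (std::filesystem in place of Boost; `cache_dir` is a placeholder for wherever the JIT cache lives):

```cpp
// Files-only cache purge, per the deleted fixture's comment: remove the cache
// files but never the directory itself, so a concurrent process creating a
// file cannot collide with a directory removal.
#include <filesystem>
#include <vector>

void purge_cache_files(std::filesystem::path const& cache_dir)
{
  namespace fs = std::filesystem;
  std::vector<fs::path> files;
  for (auto const& entry : fs::recursive_directory_iterator(cache_dir)) {
    if (entry.is_regular_file()) { files.push_back(entry.path()); }
  }
  // Collect first, delete second: removing entries while iterating would
  // invalidate the directory iterator.
  for (auto const& f : files) { fs::remove(f); }
}
```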
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ ... @@ TEST_F(RollingTestUdf, StaticWindow)
   std::unique_ptr<cudf::column> output;
 
-  auto start = cudf::detail::make_counting_transform_iterator(0, [size] __device__(size_type row) {
+  auto start = cudf::detail::make_counting_transform_iterator(0, [size](size_type row) {
     return std::accumulate(thrust::make_counting_iterator(std::max(0, row - 2 + 1)),
                            thrust::make_counting_iterator(std::min(size, row + 2 + 1)),
                            0);
   });
 
-  auto valid = cudf::detail::make_counting_transform_iterator(0, [size] __device__(size_type row) {
-    return (row != 0 && row != size - 2 && row != size - 1);
-  });
+  auto valid = cudf::detail::make_counting_transform_iterator(
+    0, [size](size_type row) { return (row != 0 && row != size - 2 && row != size - 1); });
 
   fixed_width_column_wrapper<int64_t> expected{start, start + size, valid};
 
@@ -895,7 +894,7 @@ TEST_F(RollingTestUdf, StaticWindow)
   auto cuda_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::CUDA, this->cuda_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, 2, 2, 4, cuda_udf_agg));
+  output = cudf::rolling_window(input, 2, 2, 4, cuda_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 
@@ -903,7 +902,7 @@ TEST_F(RollingTestUdf, StaticWindow)
   auto ptx_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::PTX, this->ptx_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, 2, 2, 4, ptx_udf_agg));
+  output = cudf::rolling_window(input, 2, 2, 4, ptx_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 }
 
@@ -941,7 +940,7 @@ TEST_F(RollingTestUdf, DynamicWindow)
   auto cuda_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::CUDA, this->cuda_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, preceding, following, 2, cuda_udf_agg));
+  output = cudf::rolling_window(input, preceding, following, 2, cuda_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 
@@ -949,7 +948,7 @@ TEST_F(RollingTestUdf, DynamicWindow)
   auto ptx_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::PTX, this->ptx_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, preceding, following, 2, ptx_udf_agg));
+  output = cudf::rolling_window(input, preceding, following, 2, ptx_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 }
 
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index a54c86405a5..5205124c129 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -32,7 +32,7 @@
 #include
 #include
 
-#include
+#include
 
 #include
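Finally, on the rolling_test.cpp hunks above: the `__device__` qualifiers were dropped because these lambdas only build expected values on the host via std::accumulate. A standalone sketch of that reference computation follows, assuming, as the test lambdas do, an input column of 0, 1, ..., size-1 and a window of 2 preceding and 2 following rows:

```cpp
// Host-side reference for the rolling sum checked above. Mirrors the test
// lambda: sum input rows in [row - 2 + 1, row + 2 + 1) clamped to [0, size),
// where input[i] == i by assumption.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> expected_rolling_sum(int size)
{
  std::vector<int64_t> out(static_cast<size_t>(size));
  for (int row = 0; row < size; ++row) {
    int const lo = std::max(0, row - 2 + 1);     // 2 preceding, current row included
    int const hi = std::min(size, row + 2 + 1);  // 2 following; hi is exclusive
    int64_t sum  = 0;
    for (int i = lo; i < hi; ++i) sum += i;      // input[i] == i by assumption
    out[static_cast<size_t>(row)] = sum;
  }
  return out;
}
```

Dropping EXPECT_NO_THROW around these calls is also a readability win: an uncaught exception still fails the test, but the failure now surfaces with the real exception message instead of a generic "statement threw".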