diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 03d32959c1a..b0c8c255aef 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -137,8 +137,8 @@ include(cmake/thirdparty/CUDF_GetDLPack.cmake)
 include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake)
 # find or install GoogleTest
 include(cmake/thirdparty/CUDF_GetGTest.cmake)
-# Stringify libcudf and libcudacxx headers used in JIT operations
-include(cmake/Modules/StringifyJITHeaders.cmake)
+# preprocess jitify-able kernels
+include(cmake/Modules/JitifyPreprocessKernels.cmake)
 # find cuFile
 include(cmake/Modules/FindcuFile.cmake)
 
@@ -153,9 +153,6 @@ add_library(cudf
     src/ast/transform.cu
     src/binaryop/binaryop.cpp
     src/binaryop/compiled/binary_ops.cu
-    src/binaryop/jit/code/kernel.cpp
-    src/binaryop/jit/code/operation.cpp
-    src/binaryop/jit/code/traits.cpp
     src/labeling/label_bins.cu
     src/bitmask/null_mask.cu
     src/column/column.cu
@@ -256,7 +253,6 @@ add_library(cudf
     src/io/utilities/parsing_utils.cu
     src/io/utilities/type_conversion.cpp
     src/jit/cache.cpp
-    src/jit/launcher.cpp
     src/jit/parser.cpp
     src/jit/type.cpp
     src/join/cross_join.cu
@@ -302,8 +298,6 @@ add_library(cudf
     src/reshape/interleave_columns.cu
     src/reshape/tile.cu
     src/rolling/grouped_rolling.cu
-    src/rolling/jit/code/kernel.cpp
-    src/rolling/jit/code/operation.cpp
    src/rolling/rolling.cu
    src/round/round.cu
    src/scalar/scalar.cpp
@@ -387,7 +381,6 @@ add_library(cudf
    src/text/tokenize.cu
    src/transform/bools_to_mask.cu
    src/transform/encode.cu
-    src/transform/jit/code/kernel.cpp
    src/transform/mask_to_bools.cu
    src/transform/nans_to_nulls.cu
    src/transform/row_bit_count.cu
@@ -465,7 +458,7 @@ endif()
 target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
 
-# Compile stringified JIT sources first
-add_dependencies(cudf stringify_run)
+# Run jitify preprocessing before compiling cudf
+add_dependencies(cudf jitify_preprocess_run)
 
 # Specify the target module library dependencies
 target_link_libraries(cudf
@@ -476,9 +469,9 @@ target_link_libraries(cudf
                       rmm::rmm)
 
 if(CUDA_STATIC_RUNTIME)
-  target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart_static CUDA::cuda_driver)
+  target_link_libraries(cudf PUBLIC CUDA::cudart_static CUDA::cuda_driver)
 else()
-  target_link_libraries(cudf PUBLIC CUDA::nvrtc CUDA::cudart CUDA::cuda_driver)
+  target_link_libraries(cudf PUBLIC CUDA::cudart CUDA::cuda_driver)
 endif()
 
 # Add cuFile interface if available
diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
new file mode 100644
index 00000000000..7bb5b1d0a14
--- /dev/null
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -0,0 +1,70 @@
+#=============================================================================
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#============================================================================= + +cmake_minimum_required(VERSION 3.18) + +file(MAKE_DIRECTORY "${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files") + +# Create `jitify_preprocess` executable +project(jitify_preprocess VERSION 2.0 LANGUAGES CXX CUDA) +add_executable(jitify_preprocess "${JITIFY_INCLUDE_DIR}/jitify2_preprocess.cpp") + +target_link_libraries(jitify_preprocess CUDA::cudart ${CMAKE_DL_LIBS}) + +function(jit_preprocess_files) + cmake_parse_arguments(ARG + "" + "SOURCE_DIRECTORY" + "FILES" + ${ARGN} + ) + + foreach(ARG_FILE ${ARG_FILES}) + set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit) + list(APPEND JIT_PREPROCESSED_FILES "${ARG_OUTPUT}") + add_custom_command(WORKING_DIRECTORY ${ARG_SOURCE_DIRECTORY} + DEPENDS jitify_preprocess + OUTPUT ${ARG_OUTPUT} + VERBATIM + COMMAND jitify_preprocess ${ARG_FILE} + -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files + -v + -i + -m + -std=c++14 + -remove-unused-globals + -D__CUDACC_RTC__ + -I${CUDF_SOURCE_DIR}/include + -I${CUDF_SOURCE_DIR}/src + -I${LIBCUDACXX_INCLUDE_DIR} + -I${CUDAToolkit_INCLUDE_DIRS} + --no-preinclude-workarounds + --no-replace-pragma-once + ) + endforeach() + set(JIT_PREPROCESSED_FILES "${JIT_PREPROCESSED_FILES}" PARENT_SCOPE) +endfunction() + +jit_preprocess_files(SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src + FILES binaryop/jit/kernel.cu + transform/jit/kernel.cu + rolling/jit/kernel.cu + ) + +add_custom_target(jitify_preprocess_run DEPENDS ${JIT_PREPROCESSED_FILES}) + +file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") +file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") diff --git a/cpp/cmake/Modules/StringifyJITHeaders.cmake b/cpp/cmake/Modules/StringifyJITHeaders.cmake deleted file mode 100644 index d67c546cd30..00000000000 --- a/cpp/cmake/Modules/StringifyJITHeaders.cmake +++ /dev/null @@ -1,172 +0,0 @@ -#============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -file(MAKE_DIRECTORY "${CUDF_GENERATED_INCLUDE_DIR}/include") - -# Create `stringify` executable -add_executable(stringify "${JITIFY_INCLUDE_DIR}/stringify.cpp") - -execute_process(WORKING_DIRECTORY ${CUDF_GENERATED_INCLUDE_DIR} - COMMAND ${CMAKE_COMMAND} -E make_directory - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include - ) - -# Use `stringify` to convert types.h to c-str for use in JIT code -add_custom_command(WORKING_DIRECTORY ${CUDF_SOURCE_DIR}/include - COMMENT "Stringify headers for use in JIT compiled code" - DEPENDS stringify - OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.h.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/bit.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/timestamps.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/fixed_point.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/durations.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/assert.cuh.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/limits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/version.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__undef_macros.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/limits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/version.jit - MAIN_DEPENDENCY ${CUDF_SOURCE_DIR}/include/cudf/types.h - ${CUDF_SOURCE_DIR}/include/cudf/types.hpp - ${CUDF_SOURCE_DIR}/include/cudf/utilities/bit.hpp - ${CUDF_SOURCE_DIR}/include/cudf/wrappers/timestamps.hpp - 
${CUDF_SOURCE_DIR}/include/cudf/fixed_point/fixed_point.hpp - ${CUDF_SOURCE_DIR}/include/cudf/wrappers/durations.hpp - ${CUDF_SOURCE_DIR}/include/cudf/detail/utilities/assert.cuh - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/chrono - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/climits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstddef - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstdint - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ctime - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/limits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ratio - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/type_traits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/version - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__config - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_pop - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_push - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__config - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_pop - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_push - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__undef_macros - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/chrono - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/climits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstddef - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstdint - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ctime - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/limits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ratio - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/type_traits - ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/version - - # stringified headers are placed underneath the bin include jit directory and end in ".jit" - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/types.h > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.h.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/types.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/utilities/bit.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/bit.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ../src/rolling/rolling_jit_detail.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/rolling_jit_detail.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/wrappers/timestamps.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/timestamps.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/fixed_point/fixed_point.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/fixed_point.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/wrappers/durations.hpp > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/durations.hpp.jit - COMMAND ${CUDF_BINARY_DIR}/stringify cudf/detail/utilities/assert.cuh > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/assert.cuh.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/chrono cuda_std_chrono > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/chrono.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/climits cuda_std_climits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/climits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstddef cuda_std_cstddef > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstddef.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/cstdint cuda_std_cstdint > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstdint.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ctime cuda_std_ctime > 
${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ctime.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/limits cuda_std_limits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/limits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/ratio cuda_std_ratio > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ratio.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/type_traits cuda_std_type_traits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/type_traits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/version cuda_std_version > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/version.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__config cuda_std_detail___config > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__config.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_pop cuda_std_detail___pragma_pop > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_pop.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/__pragma_push cuda_std_detail___pragma_push > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_push.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__config cuda_std_detail_libcxx_include___config > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__config.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_pop cuda_std_detail_libcxx_include___pragma_pop > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_pop.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__pragma_push cuda_std_detail_libcxx_include___pragma_push > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_push.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/__undef_macros cuda_std_detail_libcxx_include___undef_macros > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__undef_macros.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/chrono cuda_std_detail_libcxx_include_chrono > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/chrono.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/climits cuda_std_detail_libcxx_include_climits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/climits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstddef cuda_std_detail_libcxx_include_cstddef > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstddef.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/cstdint cuda_std_detail_libcxx_include_cstdint > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstdint.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ctime cuda_std_detail_libcxx_include_ctime > 
${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ctime.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/limits cuda_std_detail_libcxx_include_limits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/limits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/ratio cuda_std_detail_libcxx_include_ratio > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ratio.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/type_traits cuda_std_detail_libcxx_include_type_traits > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/type_traits.jit - COMMAND ${CUDF_BINARY_DIR}/stringify ${LIBCUDACXX_INCLUDE_DIR}/cuda/std/detail/libcxx/include/version cuda_std_detail_libcxx_include_version > ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/version.jit - ) - -add_custom_target(stringify_run DEPENDS - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.h.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/types.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/bit.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/timestamps.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/fixed_point.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/durations.hpp.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/assert.cuh.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/limits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/version.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__config.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_pop.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__pragma_push.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/__undef_macros.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/chrono.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/climits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstddef.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/cstdint.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ctime.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/limits.jit - 
${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/ratio.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/type_traits.jit - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit/libcudacxx/cuda/std/detail/libcxx/include/version.jit - ) - -################################################################################################### -# - copy libcu++ ---------------------------------------------------------------------------------- - -# `${LIBCUDACXX_INCLUDE_DIR}/` specifies that the contents of this directory will be installed (not the directory itself) -file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") -file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") diff --git a/cpp/cmake/thirdparty/CUDF_GetJitify.cmake b/cpp/cmake/thirdparty/CUDF_GetJitify.cmake index e041be26d64..6e853816ec5 100644 --- a/cpp/cmake/thirdparty/CUDF_GetJitify.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetJitify.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,9 +18,9 @@ function(find_and_configure_jitify) CPMFindPackage(NAME jitify - VERSION 1.0.0 + VERSION 2.0.0 GIT_REPOSITORY https://github.com/rapidsai/jitify.git - GIT_TAG cudf_0.16 + GIT_TAG cudf_0.19 GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE) set(JITIFY_INCLUDE_DIR "${jitify_SOURCE_DIR}" PARENT_SCOPE) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 9557670e967..55a7f7a9f1b 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -18,20 +18,13 @@ */ #include "compiled/binary_ops.hpp" -#include "jit/code/code.h" #include "jit/util.hpp" -#include -#include -#include +#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include #include @@ -57,6 +50,7 @@ namespace cudf { namespace binops { namespace detail { + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -79,70 +73,47 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, namespace jit { -const std::string hash = "prog_binop"; - -const std::vector header_names{"operation.h", - "traits.h", - cudf_types_hpp, - cudf_utilities_bit_hpp, - cudf_wrappers_timestamps_hpp, - cudf_wrappers_durations_hpp, - cudf_fixed_point_fixed_point_hpp, - cudf_detail_utilities_assert_cuh}; - -std::istream* headers_code(std::string filename, std::iostream& stream) -{ - if (filename == "operation.h") { - stream << code::operation; - return &stream; - } - if (filename == "traits.h") { - stream << code::traits; - return &stream; - } - auto it = cudf::jit::stringified_headers.find(filename); - if (it != cudf::jit::stringified_headers.end()) { - return cudf::jit::send_stringified_header(stream, it->second); - } - return nullptr; -} - void binary_operation(mutable_column_view& out, - scalar const& lhs, - column_view const& rhs, + column_view const& lhs, + scalar const& rhs, binary_operator op, + OperatorType op_type, rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s_with_validity", // name of the 
kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(rhs.type()), - cudf::jit::get_type_name(lhs.type()), - get_operator_name(op, OperatorType::Reverse)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(rhs), - cudf::jit::get_data_ptr(lhs), - out.null_mask(), - rhs.null_mask(), - rhs.offset(), - lhs.is_valid()); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_s_with_validity") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, op_type)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs), + out.null_mask(), + lhs.null_mask(), + lhs.offset(), + rhs.is_valid()); } else { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(rhs.type()), - cudf::jit::get_type_name(lhs.type()), - get_operator_name(op, OperatorType::Reverse)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(rhs), - cudf::jit::get_data_ptr(lhs)); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_s") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, op_type)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs)); } } @@ -152,37 +123,16 @@ void binary_operation(mutable_column_view& out, binary_operator op, rmm::cuda_stream_view stream) { - if (is_null_dependent(op)) { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s_with_validity", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs), - out.null_mask(), - lhs.null_mask(), - lhs.offset(), - rhs.is_valid()); - } else { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_s", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); - } + return binary_operation(out, lhs, rhs, op, OperatorType::Direct, stream); +} + +void binary_operation(mutable_column_view& out, + scalar 
const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + return binary_operation(out, rhs, lhs, op, OperatorType::Reverse, stream); } void binary_operation(mutable_column_view& out, @@ -192,36 +142,40 @@ void binary_operation(mutable_column_view& out, rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_v_with_validity", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs), - out.null_mask(), - lhs.null_mask(), - rhs.offset(), - rhs.null_mask(), - rhs.offset()); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_v_with_validity") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, OperatorType::Direct)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs), + out.null_mask(), + lhs.null_mask(), + rhs.offset(), + rhs.null_mask(), + rhs.offset()); } else { - cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_v", // name of the kernel we are - // launching - {cudf::jit::get_type_name(out.type()), // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(op, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // + .instantiate(cudf::jit::get_type_name(out.type()), // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(op, OperatorType::Direct)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs)); } } @@ -234,23 +188,24 @@ void binary_operation(mutable_column_view& out, std::string const output_type_name = cudf::jit::get_type_name(out.type()); std::string ptx_hash = - hash + "." + std::to_string(std::hash{}(ptx + output_type_name)); + "prog_binop." 
+ std::to_string(std::hash{}(ptx + output_type_name)); std::string cuda_source = - "\n#include \n" + - cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name) + code::kernel; - - cudf::jit::launcher( - ptx_hash, cuda_source, header_names, cudf::jit::compiler_flags, headers_code, stream) - .set_kernel_inst("kernel_v_v", // name of the kernel - // we are launching - {output_type_name, // list of template arguments - cudf::jit::get_type_name(lhs.type()), - cudf::jit::get_type_name(rhs.type()), - get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct)}) - .launch(out.size(), - cudf::jit::get_data_ptr(out), - cudf::jit::get_data_ptr(lhs), - cudf::jit::get_data_ptr(rhs)); + cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name); + + std::string kernel_name = + jitify2::reflection::Template("cudf::binops::jit::kernel_v_v") // + .instantiate(output_type_name, // list of template arguments + cudf::jit::get_type_name(lhs.type()), + cudf::jit::get_type_name(rhs.type()), + get_operator_name(binary_operator::GENERIC_BINARY, OperatorType::Direct)); + + cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit) + .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}) // + ->configure_1d_max_occupancy(0, 0, 0, stream.value()) // + ->launch(out.size(), + cudf::jit::get_data_ptr(out), + cudf::jit::get_data_ptr(lhs), + cudf::jit::get_data_ptr(rhs)); } } // namespace jit diff --git a/cpp/src/binaryop/jit/code/kernel.cpp b/cpp/src/binaryop/jit/code/kernel.cpp deleted file mode 100644 index cfa1f1f82d2..00000000000 --- a/cpp/src/binaryop/jit/code/kernel.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * Copyright 2018 Rommel Quintanilla - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace cudf { -namespace binops { -namespace jit { -namespace code { - -// clang-format off -const char* kernel = - R"***( - #include "operation.h" - - #include - #include - #include - #include - #include - - template - __global__ - void kernel_v_s_with_validity(cudf::size_type size, TypeOut* out_data, TypeLhs* lhs_data, - TypeRhs* rhs_data, cudf::bitmask_type* output_mask, - cudf::bitmask_type const* mask, - cudf::size_type offset, bool scalar_valid) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i( - lhs_data[i], rhs_data[0], - mask ? 
cudf::bit_is_set(mask, offset + i) : true, scalar_valid, output_valid); - if (output_mask && !output_valid) cudf::clear_bit(output_mask, i); - } - } - - template - __global__ - void kernel_v_s(cudf::size_type size, - TypeOut* out_data, TypeLhs* lhs_data, TypeRhs* rhs_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i(lhs_data[i], rhs_data[0]); - } - } - - template - __global__ - void kernel_v_v(cudf::size_type size, - TypeOut* out_data, TypeLhs* lhs_data, TypeRhs* rhs_data) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i(lhs_data[i], rhs_data[i]); - } - } - - template - __global__ - void kernel_v_v_with_validity(cudf::size_type size, TypeOut* out_data, TypeLhs* lhs_data, - TypeRhs* rhs_data, cudf::bitmask_type* output_mask, - cudf::bitmask_type const* lhs_mask, cudf::size_type lhs_offset, - cudf::bitmask_type const* rhs_mask, cudf::size_type rhs_offset) { - int tid = threadIdx.x; - int blkid = blockIdx.x; - int blksz = blockDim.x; - int gridsz = gridDim.x; - - int start = tid + blkid * blksz; - int step = blksz * gridsz; - - for (cudf::size_type i=start; i( - lhs_data[i], rhs_data[i], - lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true, - rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true, - output_valid); - if (output_mask && !output_valid) cudf::clear_bit(output_mask, i); - } - } -)***"; -// clang-format on - -} // namespace code -} // namespace jit -} // namespace binops -} // namespace cudf diff --git a/cpp/src/binaryop/jit/code/operation.cpp b/cpp/src/binaryop/jit/code/operation.cpp deleted file mode 100644 index 938ab0614d4..00000000000 --- a/cpp/src/binaryop/jit/code/operation.cpp +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace cudf { -namespace binops { -namespace jit { -namespace code { - -const char* operation = - R"***( - #pragma once - - #include "traits.h" - - #include - - #include - - using namespace cuda::std; - - struct Add { - // Allow sum between chronos only when both input and output types - // are chronos. Unsupported combinations will fail to compile - template && - is_chrono_v && - is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return x + y; - } - - template || - !is_chrono_v || - !is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) + static_cast(y)); - } - }; - - using RAdd = Add; - - struct Sub { - // Allow difference between chronos only when both input and output types - // are chronos. 
Unsupported combinations will fail to compile - template && - is_chrono_v && - is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return x - y; - } - - template || - !is_chrono_v || - !is_chrono_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) - static_cast(y)); - } - }; - - struct RSub { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return Sub::operate(y, x); - } - }; - - struct Mul { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) * static_cast(y)); - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return DurationProduct(x, y); - } - - template && is_integral_v) || - (is_integral_v && is_duration_v)>* = nullptr> - static TypeOut DurationProduct(TypeLhs x, TypeRhs y) { - return x * y; - } - }; - - using RMul = Mul; - - struct Div { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) / static_cast(y)); - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return DurationDivide(x, y); - } - - template || is_duration_v)>* = nullptr> - static TypeOut DurationDivide(TypeLhs x, TypeRhs y) { - return x / y; - } - }; - - struct RDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return Div::operate(y, x); - } - }; - - struct TrueDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) / static_cast(y)); - } - }; - - struct RTrueDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return TrueDiv::operate(y, x); - } - }; - - struct FloorDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return floor(static_cast(x) / static_cast(y)); - } - }; - - struct RFloorDiv { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return FloorDiv::operate(y, x); - } - }; - - struct Mod { - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using TypeCommon = typename common_type::type; - return static_cast(static_cast(x) % static_cast(y)); - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return static_cast(fmodf(static_cast(x), static_cast(y))); - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return static_cast(fmod(static_cast(x), static_cast(y))); - } - - template && is_duration_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return x % y; - } - }; - - struct RMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return Mod::operate(y, x); - } - }; - - struct PyMod { - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return ((x % y) + y) % y; - } - - template )>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - double x1 = static_cast(x); - double y1 = static_cast(y); - return fmod(fmod(x1, y1) + y1, y1); - } - - template && is_duration_v)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - return ((x % y) + y) % y; - } - }; - - struct RPyMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return PyMod::operate(y, x); - } - }; - - struct Pow { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return pow(static_cast(x), static_cast(y)); - } - }; - - struct RPow { - template - static 
TypeOut operate(TypeLhs x, TypeRhs y) { - return Pow::operate(y, x); - } - }; - - struct Equal { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x == y); - } - }; - - using REqual = Equal; - - struct NotEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x != y); - } - }; - - using RNotEqual = NotEqual; - - struct Less { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x < y); - } - }; - - struct RLess { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y < x); - } - }; - - struct Greater { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x > y); - } - }; - - struct RGreater { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y > x); - } - }; - - struct LessEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x <= y); - } - }; - - struct RLessEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y <= x); - } - }; - - struct GreaterEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x >= y); - } - }; - - struct RGreaterEqual { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y >= x); - } - }; - - struct BitwiseAnd { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) & static_cast(y)); - } - }; - - using RBitwiseAnd = BitwiseAnd; - - struct BitwiseOr { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) | static_cast(y)); - } - }; - - using RBitwiseOr = BitwiseOr; - - struct BitwiseXor { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast(x) ^ static_cast(y)); - } - }; - - using RBitwiseXor = BitwiseXor; - - struct LogicalAnd { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x && y); - } - }; - - using RLogicalAnd = LogicalAnd; - - struct LogicalOr { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x || y); - } - }; - - using RLogicalOr = LogicalOr; - - struct UserDefinedOp { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - TypeOut output; - using TypeCommon = typename common_type::type; - GENERIC_BINARY_OP(&output, static_cast(x), static_cast(y)); - return output; - } - }; - - struct ShiftLeft { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x << y); - } - }; - - struct RShiftLeft { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y << x); - } - }; - - struct ShiftRight { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (x >> y); - } - }; - - struct RShiftRight { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (y >> x); - } - }; - - struct ShiftRightUnsigned { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast>(x) >> y); - } - }; - - struct RShiftRightUnsigned { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (static_cast>(y) >> x); - } - }; - - struct LogBase { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return (std::log(static_cast(x)) / std::log(static_cast(y))); - } - }; - - struct RLogBase { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return LogBase::operate(y, x); - } - }; - - struct NullEquals { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - if (!lhs_valid && !rhs_valid) return true; - if (lhs_valid && rhs_valid) return x == y; - return false; - } - }; 
- - struct RNullEquals { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - return NullEquals::operate(y, x, rhs_valid, lhs_valid, - output_valid); - } - }; - - struct NullMax { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - if (!lhs_valid && !rhs_valid) { - output_valid = false; - return TypeOut{}; - } else if (lhs_valid && rhs_valid) { - return (TypeOut{x} > TypeOut{y}) ? TypeOut{x} : TypeOut{y}; - } else if (lhs_valid) return TypeOut{x}; - else return TypeOut{y}; - } - }; - - struct RNullMax { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - return NullMax::operate(y, x, rhs_valid, lhs_valid, - output_valid); - } - }; - - struct NullMin { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - output_valid = true; - if (!lhs_valid && !rhs_valid) { - output_valid = false; - return TypeOut{}; - } else if (lhs_valid && rhs_valid) { - return (TypeOut{x} < TypeOut{y}) ? TypeOut{x} : TypeOut{y}; - } else if (lhs_valid) return TypeOut{x}; - else return TypeOut{y}; - } - }; - - struct RNullMin { - template - static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, - bool& output_valid) { - return NullMin::operate(y, x, rhs_valid, lhs_valid, - output_valid); - } - }; - - struct PMod { - // Ideally, these two specializations - one for integral types and one for non integral - // types shouldn't be required, as std::fmod should promote integral types automatically - // to double and call the std::fmod overload for doubles. Sadly, doing this in jitified - // code does not work - it is having trouble deciding between float/double overloads - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using common_t = typename cuda::std::common_type::type; - common_t xconv{x}; - common_t yconv{y}; - auto rem = xconv % yconv; - if (rem < 0) rem = (rem + yconv) % yconv; - return TypeOut{rem}; - } - - template ::type>)>* = nullptr> - static TypeOut operate(TypeLhs x, TypeRhs y) { - using common_t = typename cuda::std::common_type::type; - common_t xconv{x}; - common_t yconv{y}; - auto rem = std::fmod(xconv, yconv); - if (rem < 0) rem = std::fmod(rem + yconv, yconv); - return TypeOut{rem}; - } - }; - - struct RPMod { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return PMod::operate(y, x); - } - }; - - struct ATan2 { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return TypeOut{std::atan2(double{x}, double{y})}; - } - }; - - struct RATan2 { - template - static TypeOut operate(TypeLhs x, TypeRhs y) { - return TypeOut{ATan2::operate(y, x)}; - } - }; -)***"; - -} // namespace code -} // namespace jit -} // namespace binops -} // namespace cudf diff --git a/cpp/src/binaryop/jit/code/traits.cpp b/cpp/src/binaryop/jit/code/traits.cpp deleted file mode 100644 index 53b980b1a02..00000000000 --- a/cpp/src/binaryop/jit/code/traits.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Copyright 2018-2019 BlazingDB, Inc. - * Copyright 2018 Christian Noboa Mardini - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cudf {
-namespace binops {
-namespace jit {
-namespace code {
-const char* traits =
-  R"***(
-    #pragma once
-
-    // Include Jitify's cstddef header first
-    #include
-
-    #include
-    #include
-    #include
-    #include
-
-    #include
-    #include
-
-    // -------------------------------------------------------------------------
-    // type_traits cannot tell the difference between float and double
-    template <typename T>
-    constexpr bool isFloat = false;
-
-    template <typename T>
-    constexpr bool is_timestamp_v =
-      cuda::std::is_same<cudf::timestamp_D, T>::value ||
-      cuda::std::is_same<cudf::timestamp_s, T>::value ||
-      cuda::std::is_same<cudf::timestamp_ms, T>::value ||
-      cuda::std::is_same<cudf::timestamp_us, T>::value ||
-      cuda::std::is_same<cudf::timestamp_ns, T>::value;
-
-    template <typename T>
-    constexpr bool is_duration_v =
-      cuda::std::is_same<cudf::duration_D, T>::value ||
-      cuda::std::is_same<cudf::duration_s, T>::value ||
-      cuda::std::is_same<cudf::duration_ms, T>::value ||
-      cuda::std::is_same<cudf::duration_us, T>::value ||
-      cuda::std::is_same<cudf::duration_ns, T>::value;
-
-    template <typename T>
-    constexpr bool is_chrono_v = is_timestamp_v<T> || is_duration_v<T>;
-
-    template <>
-    constexpr bool isFloat<float> = true;
-
-    template <typename T>
-    constexpr bool isDouble = false;
-
-    template <>
-    constexpr bool isDouble<double> = true;
-)***";
-
-} // namespace code
-} // namespace jit
-} // namespace binops
-} // namespace cudf
diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu
new file mode 100644
index 00000000000..fcfe16f979d
--- /dev/null
+++ b/cpp/src/binaryop/jit/kernel.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ * Copyright 2018 Rommel Quintanilla <rommel@blazingdb.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace cudf {
+namespace binops {
+namespace jit {
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_s_with_validity(cudf::size_type size,
+                                         TypeOut* out_data,
+                                         TypeLhs* lhs_data,
+                                         TypeRhs* rhs_data,
+                                         cudf::bitmask_type* output_mask,
+                                         cudf::bitmask_type const* mask,
+                                         cudf::size_type offset,
+                                         bool scalar_valid)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    bool output_valid = false;
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(
+      lhs_data[i],
+      rhs_data[0],
+      mask ? cudf::bit_is_set(mask, offset + i) : true,
+      scalar_valid,
+      output_valid);
+    if (output_mask && !output_valid) cudf::clear_bit(output_mask, i);
+  }
+}
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_s(cudf::size_type size,
+                           TypeOut* out_data,
+                           TypeLhs* lhs_data,
+                           TypeRhs* rhs_data)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(lhs_data[i], rhs_data[0]);
+  }
+}
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_v(cudf::size_type size,
+                           TypeOut* out_data,
+                           TypeLhs* lhs_data,
+                           TypeRhs* rhs_data)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(lhs_data[i], rhs_data[i]);
+  }
+}
+
+template <typename TypeOut, typename TypeLhs, typename TypeRhs, typename TypeOpe>
+__global__ void kernel_v_v_with_validity(cudf::size_type size,
+                                         TypeOut* out_data,
+                                         TypeLhs* lhs_data,
+                                         TypeRhs* rhs_data,
+                                         cudf::bitmask_type* output_mask,
+                                         cudf::bitmask_type const* lhs_mask,
+                                         cudf::size_type lhs_offset,
+                                         cudf::bitmask_type const* rhs_mask,
+                                         cudf::size_type rhs_offset)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    bool output_valid = false;
+    out_data[i] = TypeOpe::template operate<TypeOut, TypeLhs, TypeRhs>(
+      lhs_data[i],
+      rhs_data[i],
+      lhs_mask ? cudf::bit_is_set(lhs_mask, lhs_offset + i) : true,
+      rhs_mask ? cudf::bit_is_set(rhs_mask, rhs_offset + i) : true,
+      output_valid);
+    if (output_mask && !output_valid) cudf::clear_bit(output_mask, i);
+  }
+}
+
+} // namespace jit
+} // namespace binops
+} // namespace cudf
diff --git a/cpp/src/binaryop/jit/code/code.h b/cpp/src/binaryop/jit/operation-udf.hpp
similarity index 59%
rename from cpp/src/binaryop/jit/code/code.h
rename to cpp/src/binaryop/jit/operation-udf.hpp
index b8ff9e47c31..eaab2111d98 100644
--- a/cpp/src/binaryop/jit/code/code.h
+++ b/cpp/src/binaryop/jit/operation-udf.hpp
@@ -1,8 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,15 +16,5 @@
 
 #pragma once
 
-namespace cudf {
-namespace binops {
-namespace jit {
-namespace code {
-extern const char* kernel;
-extern const char* traits;
-extern const char* operation;
-
-} // namespace code
-} // namespace jit
-} // namespace binops
-} // namespace cudf
+// This file serves as a placeholder for user-defined functions, so that jitify can override it
+// at runtime.
diff --git a/cpp/src/binaryop/jit/operation.hpp b/cpp/src/binaryop/jit/operation.hpp
new file mode 100644
index 00000000000..d117f2182f9
--- /dev/null
+++ b/cpp/src/binaryop/jit/operation.hpp
@@ -0,0 +1,646 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ * Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+#pragma once
+
+using namespace cuda::std;
+
+namespace cudf {
+namespace binops {
+namespace jit {
+
+struct Add {
+  // Allow sum between chronos only when both input and output types
+  // are chronos. Unsupported combinations will fail to compile
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(is_chrono_v<TypeOut> && is_chrono_v<TypeLhs> && is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return x + y;
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_chrono_v<TypeOut> || !is_chrono_v<TypeLhs> ||
+                         !is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) + static_cast<TypeCommon>(y));
+  }
+};
+
+using RAdd = Add;
+
+struct Sub {
+  // Allow difference between chronos only when both input and output types
+  // are chronos. Unsupported combinations will fail to compile
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(is_chrono_v<TypeOut> && is_chrono_v<TypeLhs> && is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return x - y;
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_chrono_v<TypeOut> || !is_chrono_v<TypeLhs> ||
+                         !is_chrono_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) - static_cast<TypeCommon>(y));
+  }
+};
+
+struct RSub {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Sub::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Mul {
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_duration_v<TypeOut>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) * static_cast<TypeCommon>(y));
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeOut>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return DurationProduct<TypeOut>(x, y);
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs> && is_integral_v<TypeRhs>) ||
+                        (is_integral_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut DurationProduct(TypeLhs x, TypeRhs y)
+  {
+    return x * y;
+  }
+};
+
+using RMul = Mul;
+
+struct Div {
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(!is_duration_v<TypeLhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) / static_cast<TypeCommon>(y));
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return DurationDivide<TypeOut>(x, y);
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_integral_v<TypeRhs> || is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut DurationDivide(TypeLhs x, TypeRhs y)
+  {
+    return x / y;
+  }
+};
+
+struct RDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Div::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct TrueDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<double>(x) / static_cast<double>(y));
+  }
+};
+
+struct RTrueDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return TrueDiv::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct FloorDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return floor(static_cast<double>(x) / static_cast<double>(y));
+  }
+};
+
+struct RFloorDiv {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return FloorDiv::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Mod {
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(is_integral_v<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    return static_cast<TypeOut>(static_cast<TypeCommon>(x) % static_cast<TypeCommon>(y));
+  }
+
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(isFloat<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return static_cast<TypeOut>(fmodf(static_cast<float>(x), static_cast<float>(y)));
+  }
+
+  template <
+    typename TypeOut,
+    typename TypeLhs,
+    typename TypeRhs,
+    enable_if_t<(isDouble<typename common_type<TypeOut, TypeLhs, TypeRhs>::type>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return static_cast<TypeOut>(fmod(static_cast<double>(x), static_cast<double>(y)));
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return x % y;
+  }
+};
+
+struct RMod {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Mod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct PyMod {
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_integral_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return ((x % y) + y) % y;
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_floating_point_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    double x1 = static_cast<double>(x);
+    double y1 = static_cast<double>(y);
+    return fmod(fmod(x1, y1) + y1, y1);
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_duration_v<TypeLhs> && is_duration_v<TypeRhs>)>* = nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return ((x % y) + y) % y;
+  }
+};
+
+struct RPyMod {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return PyMod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Pow {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return pow(static_cast<double>(x), static_cast<double>(y));
+  }
+};
+
+struct RPow {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return Pow::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct Equal {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x == y);
+  }
+};
+
+using REqual = Equal;
+
+struct NotEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x != y);
+  }
+};
+
+using RNotEqual = NotEqual;
+
+struct Less {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x < y);
+  }
+};
+
+struct RLess {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y < x);
+  }
+};
+
+struct Greater {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x > y);
+  }
+};
+
+struct RGreater {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y > x);
+  }
+};
+
+struct LessEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x <= y);
+  }
+};
+
+struct RLessEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y <= x);
+  }
+};
+
+struct GreaterEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x >= y);
+  }
+};
+
+struct RGreaterEqual {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y >= x);
+  }
+};
+
+struct BitwiseAnd {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<TypeOut>(x) & static_cast<TypeOut>(y));
+  }
+};
+
+using RBitwiseAnd = BitwiseAnd;
+
+struct BitwiseOr {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<TypeOut>(x) | static_cast<TypeOut>(y));
+  }
+};
+
+using RBitwiseOr = BitwiseOr;
+
+struct BitwiseXor {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<TypeOut>(x) ^ static_cast<TypeOut>(y));
+  }
+};
+
+using RBitwiseXor = BitwiseXor;
+
+struct LogicalAnd {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x && y);
+  }
+};
+
+using RLogicalAnd = LogicalAnd;
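
> Aside: `Mod` above follows C++'s truncated remainder, while `PyMod` reproduces Python's floored remainder. A minimal standalone host-side sketch (plain C++, not part of this PR) showing the difference:

```cpp
// Standalone sketch contrasting the two remainder conventions.
#include <iostream>

int c_mod(int x, int y) { return x % y; }               // like Mod: truncates toward zero
int py_mod(int x, int y) { return ((x % y) + y) % y; }  // like PyMod: follows the divisor's sign

int main()
{
  std::cout << c_mod(-7, 3) << '\n';   // prints -1
  std::cout << py_mod(-7, 3) << '\n';  // prints 2
}
```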
+
+struct LogicalOr {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x || y);
+  }
+};
+
+using RLogicalOr = LogicalOr;
+
+struct UserDefinedOp {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    TypeOut output;
+    using TypeCommon = typename common_type<TypeOut, TypeLhs, TypeRhs>::type;
+    GENERIC_BINARY_OP(&output, static_cast<TypeCommon>(x), static_cast<TypeCommon>(y));
+    return output;
+  }
+};
+
+struct ShiftLeft {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x << y);
+  }
+};
+
+struct RShiftLeft {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y << x);
+  }
+};
+
+struct ShiftRight {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (x >> y);
+  }
+};
+
+struct RShiftRight {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (y >> x);
+  }
+};
+
+struct ShiftRightUnsigned {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<make_unsigned_t<TypeLhs>>(x) >> y);
+  }
+};
+
+struct RShiftRightUnsigned {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (static_cast<make_unsigned_t<TypeRhs>>(y) >> x);
+  }
+};
+
+struct LogBase {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return (std::log(static_cast<double>(x)) / std::log(static_cast<double>(y)));
+  }
+};
+
+struct RLogBase {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return LogBase::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct NullEquals {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    if (!lhs_valid && !rhs_valid) return true;
+    if (lhs_valid && rhs_valid) return x == y;
+    return false;
+  }
+};
+
+struct RNullEquals {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    return NullEquals::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
+  }
+};
+
+struct NullMax {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    if (!lhs_valid && !rhs_valid) {
+      output_valid = false;
+      return TypeOut{};
+    } else if (lhs_valid && rhs_valid) {
+      return (TypeOut{x} > TypeOut{y}) ? TypeOut{x} : TypeOut{y};
+    } else if (lhs_valid)
+      return TypeOut{x};
+    else
+      return TypeOut{y};
+  }
+};
+
+struct RNullMax {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    return NullMax::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
+  }
+};
+
+struct NullMin {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    output_valid = true;
+    if (!lhs_valid && !rhs_valid) {
+      output_valid = false;
+      return TypeOut{};
+    } else if (lhs_valid && rhs_valid) {
+      return (TypeOut{x} < TypeOut{y}) ? TypeOut{x} : TypeOut{y};
+    } else if (lhs_valid)
+      return TypeOut{x};
+    else
+      return TypeOut{y};
+  }
+};
+
+struct RNullMin {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid)
+  {
+    return NullMin::operate<TypeOut, TypeRhs, TypeLhs>(y, x, rhs_valid, lhs_valid, output_valid);
+  }
+};
+
+struct PMod {
+  // Ideally, these two specializations - one for integral types and one for non integral
+  // types shouldn't be required, as std::fmod should promote integral types automatically
+  // to double and call the std::fmod overload for doubles. Sadly, doing this in jitified
+  // code does not work - it is having trouble deciding between float/double overloads
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_integral_v<typename common_type<TypeLhs, TypeRhs>::type>)>* =
+              nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using common_t = typename cuda::std::common_type<TypeLhs, TypeRhs>::type;
+    common_t xconv{x};
+    common_t yconv{y};
+    auto rem = xconv % yconv;
+    if (rem < 0) rem = (rem + yconv) % yconv;
+    return TypeOut{rem};
+  }
+
+  template <typename TypeOut,
+            typename TypeLhs,
+            typename TypeRhs,
+            enable_if_t<(is_floating_point_v<typename common_type<TypeLhs, TypeRhs>::type>)>* =
+              nullptr>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    using common_t = typename cuda::std::common_type<TypeLhs, TypeRhs>::type;
+    common_t xconv{x};
+    common_t yconv{y};
+    auto rem = std::fmod(xconv, yconv);
+    if (rem < 0) rem = std::fmod(rem + yconv, yconv);
+    return TypeOut{rem};
+  }
+};
+
+struct RPMod {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return PMod::operate<TypeOut, TypeRhs, TypeLhs>(y, x);
+  }
+};
+
+struct ATan2 {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return TypeOut{std::atan2(double{x}, double{y})};
+  }
+};
+
+struct RATan2 {
+  template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+  static TypeOut operate(TypeLhs x, TypeRhs y)
+  {
+    return TypeOut{ATan2::operate<TypeOut, TypeRhs, TypeLhs>(y, x)};
+  }
+};
+
+} // namespace jit
+} // namespace binops
+} // namespace cudf
diff --git a/cpp/src/binaryop/jit/traits.hpp b/cpp/src/binaryop/jit/traits.hpp
new file mode 100644
index 00000000000..1cca2b6e155
--- /dev/null
+++ b/cpp/src/binaryop/jit/traits.hpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Copyright 2018-2019 BlazingDB, Inc.
+ * Copyright 2018 Christian Noboa Mardini
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// Include Jitify's cstddef header first
+#include <cstddef>
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace binops {
+namespace jit {
+
+// -------------------------------------------------------------------------
+// type_traits cannot tell the difference between float and double
+template <typename T>
+constexpr bool isFloat = false;
+
+template <typename T>
+constexpr bool is_timestamp_v =
+  cuda::std::is_same<cudf::timestamp_D, T>::value ||
+  cuda::std::is_same<cudf::timestamp_s, T>::value ||
+  cuda::std::is_same<cudf::timestamp_ms, T>::value ||
+  cuda::std::is_same<cudf::timestamp_us, T>::value ||
+  cuda::std::is_same<cudf::timestamp_ns, T>::value;
+
+template <typename T>
+constexpr bool is_duration_v =
+  cuda::std::is_same<cudf::duration_D, T>::value ||
+  cuda::std::is_same<cudf::duration_s, T>::value ||
+  cuda::std::is_same<cudf::duration_ms, T>::value ||
+  cuda::std::is_same<cudf::duration_us, T>::value ||
+  cuda::std::is_same<cudf::duration_ns, T>::value;
+
+template <typename T>
+constexpr bool is_chrono_v = is_timestamp_v<T> || is_duration_v<T>;
+
+template <>
+constexpr bool isFloat<float> = true;
+
+template <typename T>
+constexpr bool isDouble = false;
+
+template <>
+constexpr bool isDouble<double> = true;
+
+} // namespace jit
+} // namespace binops
+} // namespace cudf
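
> Aside: as the comment in traits.hpp says, a single `is_floating_point`-style check cannot route `float` and `double` to different `fmod` overloads under NVRTC, hence the bespoke variable templates. A minimal host-side illustration (not part of this PR) of the specialization pattern they use:

```cpp
// The primary template answers false for every type; one explicit
// specialization flips the answer for exactly the type of interest.
template <typename T>
constexpr bool isDouble = false;

template <>
constexpr bool isDouble<double> = true;

static_assert(isDouble<double>, "the explicit specialization matches double");
static_assert(!isDouble<float>, "everything else falls back to the primary template");
```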
diff --git a/cpp/src/binaryop/jit/util.hpp b/cpp/src/binaryop/jit/util.hpp
index 6b4085bf11b..34c42e28a8b 100644
--- a/cpp/src/binaryop/jit/util.hpp
+++ b/cpp/src/binaryop/jit/util.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -72,11 +72,15 @@ std::string inline get_operator_name(binary_operator op, OperatorType type)
     case binary_operator::NULL_EQUALS: return "NullEquals";
     case binary_operator::NULL_MAX: return "NullMax";
     case binary_operator::NULL_MIN: return "NullMin";
-      default: return "None";
+      default: return "";
     }
     // clang-format on
   }();
-  return type == OperatorType::Direct ? operator_name : 'R' + operator_name;
+
+  if (operator_name == "") { return "None"; }
+
+  return "cudf::binops::jit::" +
+         (type == OperatorType::Direct ? operator_name : 'R' + operator_name);
 }
 
 } // namespace jit
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index 38c95da6dc7..0d1bb5a8312 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -17,8 +17,6 @@
 #include
 #include
 
-#include
-
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index c634aa8d06b..f79c82aa0db 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,20 +14,15 @@
  * limitations under the License.
  */
 
-#include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-
 #include
+#include
+#include
 
 namespace cudf {
 namespace jit {
+
 // Get the directory in home to use for storing the cache
 boost::filesystem::path get_user_home_cache_dir()
 {
@@ -62,7 +57,7 @@ boost::filesystem::path get_user_home_cache_dir()
  * are used and if $HOME is not defined, returns an empty path and file
  * caching is not used.
  */
-boost::filesystem::path getCacheDir()
+boost::filesystem::path get_cache_dir()
 {
   // The environment variable always overrides the
   // default/compile-time value of `LIBCUDF_KERNEL_CACHE_PATH`
@@ -98,158 +93,33 @@ boost::filesystem::path getCacheDir()
   return kernel_cache_path;
 }
 
-cudfJitCache::cudfJitCache() {}
-
-cudfJitCache::~cudfJitCache() {}
-
-std::mutex cudfJitCache::_kernel_cache_mutex;
-std::mutex cudfJitCache::_program_cache_mutex;
-
-named_prog<jitify::experimental::Program> cudfJitCache::getProgram(
-  std::string const& prog_name,
-  std::string const& cuda_source,
-  std::vector<std::string> const& given_headers,
-  std::vector<std::string> const& given_options,
-  jitify::experimental::file_callback_type file_callback)
-{
-  // Lock for thread safety
-  std::lock_guard<std::mutex> lock(_program_cache_mutex);
-
-  return getCached(prog_name, program_map, [&]() {
-    CUDF_EXPECTS(not cuda_source.empty(), "Program not found in cache, Needs source string.");
-    return jitify::experimental::Program(cuda_source, given_headers, given_options, file_callback);
-  });
-}
-
-named_prog<jitify::experimental::KernelInstantiation> cudfJitCache::getKernelInstantiation(
-  std::string const& kern_name,
-  named_prog<jitify::experimental::Program> const& named_program,
-  std::vector<std::string> const& arguments)
-{
-  // Lock for thread safety
-  std::lock_guard<std::mutex> lock(_kernel_cache_mutex);
-
-  std::string prog_name                  = std::get<0>(named_program);
-  jitify::experimental::Program& program = *std::get<1>(named_program);
-
-  // Make instance name e.g. "prog_binop.kernel_v_v_int_int_long int_Add"
-  std::string kern_inst_name = prog_name + '.' + kern_name;
-  for (auto&& arg : arguments) kern_inst_name += '_' + arg;
-
-  CUcontext c;
-  cuCtxGetCurrent(&c);
-
-  auto& kernel_inst_map = kernel_inst_context_map[c];
-
-  return getCached(kern_inst_name, kernel_inst_map, [&]() {
-    return program.kernel(kern_name).instantiate(arguments);
-  });
-}
-
-// Another overload for getKernelInstantiation which might be useful to get
-// kernel instantiations in one step
-// ------------------------------------------------------------------------
-/*
-jitify::experimental::KernelInstantiation cudfJitCache::getKernelInstantiation(
-  std::string const& kern_name,
-  std::string const& prog_name,
-  std::string const& cuda_source = "",
-  std::vector<std::string> const& given_headers = {},
-  std::vector<std::string> const& given_options = {},
-  file_callback_type file_callback = nullptr)
-{
-  auto program = getProgram(prog_name,
-                            cuda_source,
-                            given_headers,
-                            given_options,
-                            file_callback);
-  return getKernelInstantiation(kern_name, program);
-}
-*/
-
-cudfJitCache::cacheFile::cacheFile(std::string file_name) : _file_name{file_name} {}
-
-cudfJitCache::cacheFile::~cacheFile() {}
-
-std::string cudfJitCache::cacheFile::read()
+std::string get_program_cache_dir()
 {
-  // Open file (duh)
-  int fd = open(_file_name.c_str(), O_RDWR);
-  if (fd == -1) {
-    successful_read = false;
-    return std::string();
-  }
-
-  // Create args for file locking
-  flock fl{};
-  fl.l_type   = F_RDLCK;  // Shared lock for reading
-  fl.l_whence = SEEK_SET;
-
-  // Lock the file descriptor. Only reading is allowed now
-  if (fcntl(fd, F_SETLKW, &fl) == -1) {
-    successful_read = false;
-    return std::string();
-  }
-
-  // Get file descriptor from file pointer
-  FILE* fp = fdopen(fd, "rb");
-
-  // Get file length
-  fseek(fp, 0L, SEEK_END);
-  size_t file_size = ftell(fp);
-  rewind(fp);
-
-  // Allocate memory of file length size
-  std::string content;
-  content.resize(file_size);
-  char* buffer = &content[0];
-
-  // Copy file into buffer
-  if (fread(buffer, file_size, 1, fp) != 1) {
-    successful_read = false;
-    fclose(fp);
-    free(buffer);
-    return std::string();
-  }
-  fclose(fp);
-  successful_read = true;
-
-  return content;
+#if defined(JITIFY_USE_CACHE)
+  return get_cache_dir().string();
+#else
+  return {};
+#endif
 }
 
-void cudfJitCache::cacheFile::write(std::string content)
+jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog)
 {
-  // Open file and create if it doesn't exist, with access 0600
-  int fd = open(_file_name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
-  if (fd == -1) {
-    successful_write = false;
-    return;
-  }
+  static std::mutex caches_mutex{};
+  static std::unordered_map<std::string, std::unique_ptr<jitify2::ProgramCache<>>> caches{};
 
-  // Create args for file locking
-  flock fl{};
-  fl.l_type   = F_WRLCK;  // Exclusive lock for writing
-  fl.l_whence = SEEK_SET;
+  std::lock_guard<std::mutex> caches_lock(caches_mutex);
 
-  // Lock the file descriptor. We're the only ones now
-  if (fcntl(fd, F_SETLKW, &fl) == -1) {
-    successful_write = false;
-    return;
-  }
+  auto existing_cache = caches.find(preprog.name());
 
-  // Get file descriptor from file pointer
-  FILE* fp = fdopen(fd, "wb");
+  if (existing_cache == caches.end()) {
+    auto res = caches.insert(
+      {preprog.name(),
+       std::make_unique<jitify2::ProgramCache<>>(100, preprog, nullptr, get_program_cache_dir())});
 
-  // Copy string into file
-  if (fwrite(content.c_str(), content.length(), 1, fp) != 1) {
-    successful_write = false;
-    fclose(fp);
-    return;
+    existing_cache = res.first;
   }
-  fclose(fp);
-  successful_write = true;
-  return;
+  return *(existing_cache->second);
 }
 
 } // namespace jit
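
> Aside: the intended call pattern for the new cache appears later in this diff (rolling_detail.cuh, transform.cpp). As a hedged sketch, with `example_jit_kernel_cu_jit` standing in for one of the preprocessed-program objects emitted by the jitify_preprocess step (the symbol name is hypothetical, not part of this PR):

```cpp
// Hypothetical caller; mirrors the usage that appears later in this diff.
void launch_example(rmm::cuda_stream_view stream)
{
  std::string kernel_name =
    jitify2::reflection::Template("cudf::example::jit::kernel").instantiate("int");

  cudf::jit::get_program_cache(*example_jit_kernel_cu_jit)
    .get_kernel(kernel_name, {}, {/* optional runtime header overrides */})
    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
    ->launch(/* kernel arguments */);
}
```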
diff --git a/cpp/src/jit/cache.h b/cpp/src/jit/cache.h
deleted file mode 100644
index 071a951023b..00000000000
--- a/cpp/src/jit/cache.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef CUDF_JIT_CACHE_H_
-#define CUDF_JIT_CACHE_H_
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace cudf {
-namespace jit {
-template <typename T>
-using named_prog = std::pair<std::string, std::shared_ptr<T>>;
-
-/**
- * @brief Get the string path to the JITIFY kernel cache directory.
- *
- * This path can be overridden at runtime by defining an environment variable
- * named `LIBCUDF_KERNEL_CACHE_PATH`. The value of this variable must be a path
- * under which the process' user has read/write privileges.
- *
- * This function returns a path to the cache directory, creating it if it
- * doesn't exist.
- *
- * The default cache directory is `$HOME/.cudf/$CUDF_VERSION`. If no overrides
- * are used and if $HOME is not defined, returns an empty path and file
- * caching is not used.
- */
-boost::filesystem::path getCacheDir();
-
-class cudfJitCache {
- public:
-  /**
-   * @brief Get a process wide singleton cache object
-   *
-   */
-  static cudfJitCache& Instance()
-  {
-    // Meyers' singleton is thread safe in C++11
-    // Link: https://stackoverflow.com/a/1661564
-    static cudfJitCache cache;
-    return cache;
-  }
-
-  cudfJitCache();
-  ~cudfJitCache();
-
-  /**
-   * @brief Get the Kernel Instantiation object
-   *
-   * Searches an internal in-memory cache and file based cache for the kernel
-   * and if not found, JIT compiles and returns the kernel
-   *
-   * @param kern_name name of kernel to return
-   * @param program Jitify preprocessed program to get the kernel from
-   * @param arguments template arguments for kernel in vector of strings
-   * @return Pair of string kernel identifier and compiled kernel object
-   */
-  named_prog<jitify::experimental::KernelInstantiation> getKernelInstantiation(
-    std::string const& kern_name,
-    named_prog<jitify::experimental::Program> const& program,
-    std::vector<std::string> const& arguments);
-
-  /**
-   * @brief Get the Jitify preprocessed Program object
-   *
-   * Searches an internal in-memory cache and file based cache for the Jitify
-   * pre-processed program and if not found, JIT processes and returns it
-   *
-   * @param prog_file_name name of program to return
-   * @param cuda_source string source code of program to compile
-   * @param given_headers vector of strings representing source or names of each header included in
-   * cuda_source
-   * @param given_options vector of strings options to pass to NVRTC
-   * @param file_callback pointer to callback function to call whenever a header needs to be loaded
-   * @return named_prog<jitify::experimental::Program>
-   */
-  named_prog<jitify::experimental::Program> getProgram(
-    std::string const& prog_file_name,
-    std::string const& cuda_source                         = "",
-    std::vector<std::string> const& given_headers          = {},
-    std::vector<std::string> const& given_options          = {},
-    jitify::experimental::file_callback_type file_callback = nullptr);
-
- private:
-  template <typename T>
-  using umap_str_shptr = std::unordered_map<std::string, std::shared_ptr<T>>;
-
-  std::unordered_map<CUcontext, umap_str_shptr<jitify::experimental::KernelInstantiation>>
-    kernel_inst_context_map;
-  umap_str_shptr<jitify::experimental::Program> program_map;
-
-  /*
-    Even though this class can be used as a non-singleton, the file cache
-    access should remain limited to one thread per process. The lockf locks can
-    prevent multiple processes from accessing the file but are ineffective in
-    preventing multiple threads from doing so as the lock is shared by the
-    entire process.
-    Therefore the mutexes are static.
-  */
-  static std::mutex _kernel_cache_mutex;
-  static std::mutex _program_cache_mutex;
-
- private:
-  /**
-   * @brief Class to allow process wise exclusive access to cache files
-   *
-   */
-  class cacheFile {
-   private:
-    std::string _file_name;
-    bool successful_read  = false;
-    bool successful_write = false;
-
-   public:
-    cacheFile(std::string file_name);
-    ~cacheFile();
-
-    /**
-     * @brief Read this file and return the contents as a std::string
-     *
-     */
-    std::string read();
-
-    /**
-     * @brief Write the passed string to this file
-     *
-     */
-    void write(std::string);
-
-    /**
-     * @brief Check whether the read() operation on the file completed successfully
-     *
-     * @return true Read was successful. String returned by `read()` is valid
-     * @return false Read was unsuccessful. String returned by `read()` is empty
-     */
-    bool is_read_successful() { return successful_read; }
-
-    /**
-     * @brief Check whether the write() operation on the file completed successfully
-     *
-     * @return true Write was successful.
-     * @return false Write was unsuccessful. File state is undefined
-     */
-    bool is_write_successful() { return successful_write; }
-  };
-
- private:
-  template <typename T, typename FallbackFunc>
-  named_prog<T> getCached(std::string const& name, umap_str_shptr<T>& map, FallbackFunc func)
-  {
-    // Find memory cached T object
-    auto it = map.find(name);
-    if (it != map.end()) {
-      return std::make_pair(name, it->second);
-    } else {  // Find file cached T object
-      bool successful_read = false;
-      std::string serialized;
-#if defined(JITIFY_USE_CACHE)
-      boost::filesystem::path cache_dir = getCacheDir();
-      if (not cache_dir.empty()) {
-        boost::filesystem::path file_name = cache_dir / name;
-        cacheFile file{file_name.string()};
-        serialized      = file.read();
-        successful_read = file.is_read_successful();
-      }
-#endif
-      if (not successful_read) {
-        // JIT compile and write to file if possible
-        serialized = func().serialize();
-#if defined(JITIFY_USE_CACHE)
-        if (not cache_dir.empty()) {
-          boost::filesystem::path file_name = cache_dir / name;
-          cacheFile file{file_name.string()};
-          file.write(serialized);
-        }
-#endif
-      }
-      // Add deserialized T to cache and return
-      auto program = std::make_shared<T>(T::deserialize(serialized));
-      map[name]    = program;
-      return std::make_pair(name, program);
-    }
-  }
-};
-
-} // namespace jit
-} // namespace cudf
-
-#endif  // CUDF_JIT_CACHE_H_
diff --git a/cpp/src/transform/jit/code/code.h b/cpp/src/jit/cache.hpp
similarity index 71%
rename from cpp/src/transform/jit/code/code.h
rename to cpp/src/jit/cache.hpp
index cc3d6a8fe89..df8d4278f0f 100644
--- a/cpp/src/transform/jit/code/code.h
+++ b/cpp/src/jit/cache.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,16 +16,13 @@
 
 #pragma once
 
+#include
+#include
+
 namespace cudf {
-namespace transformation {
 namespace jit {
-namespace code {
-extern const char* kernel_header;
-extern const char* kernel;
-extern const char* traits;
-extern const char* operation;
-}  // namespace code
+
+jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog);
+
 } // namespace jit
-} // namespace transformation
 } // namespace cudf
diff --git a/cpp/src/jit/common_headers.hpp b/cpp/src/jit/common_headers.hpp
deleted file mode 100644
index 0f57790afe0..00000000000
--- a/cpp/src/jit/common_headers.hpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-namespace cudf {
-namespace jit {
-
-const std::vector<std::string> compiler_flags
-{
-  "-std=c++14",
-  // Have jitify prune unused global variables
-  "-remove-unused-globals",
-  // suppress all NVRTC warnings
-  "-w",
-  // force libcudacxx to not include system headers
-  "-D__CUDACC_RTC__",
-#if defined(__powerpc64__)
-  "-D__powerpc64__"
-#elif defined(__x86_64__)
-  "-D__x86_64__"
-#endif
-};
-
-const std::unordered_map<std::string, char const*> stringified_headers{
-  {"cuda/std/chrono", cuda_std_chrono},
-  {"cuda/std/climits", cuda_std_climits},
-  {"cuda/std/cstddef", cuda_std_cstddef},
-  {"cuda/std/cstdint", cuda_std_cstdint},
-  {"cuda/std/ctime", cuda_std_ctime},
-  {"cuda/std/limits", cuda_std_limits},
-  {"cuda/std/ratio", cuda_std_ratio},
-  {"cuda/std/type_traits", cuda_std_type_traits},
-  {"cuda/std/type_traits", cuda_std_type_traits},
-  {"cuda/std/version", cuda_std_version},
-  {"cuda/std/detail/__config", cuda_std_detail___config},
-  {"cuda/std/detail/__pragma_pop", cuda_std_detail___pragma_pop},
-  {"cuda/std/detail/__pragma_push", cuda_std_detail___pragma_push},
-  {"cuda/std/detail/libcxx/include/__config", cuda_std_detail_libcxx_include___config},
-  {"cuda/std/detail/libcxx/include/__pragma_pop", cuda_std_detail_libcxx_include___pragma_pop},
-  {"cuda/std/detail/libcxx/include/__pragma_push", cuda_std_detail_libcxx_include___pragma_push},
-  {"cuda/std/detail/libcxx/include/__undef_macros", cuda_std_detail_libcxx_include___undef_macros},
-  {"cuda/std/detail/libcxx/include/chrono", cuda_std_detail_libcxx_include_chrono},
-  {"cuda/std/detail/libcxx/include/climits", cuda_std_detail_libcxx_include_climits},
-  {"cuda/std/detail/libcxx/include/cstddef", cuda_std_detail_libcxx_include_cstddef},
-  {"cuda/std/detail/libcxx/include/cstdint", cuda_std_detail_libcxx_include_cstdint},
-  {"cuda/std/detail/libcxx/include/ctime", cuda_std_detail_libcxx_include_ctime},
-  {"cuda/std/detail/libcxx/include/limits", cuda_std_detail_libcxx_include_limits},
-  {"cuda/std/detail/libcxx/include/ratio", cuda_std_detail_libcxx_include_ratio},
-  {"cuda/std/detail/libcxx/include/type_traits", cuda_std_detail_libcxx_include_type_traits},
-  {"cuda/std/detail/libcxx/include/version", cuda_std_detail_libcxx_include_version},
-};
-
-inline std::istream* send_stringified_header(std::iostream& stream, char const* header)
-{
-  // skip the filename line added by stringify
-  stream << (std::strchr(header, '\n') + 1);
-  return &stream;
-}
-
-} // namespace jit
-} // namespace cudf
diff --git a/cpp/src/jit/launcher.cpp b/cpp/src/jit/launcher.cpp
deleted file mode 100644
index 2ddcac7d5ba..00000000000
--- a/cpp/src/jit/launcher.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include
-#include
-#include
-#include
-
-#include
-
-namespace cudf {
-namespace jit {
-
-launcher::launcher(const std::string& hash,
-                   const std::string& cuda_source,
-                   const std::vector<std::string>& header_names,
-                   const std::vector<std::string>& compiler_flags,
-                   jitify::experimental::file_callback_type file_callback,
-                   rmm::cuda_stream_view stream)
-  : cache_instance{cudf::jit::cudfJitCache::Instance()}, stream(stream)
-{
-  program = cache_instance.getProgram(
-    hash, cuda_source.c_str(), header_names, compiler_flags, file_callback);
-}
-
-launcher::launcher(launcher&& launcher)
-  : cache_instance{cudf::jit::cudfJitCache::Instance()},
-    program{std::move(launcher.program)},
-    kernel_inst{std::move(launcher.kernel_inst)},
-    stream{launcher.stream}
-{
-}
-
-} // namespace jit
-} // namespace cudf
diff --git a/cpp/src/jit/launcher.h b/cpp/src/jit/launcher.h
deleted file mode 100644
index 8bcd92149a8..00000000000
--- a/cpp/src/jit/launcher.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-
-#include
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-namespace cudf {
-namespace jit {
-/**
- * @brief Class used to handle compilation and execution of JIT kernels
- */
-class launcher {
- public:
-  launcher() = delete;
-
-  /**
-   * @brief Constructor of the launcher class
-   *
-   * Method to generate vector containing all template types for a JIT kernel.
-   * This vector is used to get the compiled kernel for one set of types and set
-   * it as the kernel to launch using this launcher.
-   *
-   * @param hash The hash to be used as the key for caching
-   * @param cuda_code The CUDA code that contains the kernel to be launched
-   * @param header_names Strings of header_names or strings that contain content
-   * of the header files
-   * @param compiler_flags Strings of compiler flags
-   * @param file_callback a function that returns header file contents given header
-   * file names.
-   * @param stream The non-owned stream to use for execution
-   */
-  launcher(const std::string& hash,
-           const std::string& cuda_source,
-           const std::vector<std::string>& header_names,
-           const std::vector<std::string>& compiler_flags,
-           jitify::experimental::file_callback_type file_callback,
-           rmm::cuda_stream_view stream = rmm::cuda_stream_default);
-  launcher(launcher&&);
-  launcher(const launcher&) = delete;
-  launcher& operator=(launcher&&) = delete;
-  launcher& operator=(const launcher&) = delete;
-
-  /**
-   * @brief Sets the kernel to launch using this launcher
-   *
-   * Method to generate vector containing all template types for a JIT kernel.
-   * This vector is used to get the compiled kernel for one set of types and set
-   * it as the kernel to launch using this launcher.
-   *
-   * @param kernel_name The kernel to be launched
-   * @param arguments The template arguments to be used to instantiate the kernel
-   * @return launcher& ref to this launcher object
-   */
-  launcher& set_kernel_inst(const std::string& kernel_name,
-                            const std::vector<std::string>& arguments)
-  {
-    kernel_inst = cache_instance.getKernelInstantiation(kernel_name, program, arguments);
-    return *this;
-  }
-
-  /**
-   * @brief Handle the Jitify API to launch using information
-   * contained in the members of `this`
-   *
-   * @tparam All parameters to launch the kernel
-   */
-  template <typename... Args>
-  void launch(Args... args)
-  {
-    get_kernel().configure_1d_max_occupancy(0, 0, 0, stream.value()).safe_launch(args...);
-  }
-
- private:
-  cudf::jit::cudfJitCache& cache_instance;
-  cudf::jit::named_prog<jitify::experimental::Program> program;
-  cudf::jit::named_prog<jitify::experimental::KernelInstantiation> kernel_inst;
-  rmm::cuda_stream_view stream;
-
-  jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); }
-};
-
-} // namespace jit
-} // namespace cudf
diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp
index 01fd3aea33a..8929d58be08 100644
--- a/cpp/src/jit/parser.cpp
+++ b/cpp/src/jit/parser.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,15 +14,17 @@
  * limitations under the License.
  */
 
+#include "parser.hpp"
+
+#include
+
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 
-#include "parser.h"
-
 namespace cudf {
 namespace jit {
 constexpr char percent_escape[] = "_";
diff --git a/cpp/src/jit/parser.h b/cpp/src/jit/parser.hpp
similarity index 100%
rename from cpp/src/jit/parser.h
rename to cpp/src/jit/parser.hpp
diff --git a/cpp/src/jit/type.cpp b/cpp/src/jit/type.cpp
index e833a6fa10f..16894168b31 100644
--- a/cpp/src/jit/type.cpp
+++ b/cpp/src/jit/type.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+
 #include
 
 namespace cudf {
diff --git a/cpp/src/jit/type.h b/cpp/src/jit/type.hpp
similarity index 100%
rename from cpp/src/jit/type.h
rename to cpp/src/jit/type.hpp
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 34d6d5fa194..ca4913c1843 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -14,9 +14,11 @@
  * limitations under the License.
  */
 
+#include "rolling_detail.cuh"
+#include "rolling_jit_detail.hpp"
+
 #include
 #include
-#include "rolling_detail.cuh"
 
 namespace cudf {
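
> Aside: the parser utilities renamed above (parser.h → parser.hpp) are what turn a user-supplied UDF into the source that later overrides the operation-udf.hpp placeholders. A hedged sketch of the transformation `parse_single_function_cuda` performs (illustrative strings; exact formatting of the parser's output may differ):

```cpp
// The parser extracts one __device__ function from the user's source and
// renames it, so the JIT kernel can call it under a fixed name.
std::string user_udf =
  "__device__ inline void my_udf(double* out, double in) { *out = in * in; }";

// Conceptually, parse_single_function_cuda(user_udf, "GENERIC_UNARY_OP")
// yields something equivalent to:
std::string renamed =
  "__device__ inline void GENERIC_UNARY_OP(double* out, double in) { *out = in * in; }";
```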
diff --git a/cpp/src/rolling/jit/code/operation.cpp b/cpp/src/rolling/jit/code/operation.cpp
deleted file mode 100644
index 1fdc4080634..00000000000
--- a/cpp/src/rolling/jit/code/operation.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cudf {
-namespace rolling {
-namespace jit {
-namespace code {
-const char* operation_h =
-  R"***(operation.h
-#pragma once
-  struct rolling_udf_ptx {
-    template <typename OutType, typename InType>
-    static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count) {
-      OutType ret;
-      rolling_udf(
-        &ret, 0, 0, 0, 0, &in_col[start], count, sizeof(InType));
-      return ret;
-    }
-  };
-
-  struct rolling_udf_cuda {
-    template <typename OutType, typename InType>
-    static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count) {
-      OutType ret;
-      rolling_udf(
-        &ret, in_col, start, count);
-      return ret;
-    }
-  };
-
-)***";
-
-} // namespace code
-} // namespace jit
-} // namespace rolling
-} // namespace cudf
diff --git a/cpp/src/rolling/jit/code/kernel.cpp b/cpp/src/rolling/jit/kernel.cu
similarity index 61%
rename from cpp/src/rolling/jit/code/kernel.cpp
rename to cpp/src/rolling/jit/kernel.cu
index 2c612162f79..52e397b9351 100644
--- a/cpp/src/rolling/jit/code/kernel.cpp
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,47 +14,50 @@
  * limitations under the License.
  */
 
-namespace cudf {
-namespace rolling {
-namespace jit {
-namespace code {
-const char* kernel_headers =
-  R"***(
-#include <../src/rolling/rolling_jit_detail.hpp>
+#include
+#include
+
 #include
 #include
-)***";
 
-const char* kernel =
-  R"***(
-#include "operation.h"
+namespace cudf {
+namespace rolling {
+namespace jit {
 
 template <typename WindowType>
-cudf::size_type __device__ get_window(WindowType window, cudf::size_type index) { return window[index]; }
+cudf::size_type __device__ get_window(WindowType window, cudf::size_type index)
+{
+  return window[index];
+}
 
 template <>
-cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index) { return window; }
-
-template <typename InType, typename OutType, class agg_op, typename PrecedingWindowType, typename FollowingWindowType>
-__global__
-void gpu_rolling_new(cudf::size_type nrows,
-                     InType const* const __restrict__ in_col,
-                     cudf::bitmask_type const* const __restrict__ in_col_valid,
-                     OutType* __restrict__ out_col,
-                     cudf::bitmask_type* __restrict__ out_col_valid,
-                     cudf::size_type * __restrict__ output_valid_count,
-                     PrecedingWindowType preceding_window_begin,
-                     FollowingWindowType following_window_begin,
-                     cudf::size_type min_periods)
+cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index)
 {
-  cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  return window;
+}
+
+template <typename InType,
+          typename OutType,
+          class agg_op,
+          typename PrecedingWindowType,
+          typename FollowingWindowType>
+__global__ void gpu_rolling_new(cudf::size_type nrows,
+                                InType const* const __restrict__ in_col,
+                                cudf::bitmask_type const* const __restrict__ in_col_valid,
+                                OutType* __restrict__ out_col,
+                                cudf::bitmask_type* __restrict__ out_col_valid,
+                                cudf::size_type* __restrict__ output_valid_count,
+                                PrecedingWindowType preceding_window_begin,
+                                FollowingWindowType following_window_begin,
+                                cudf::size_type min_periods)
+{
+  cudf::size_type i      = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::size_type stride = blockDim.x * gridDim.x;
   cudf::size_type warp_valid_count{0};
 
   auto active_threads = __ballot_sync(0xffffffff, i < nrows);
-  while(i < nrows)
-  {
+  while (i < nrows) {
     // declare this as volatile to avoid some compiler optimizations that lead to incorrect results
     // for CUDA 10.0 and below (fixed in CUDA 10.1)
     volatile cudf::size_type count = 0;
@@ -63,16 +66,16 @@ void gpu_rolling_new(cudf::size_type nrows,
     cudf::size_type following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start = min(nrows, max(0, i - preceding_window + 1));
-    cudf::size_type end = min(nrows, max(0, i + following_window + 1));
+    cudf::size_type start       = min(nrows, max(0, i - preceding_window + 1));
+    cudf::size_type end         = min(nrows, max(0, i + following_window + 1));
     cudf::size_type start_index = min(start, end);
-    cudf::size_type end_index = max(start, end);
+    cudf::size_type end_index   = max(start, end);
 
     // aggregate
     // TODO: We should explore using shared memory to avoid redundant loads.
     //       This might require separating the kernel into a special version
     //       for dynamic and static sizes.
-    count = end_index - start_index;
+    count       = end_index - start_index;
     OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
 
     // check if we have enough input samples
@@ -82,9 +85,7 @@ void gpu_rolling_new(cudf::size_type nrows,
     const unsigned int result_mask = __ballot_sync(active_threads, output_is_valid);
 
     // store the output value, one per thread
-    if (output_is_valid) {
-      out_col[i] = val;
-    }
+    if (output_is_valid) { out_col[i] = val; }
 
     // only one thread writes the mask
     if (0 == cudf::intra_word_index(i)) {
@@ -92,20 +93,16 @@ void gpu_rolling_new(cudf::size_type nrows,
       warp_valid_count += __popc(result_mask);
     }
 
-    // process next element
+    // process next element
     i += stride;
     active_threads = __ballot_sync(active_threads, i < nrows);
   }
 
   // TODO: likely faster to do a single_lane_block_reduce and a single
   // atomic per block but that requires jitifying single_lane_block_reduce...
-  if(0 == cudf::intra_word_index(threadIdx.x)) {
-    atomicAdd(output_valid_count, warp_valid_count);
-  }
+  if (0 == cudf::intra_word_index(threadIdx.x)) { atomicAdd(output_valid_count, warp_valid_count); }
 }
-)***";
 
-} // namespace code
 } // namespace jit
 } // namespace rolling
 } // namespace cudf
diff --git a/cpp/src/rolling/jit/code/code.h b/cpp/src/rolling/jit/operation-udf.hpp
similarity index 51%
rename from cpp/src/rolling/jit/code/code.h
rename to cpp/src/rolling/jit/operation-udf.hpp
index c5577d326c7..eaab2111d98 100644
--- a/cpp/src/rolling/jit/code/code.h
+++ b/cpp/src/rolling/jit/operation-udf.hpp
@@ -1,8 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Copyright 2018-2019 BlazingDB, Inc.
- * Copyright 2018 Christian Noboa Mardini
+ * Copyright (c) 2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,21 +16,5 @@
 
 #pragma once
 
-namespace cudf {
-namespace rolling {
-namespace jit {
-namespace code {
-extern const char* kernel_headers;
-extern const char* kernel;
-extern const char* operation_h;
-
-extern const char* kernel_headers;
-extern const char* kernel;
-extern const char* operation_h;
-
-extern const char* grouped_window_wrapper;
-
-} // namespace code
-} // namespace jit
-} // namespace rolling
-} // namespace cudf
+// This file serves as a placeholder for user defined functions, so jitify can choose to override it
+// at runtime.
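
> Aside: what replaces this placeholder at runtime is the parsed UDF source; the rolling launch site later in this diff passes it to `get_kernel` as the contents of `rolling/jit/operation-udf.hpp`. A hedged sketch of what the injected header might contain for a CUDA UDF computing a window mean (names and exact signature illustrative; compare `rolling_udf_cuda::operate` in operation.hpp):

```cpp
// Hypothetical runtime contents of rolling/jit/operation-udf.hpp, matching
// the call shape rolling_udf(&ret, in_col, start, count).
__device__ inline void rolling_udf(double* result,
                                   double const* in_col,
                                   int start,
                                   int count)
{
  double sum = 0.0;
  for (int i = 0; i < count; ++i) { sum += in_col[start + i]; }
  *result = sum / count;
}
```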
diff --git a/cpp/src/rolling/jit/operation.hpp b/cpp/src/rolling/jit/operation.hpp
new file mode 100644
index 00000000000..9af8c2ac3fb
--- /dev/null
+++ b/cpp/src/rolling/jit/operation.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#pragma once
+
+struct rolling_udf_ptx {
+  template <typename OutType, typename InType>
+  static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count)
+  {
+    OutType ret;
+    rolling_udf(&ret, 0, 0, 0, 0, &in_col[start], count, sizeof(InType));
+    return ret;
+  }
+};
+
+struct rolling_udf_cuda {
+  template <typename OutType, typename InType>
+  static OutType operate(const InType* in_col, cudf::size_type start, cudf::size_type count)
+  {
+    OutType ret;
+    rolling_udf(&ret, in_col, start, count);
+    return ret;
+  }
+};
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 42562507fa9..bb431fad537 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -16,9 +16,7 @@
 
 #pragma once
 
-#include
 #include
-#include
 
 #include
 #include
@@ -44,12 +42,11 @@
 #include
 #include
 
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+
+#include
 
 #include
 #include
@@ -1270,19 +1267,15 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
   std::string cuda_source;
   switch (udf_agg->kind) {
     case aggregation::Kind::PTX:
-      cuda_source = cudf::rolling::jit::code::kernel_headers;
       cuda_source +=
         cudf::jit::parse_single_function_ptx(udf_agg->_source,
                                              udf_agg->_function_name,
                                              cudf::jit::get_type_name(udf_agg->_output_type),
                                              {0, 5});  // args 0 and 5 are pointers.
-      cuda_source += cudf::rolling::jit::code::kernel;
       break;
     case aggregation::Kind::CUDA:
-      cuda_source = cudf::rolling::jit::code::kernel_headers;
       cuda_source +=
         cudf::jit::parse_single_function_cuda(udf_agg->_source, udf_agg->_function_name);
-      cuda_source += cudf::rolling::jit::code::kernel;
       break;
     default: CUDF_FAIL("Unsupported UDF type.");
  }
@@ -1293,37 +1286,26 @@
   auto output_view = output->mutable_view();
   rmm::device_scalar<cudf::size_type> device_valid_count{0, stream};
 
-  const std::vector<std::string> compiler_flags{"-std=c++14",
-                                                // Have jitify prune unused global variables
-                                                "-remove-unused-globals",
-                                                // suppress all NVRTC warnings
-                                                "-w"};
-
-  // Launch the jitify kernel
-  cudf::jit::launcher(hash,
-                      cuda_source,
-                      {cudf_types_hpp,
-                       cudf_utilities_bit_hpp,
-                       cudf::rolling::jit::code::operation_h,
-                       ___src_rolling_rolling_jit_detail_hpp},
-                      compiler_flags,
-                      nullptr,
-                      stream)
-    .set_kernel_inst("gpu_rolling_new",                        // name of the kernel we are launching
-                     {cudf::jit::get_type_name(input.type()),  // list of template arguments
-                      cudf::jit::get_type_name(output->type()),
-                      udf_agg->_operator_name,
-                      preceding_window_str.c_str(),
-                      following_window_str.c_str()})
-    .launch(input.size(),
-            cudf::jit::get_data_ptr(input),
-            input.null_mask(),
-            cudf::jit::get_data_ptr(output_view),
-            output_view.null_mask(),
-            device_valid_count.data(),
-            preceding_window,
-            following_window,
-            min_periods);
+  std::string kernel_name =
+    jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new")  //
+      .instantiate(cudf::jit::get_type_name(input.type()),  // list of template arguments
+                   cudf::jit::get_type_name(output->type()),
+                   udf_agg->_operator_name,
+                   preceding_window_str.c_str(),
+                   following_window_str.c_str());
+
+  cudf::jit::get_program_cache(*rolling_jit_kernel_cu_jit)
+    .get_kernel(kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                           //
+    ->launch(input.size(),
+             cudf::jit::get_data_ptr(input),
+             input.null_mask(),
+             cudf::jit::get_data_ptr(output_view),
+             output_view.null_mask(),
+             device_valid_count.data(),
+             preceding_window,
+             following_window,
+             min_periods);
 
   output->set_null_count(output->size() - device_valid_count.value(stream));
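
> Aside: the `jitify2::reflection::Template(...).instantiate(...)` call above only builds the instantiated kernel name as a string, which then serves as the cache lookup key. Roughly (illustrative arguments; exact spacing is jitify2's choice):

```cpp
// Sketch of what the reflection call produces.
std::string name =
  jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new")
    .instantiate("int", "int", "rolling_udf_cuda", "cudf::size_type", "cudf::size_type");
// name is now approximately:
// "cudf::rolling::jit::gpu_rolling_new<int,int,rolling_udf_cuda,cudf::size_type,cudf::size_type>"
```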
diff --git a/cpp/src/transform/jit/code/kernel.cpp b/cpp/src/transform/jit/code/kernel.cpp
deleted file mode 100644
index 58fdb945de3..00000000000
--- a/cpp/src/transform/jit/code/kernel.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cudf {
-namespace transformation {
-namespace jit {
-namespace code {
-const char* kernel_header =
-  R"***(
-    #pragma once
-
-    // Include Jitify's cstddef header first
-    #include
-
-    #include
-    #include
-    #include
-
-    #include
-    #include
-  )***";
-
-const char* kernel =
-  R"***(
-    template <typename TypeOut, typename TypeIn>
-    __global__
-    void kernel(cudf::size_type size,
-                TypeOut* out_data, TypeIn* in_data) {
-      int tid = threadIdx.x;
-      int blkid = blockIdx.x;
-      int blksz = blockDim.x;
-      int gridsz = gridDim.x;
-
-      int start = tid + blkid * blksz;
-      int step = blksz * gridsz;
-
-      for (cudf::size_type i=start; i<size; i+=step) {
-        GENERIC_UNARY_OP(&out_data[i], in_data[i]);
-      }
-    }
-  )***";
-
-} // namespace code
-} // namespace jit
-} // namespace transformation
-} // namespace cudf
diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu
new file mode 100644
--- /dev/null
+++ b/cpp/src/transform/jit/kernel.cu
+/*
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace transformation {
+namespace jit {
+
+template <typename TypeOut, typename TypeIn>
+__global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+  for (cudf::size_type i = start; i < size; i += step) {
+    GENERIC_UNARY_OP(&out_data[i], in_data[i]);
+  }
+}
+
+} // namespace jit
+} // namespace transformation
+} // namespace cudf
diff --git a/cpp/src/transform/jit/operation-udf.hpp b/cpp/src/transform/jit/operation-udf.hpp
new file mode 100644
index 00000000000..eaab2111d98
--- /dev/null
+++ b/cpp/src/transform/jit/operation-udf.hpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// This file serves as a placeholder for user defined functions, so jitify can choose to override it
+// at runtime.
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 6da0f78687b..8f176d035d2 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,12 +14,11 @@
  * limitations under the License.
  */
 
-#include "jit/code/code.h"
+#include
 
-#include
-#include
-#include
-#include
+#include
+#include
+#include
 
 #include
 #include
@@ -29,27 +28,12 @@
 #include
 #include
 
-#include
-#include
-
 #include
 
 namespace cudf {
 namespace transformation {
-//! Jit functions
 namespace jit {
-const std::vector<std::string> header_names{cudf_types_hpp, cudf_wrappers_timestamps_hpp};
-
-std::istream* headers_code(std::string filename, std::iostream& stream)
-{
-  auto it = cudf::jit::stringified_headers.find(filename);
-  if (it != cudf::jit::stringified_headers.end()) {
-    return cudf::jit::send_stringified_header(stream, it->second);
-  }
-  return nullptr;
-}
-
 void unary_operation(mutable_column_view output,
                      column_view input,
                      const std::string& udf,
@@ -57,28 +41,25 @@ void unary_operation(mutable_column_view output,
                      bool is_ptx,
                      rmm::cuda_stream_view stream)
 {
-  std::string hash = "prog_transform" + std::to_string(std::hash<std::string>{}(udf));
-
-  std::string cuda_source = code::kernel_header;
-  if (is_ptx) {
-    cuda_source += cudf::jit::parse_single_function_ptx(
-                     udf, "GENERIC_UNARY_OP", cudf::jit::get_type_name(output_type), {0}) +
-                   code::kernel;
-  } else {
-    cuda_source += cudf::jit::parse_single_function_cuda(udf, "GENERIC_UNARY_OP") + code::kernel;
-  }
-
-  // Launch the jitify kernel
-  cudf::jit::launcher(hash,
-                      cuda_source,
-                      header_names,
-                      cudf::jit::compiler_flags,
-                      headers_code,
-                      stream)
-    .set_kernel_inst("kernel",                                 // name of the kernel we are launching
-                     {cudf::jit::get_type_name(output.type()), // list of template arguments
-                      cudf::jit::get_type_name(input.type())})
-    .launch(output.size(), cudf::jit::get_data_ptr(output), cudf::jit::get_data_ptr(input));
+  std::string kernel_name =
+    jitify2::reflection::Template("cudf::transformation::jit::kernel")  //
+      .instantiate(cudf::jit::get_type_name(output.type()),  // list of template arguments
+                   cudf::jit::get_type_name(input.type()));
+
+  std::string cuda_source =
+    is_ptx ? cudf::jit::parse_single_function_ptx(udf,  //
+                                                  "GENERIC_UNARY_OP",
+                                                  cudf::jit::get_type_name(output_type),
+                                                  {0})
+           : cudf::jit::parse_single_function_cuda(udf,  //
+                                                   "GENERIC_UNARY_OP");
+
+  cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit)
+    .get_kernel(kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}})  //
+    ->configure_1d_max_occupancy(0, 0, 0, stream.value())  //
+    ->launch(output.size(),  //
+             cudf::jit::get_data_ptr(output),
+             cudf::jit::get_data_ptr(input));
 }
 
 } // namespace jit
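
> Aside: a hedged end-to-end sketch of how this path is reached through the public API (assuming `cudf::transform`'s column_view / UDF-string / output-type / is_ptx signature, which forwards to `jit::unary_operation`):

```cpp
// Hedged usage sketch; the UDF below is renamed to GENERIC_UNARY_OP by
// parse_single_function_cuda and injected as transform/jit/operation-udf.hpp.
#include <cudf/column/column.hpp>
#include <cudf/transform.hpp>

std::unique_ptr<cudf::column> triple(cudf::column_view const& input)
{
  std::string udf =
    "__device__ inline void triple(int* out, int in) { *out = 3 * in; }";
  return cudf::transform(input, udf, cudf::data_type{cudf::type_id::INT32}, /*is_ptx=*/false);
}
```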
io/comp/decomp_test.cu) @@ -277,7 +268,8 @@ ConfigureTest(ROLLING_TEST rolling/rolling_test.cpp rolling/grouped_rolling_test.cpp rolling/lead_lag_test.cpp - rolling/collect_list_test.cpp) + rolling/collect_list_test.cpp + ) ################################################################################################### # - filling test ---------------------------------------------------------------------------------- diff --git a/cpp/tests/jit/jit-cache-multiprocess-test.cpp b/cpp/tests/jit/jit-cache-multiprocess-test.cpp deleted file mode 100644 index 2f0b353673e..00000000000 --- a/cpp/tests/jit/jit-cache-multiprocess-test.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include "jit-cache-test.hpp" -#include "rmm/mr/device/per_device_resource.hpp" - -#if defined(JITIFY_USE_CACHE) - -/** - * @brief This test runs two processes that try to access the same kernel - * - * This is a stress test. - * - * A single test process is forked before invocation of CUDA and then both the - * parent and child processes try to get and run a kernel. The child process - * clears the cache before each iteration of the test so that the cache has to - * be re-written by it. The parent process runs on a changing time offset so - * that it sometimes gets the kernel from cache and sometimes it doesn't. - * - * The aim of this test is to check that the file cache doesn't get corrupted - * when multiple processes are reading/writing to it at the same time. Since - * the public API of JitCache doesn't return the serialized string of the - * cached kernel, the way to test its validity is to run it on test data. - */ -TEST_F(JitCacheMultiProcessTest, MultiProcessTest) -{ - int num_tests = 20; - // Cannot initialize scalars before forking - rmm::device_scalar *input; - rmm::device_scalar *output; - int expect = 64; - - auto tester = [&](int pid, int test_no) { - // Brand new cache object that has nothing in in-memory cache - cudf::jit::cudfJitCache cache; - - auto const in{4}; - auto const out{1}; - input->set_value(in); - output->set_value(out); - - // make program - auto program = cache.getProgram("FileCacheTestProg3", program3_source); - // make kernel - auto kernel = cache.getKernelInstantiation("my_kernel", program, {"3", "int"}); - (*std::get<1>(kernel)).configure(grid, block).launch(input->data(), output->data()); - CUDA_TRY(cudaDeviceSynchronize()); - - ASSERT_TRUE(expect == output->value()) << "Expected val: " << expect << '\n' - << " Actual val: " << output->value(); - }; - - // This pipe is how the child process will send output to parent - int pipefd[2]; - ASSERT_NE(pipe(pipefd), -1) << "Unable to create pipe"; - - pid_t cpid = fork(); - ASSERT_TRUE(cpid >= 0) << "Fork failed"; - - if (cpid > 0) { // Parent - close(pipefd[1]); // Close write end of pipe. Parent doesn't write. - usleep(100000); - } else { // Child - close(pipefd[0]); // Close read end of pipe. Child doesn't read. 
-    dup2(pipefd[1], STDOUT_FILENO);  // redirect stdout to pipe
-  }
-
-  input  = new rmm::device_scalar<int>();
-  output = new rmm::device_scalar<int>();
-
-  for (int i = 0; i < num_tests; i++) {
-    if (cpid > 0)
-      usleep(10000);
-    else
-      purgeFileCache();
-
-    tester(cpid, i);
-  }
-
-  // Child ends here --------------------------------------------------------
-
-  if (cpid > 0) {
-    int status;
-    wait(&status);
-
-    std::cout << "Child output begin:" << std::endl;
-    char buf;
-    while (read(pipefd[0], &buf, 1) > 0) ASSERT_EQ(write(STDOUT_FILENO, &buf, 1), 1);
-    ASSERT_EQ(write(STDOUT_FILENO, "\n", 1), 1);
-    std::cout << "Child output end" << std::endl;
-
-    ASSERT_TRUE(WIFEXITED(status)) << "Child did not exit normally.";
-    ASSERT_EQ(WEXITSTATUS(status), 0) << "Error in child.";
-  }
-}
-#endif
-
-int main(int argc, char** argv)
-{
-  ::testing::InitGoogleTest(&argc, argv);
-
-  // This test relies on the fact that the CUDA context will be created in
-  // each process separately after the fork. With the default CUDF_TEST_MAIN,
-  // using rmm_mode=pool would cause the CUDA context to be created at startup,
-  // before the fork. So we hardcode the rmm_mode to "cuda" for this test
-  // and explicitly set the device 0 resource to it. Note that using
-  // `set_current_device_resource` would result in a call to `cudaGetDevice()`,
-  // which would also initialize the CUDA context before the fork.
-  auto const rmm_mode = "cuda";
-  auto resource       = cudf::test::create_memory_resource(rmm_mode);
-  rmm::mr::set_per_device_resource(rmm::cuda_device_id{0}, resource.get());
-  return RUN_ALL_TESTS();
-}
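Editorial aside: the deleted test leaned on a standard POSIX fork/pipe harness that is easy to miss inside the GoogleTest scaffolding. A minimal standalone sketch of just that harness follows; no CUDA, cudf, or GoogleTest is involved, and the output string is a placeholder.

```cpp
// Minimal fork/pipe harness, in the style of the deleted multiprocess test:
// the child's stdout is redirected into a pipe, and the parent reaps the
// child and then replays the child's output on its own stdout.
#include <cstdio>
#include <sys/wait.h>
#include <unistd.h>

int main()
{
  int pipefd[2];
  if (pipe(pipefd) == -1) return 1;

  pid_t const cpid = fork();
  if (cpid < 0) return 1;

  if (cpid == 0) {                   // Child: writes only.
    close(pipefd[0]);
    dup2(pipefd[1], STDOUT_FILENO);  // Redirect stdout into the pipe.
    std::printf("work done in child\n");
    return 0;                        // Exit status is checked by the parent.
  }

  close(pipefd[1]);                  // Parent: reads only.
  int status = 0;
  wait(&status);                     // Reap the child before draining the pipe.

  char buf;
  while (read(pipefd[0], &buf, 1) > 0) std::fwrite(&buf, 1, 1, stdout);

  return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : 1;
}
```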
diff --git a/cpp/tests/jit/jit-cache-test.cpp b/cpp/tests/jit/jit-cache-test.cpp
deleted file mode 100644
index 43cd5911ae7..00000000000
--- a/cpp/tests/jit/jit-cache-test.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "jit-cache-test.hpp"
-
-namespace cudf {
-namespace test {
-TEST_F(JitCacheTest, CacheExceptionTest)
-{
-  EXPECT_NO_THROW(auto program = getProgram("MemoryCacheTestProg"));
-  EXPECT_ANY_THROW(auto program1 = getProgram("MemoryCacheTestProg1"));
-}
-
-// Test the in-memory caching ability
-TEST_F(JitCacheTest, MemoryCacheKernelTest)
-{
-  // Check the kernel caching
-
-  // Single value column
-  // TODO (dm): should be a scalar tho
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{125, 0}};
-
-  // make new program and rename it to match the old program
-  auto program = getProgram("MemoryCacheTestProg1", program2_source);
-  // TODO: when I convert this pair to a class, make an inherited test class that can edit names
-  std::get<0>(program) = "MemoryCacheTestProg";
-
-  // remove any file cache so the kernel below cannot be obtained from file
-  purgeFileCache();
-
-  // make kernel; if the cache tried to compile, it would use a different
-  // program than intended and give a wrong result.
-  auto kernel = getKernelInstantiation("my_kernel", program, {"3", "int"});
-
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-
-TEST_F(JitCacheTest, MemoryCacheProgramTest)
-{
-  // Check program source caching
-
-  // Single value column
-  // TODO (dm): should be a scalar tho
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{625, 0}};
-
-  // remove any file cache so the program below cannot be obtained from file
-  purgeFileCache();
-
-  auto program = getProgram("MemoryCacheTestProg");
-  // make kernel that HAS to be compiled
-  auto kernel = getKernelInstantiation("my_kernel", program, {"4", "int"});
-
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-
-// Test the file caching ability
-#if defined(JITIFY_USE_CACHE)
-TEST_F(JitCacheTest, FileCacheProgramTest)
-{
-  // Brand new cache object that has nothing in the in-memory cache
-  cudf::jit::cudfJitCache cache;
-
-  // Single value column
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{625, 0}};
-
-  // make program
-  auto program = cache.getProgram("FileCacheTestProg", program_source);
-  // make kernel that HAS to be compiled
-  auto kernel = cache.getKernelInstantiation("my_kernel", program, {"4", "int"});
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-
-TEST_F(JitCacheTest, FileCacheKernelTest)
-{
-  // Brand new cache object that has nothing in the in-memory cache
-  cudf::jit::cudfJitCache cache;
-
-  // Single value column
-  auto column = cudf::test::fixed_width_column_wrapper<int>{{5, 0}};
-  auto expect = cudf::test::fixed_width_column_wrapper<int>{{125, 0}};
-
-  // make program
-  auto program = cache.getProgram("FileCacheTestProg", program_source);
-  // make kernel that should NOT need to be compiled
-  auto kernel = cache.getKernelInstantiation("my_kernel", program, {"3", "int"});
-  (*std::get<1>(kernel))
-    .configure(grid, block)
-    .launch(column.operator cudf::mutable_column_view().data<int>());
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-}
-#endif
-
-}  // namespace test
-}  // namespace cudf
-
-CUDF_TEST_PROGRAM_MAIN()
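The key property these deleted tests checked is worth stating plainly: a lookup under an existing key must return the cached artifact and never invoke the new builder. Reduced to a toy std::map cache (a hypothetical standalone sketch, unrelated to the real cudfJitCache API), the MemoryCacheKernelTest logic looks like this:

```cpp
// Sketch of the "rename to force a cache hit" trick used above, reduced to a
// plain map-based cache. If the lookup ignores the new builder and returns
// the old entry, the cached value (125), not the recompiled one (625), is
// observed, which is exactly what the deleted test asserted.
#include <cassert>
#include <functional>
#include <map>
#include <string>

struct Cache {
  std::map<std::string, int> entries;  // name -> "compiled" result
  int get_or_build(std::string const& name, std::function<int()> build)
  {
    auto it = entries.find(name);
    if (it != entries.end()) return it->second;  // cache hit: builder ignored
    return entries[name] = build();
  }
};

int main()
{
  Cache cache;
  cache.get_or_build("prog", [] { return 125; });  // warm the cache
  // Re-request the same key with a *different* builder; a correct cache
  // returns the original entry and never runs the new builder.
  int const v = cache.get_or_build("prog", [] { return 625; });
  assert(v == 125);
  return 0;
}
```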
diff --git a/cpp/tests/jit/jit-cache-test.hpp b/cpp/tests/jit/jit-cache-test.hpp
deleted file mode 100644
index 261cc0fd3b4..00000000000
--- a/cpp/tests/jit/jit-cache-test.hpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-
-#include
-#include
-#include
-
-#include
-
-// Note that this test does not inherit from cudf::test::BaseFixture because
-// doing so would cause the CUDA context to be created before the fork in
-// the JitCacheMultiProcessTest, where we need it to be created after the fork
-// to ensure the forked child has a context. These tests do not need the
-// memory_resource member of BaseFixture.
-struct JitCacheTest : public ::testing::Test, public cudf::jit::cudfJitCache {
-  JitCacheTest() : grid(1), block(1) {}
-
-  virtual ~JitCacheTest() {}
-
-  virtual void SetUp()
-  {
-    purgeFileCache();
-    warmUp();
-  }
-
-  virtual void TearDown() { purgeFileCache(); }
-
-  void purgeFileCache()
-  {
-#if defined(JITIFY_USE_CACHE)
-    // In the multi-process test there are two processes repeatedly creating and deleting the
-    // cache. While deleting it, we cannot use `filesystem::remove_all(cudf::jit::getCacheDir())`
-    // because that would recursively remove all files within the cache directory and then finally
-    // remove the directory itself; a non-empty directory cannot be removed and throws an
-    // exception. On slower disks there are times when one process is deleting the cache while the
-    // other is creating it: just as the deleting process finishes removing the directory's
-    // contents and is about to remove the directory itself, the other process can create a new
-    // cache file in it, and the now non-empty directory makes the removal throw.
-
-    // By walking the cache directory and deleting only the cache files, we leave the directory
-    // alone, so the scenario above cannot occur.
-    std::vector<boost::filesystem::path> file_paths;
-    for (auto& path : boost::filesystem::recursive_directory_iterator(cudf::jit::getCacheDir())) {
-      if (boost::filesystem::is_regular_file(path)) { file_paths.push_back(path); }
-    }
-    for (auto& file_path : file_paths) { boost::filesystem::remove(file_path); }
-#endif
-  }
-
-  void warmUp()
-  {
-    // Prime the cache so that both the in-memory and file caches are populated
-
-    // Single value column
-    auto column = cudf::test::fixed_width_column_wrapper<int>({4, 0});
-    auto expect = cudf::test::fixed_width_column_wrapper<int>({64, 0});
-
-    // make program
-    auto program = getProgram("MemoryCacheTestProg", program_source);
-    // make kernel
-    auto kernel = getKernelInstantiation("my_kernel", program, {"3", "int"});
-    (*std::get<1>(kernel))
-      .configure(grid, block)
-      .launch(column.operator cudf::mutable_column_view().data<int>());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, column);
-  }
-
-  const char* program_source =
-    "my_program\n"
-    "template<int N, typename T>\n"
-    "__global__\n"
-    "void my_kernel(T* data) {\n"
-    "    T data0 = data[0];\n"
-    "    for( int i=0; i<N-1; ++i ) {\n"
-    "        data[0] *= data0;\n"
-    "    }\n"
-    "}\n";
-
-  const char* program2_source =
-    "my_program\n"
-    "template<int N, typename T>\n"
-    "__global__\n"
-    "void my_kernel(T* data) {\n"
-    "    T data0 = data[0];\n"
-    "    for( int i=0; i<N; ++i ) {\n"
-    "        data[0] *= data0;\n"
-    "    }\n"
-    "}\n";
-
-  const char* program3_source =
-    "my_program\n"
-    "template<int N, typename T>\n"
-    "__global__\n"
-    "void my_kernel(T* data, T* out) {\n"
-    "    T data0 = data[0];\n"
-    "    for( int i=0; i<N; ++i ) {\n"
-    "        out[0] *= data0;\n"
-    "    }\n"
-    "}\n";
-
-  dim3 grid;
-  dim3 block;
-};
-
-// The multiprocess fixture must not warm up the cache in SetUp: doing so
-// would launch a kernel and create the CUDA context before the fork.
-struct JitCacheMultiProcessTest : public JitCacheTest {
-  virtual void SetUp() { purgeFileCache(); }
-};
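One detail of the deleted fixture deserves a note: purgeFileCache removes only regular files and deliberately leaves the cache directory in place, so a concurrent writer can never race a directory removal. The same strategy in a standalone C++17 sketch (std::filesystem in place of Boost; `cache_dir` is a placeholder for wherever the JIT cache lives):

```cpp
// Files-only cache purge, per the deleted fixture's comment: remove the cache
// files but never the directory itself, so a concurrent process creating a
// file cannot collide with a directory removal.
#include <filesystem>
#include <vector>

void purge_cache_files(std::filesystem::path const& cache_dir)
{
  namespace fs = std::filesystem;
  std::vector<fs::path> files;
  for (auto const& entry : fs::recursive_directory_iterator(cache_dir)) {
    if (entry.is_regular_file()) { files.push_back(entry.path()); }
  }
  // Collect first, delete second: removing entries while iterating would
  // invalidate the directory iterator.
  for (auto const& f : files) { fs::remove(f); }
}
```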
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ ... @@ TEST_F(RollingTestUdf, StaticWindow)
   std::unique_ptr<cudf::column> output;
 
-  auto start = cudf::detail::make_counting_transform_iterator(0, [size] __device__(size_type row) {
+  auto start = cudf::detail::make_counting_transform_iterator(0, [size](size_type row) {
     return std::accumulate(thrust::make_counting_iterator(std::max(0, row - 2 + 1)),
                            thrust::make_counting_iterator(std::min(size, row + 2 + 1)),
                            0);
   });
 
-  auto valid = cudf::detail::make_counting_transform_iterator(0, [size] __device__(size_type row) {
-    return (row != 0 && row != size - 2 && row != size - 1);
-  });
+  auto valid = cudf::detail::make_counting_transform_iterator(
+    0, [size](size_type row) { return (row != 0 && row != size - 2 && row != size - 1); });
 
   fixed_width_column_wrapper<int64_t> expected{start, start + size, valid};
 
@@ -895,7 +894,7 @@ TEST_F(RollingTestUdf, StaticWindow)
   auto cuda_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::CUDA, this->cuda_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, 2, 2, 4, cuda_udf_agg));
+  output = cudf::rolling_window(input, 2, 2, 4, cuda_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 
@@ -903,7 +902,7 @@ TEST_F(RollingTestUdf, StaticWindow)
   auto ptx_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::PTX, this->ptx_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, 2, 2, 4, ptx_udf_agg));
+  output = cudf::rolling_window(input, 2, 2, 4, ptx_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 }
 
@@ -941,7 +940,7 @@ TEST_F(RollingTestUdf, DynamicWindow)
   auto cuda_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::CUDA, this->cuda_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, preceding, following, 2, cuda_udf_agg));
+  output = cudf::rolling_window(input, preceding, following, 2, cuda_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 
@@ -949,7 +948,7 @@ TEST_F(RollingTestUdf, DynamicWindow)
   auto ptx_udf_agg = cudf::make_udf_aggregation(
     cudf::udf_type::PTX, this->ptx_func, cudf::data_type{cudf::type_id::INT64});
 
-  EXPECT_NO_THROW(output = cudf::rolling_window(input, preceding, following, 2, ptx_udf_agg));
+  output = cudf::rolling_window(input, preceding, following, 2, ptx_udf_agg);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, expected);
 }
 
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index a54c86405a5..5205124c129 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -32,7 +32,7 @@
 #include
 #include
 
-#include
+#include
 
 #include
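Finally, on the rolling_test.cpp hunks above: the `__device__` qualifiers were dropped because these lambdas only build expected values on the host via std::accumulate. A standalone sketch of that reference computation follows, assuming, as the test lambdas do, an input column of 0, 1, ..., size-1 and a window of 2 preceding and 2 following rows:

```cpp
// Host-side reference for the rolling sum checked above. Mirrors the test
// lambda: sum input rows in [row - 2 + 1, row + 2 + 1) clamped to [0, size),
// where input[i] == i by assumption.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> expected_rolling_sum(int size)
{
  std::vector<int64_t> out(static_cast<size_t>(size));
  for (int row = 0; row < size; ++row) {
    int const lo = std::max(0, row - 2 + 1);     // 2 preceding, current row included
    int const hi = std::min(size, row + 2 + 1);  // 2 following; hi is exclusive
    int64_t sum  = 0;
    for (int i = lo; i < hi; ++i) sum += i;      // input[i] == i by assumption
    out[static_cast<size_t>(row)] = sum;
  }
  return out;
}
```

Dropping EXPECT_NO_THROW around these calls is also a readability win: an uncaught exception still fails the test, but the failure now surfaces with the real exception message instead of a generic "statement threw".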