diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index bf108ddf66..690da3f702 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -808,6 +808,13 @@ struct json_path_processing_data { * * The number of warps processing each row is computed as `ceil(num_paths / warp_size)`. * + * We explicitly set a value for `min_block_per_sm` parameter in the launch bounds to avoid + * spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between + * the maximum number of registers used by a kernel and the parallelism of the kernel. + * If lots of registers are used the parallelism may suffer. But in our case NVCC gets this wrong + * and we want to avoid spilling all the time or else the performance is really bad. This + * essentially tells NVCC to prefer using lots of registers over spilling. + * * @param input The input JSON strings stored in a strings column * @param path_data Array containing all path data * @param num_threads_per_row Number of threads processing each input row @@ -848,51 +855,27 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL /** * @brief A utility class to launch the main kernel. - * - * It caches the kernel launch parameters to reuse multiple times. */ -class kernel_launcher { - public: - explicit kernel_launcher(cudf::size_type _input_size, std::size_t _path_size) - : input_size{_input_size}, - path_size{_path_size}, - num_threads_per_row{cudf::util::div_rounding_up_safe( - path_size, static_cast(cudf::detail::warp_size)) * - cudf::detail::warp_size}, - num_blocks{cudf::util::div_rounding_up_safe(num_threads_per_row * input_size, - static_cast(block_size))} +struct kernel_launcher { + static void exec(cudf::column_device_view const& input, + cudf::device_span path_data, + rmm::cuda_stream_view stream) { - } - - void exec(cudf::column_device_view const& input, - cudf::device_span path_data, - rmm::cuda_stream_view stream) const - { - CUDF_EXPECTS(input.size() == input_size && path_data.size() == path_size, - "Unexpected data sizes upon launching kernel."); + // The optimal values for block_size and min_block_per_sm were found through testing, + // which are either 128-8 or 256-4. The pair 128-8 seems a bit better. + static constexpr int block_size = 128; + static constexpr int min_block_per_sm = 8; + + // The number of threads for processing one input row is at least one warp. + auto const num_threads_per_row = + cudf::util::div_rounding_up_safe(path_data.size(), + static_cast(cudf::detail::warp_size)) * + cudf::detail::warp_size; + auto const num_blocks = cudf::util::div_rounding_up_safe(num_threads_per_row * input.size(), + static_cast(block_size)); get_json_object_kernel <<>>(input, path_data, num_threads_per_row); } - - private: - // We explicitly set the minBlocksPerMultiprocessor parameter in the launch bounds to avoid - // spilling from the kernel itself. By default NVCC uses a heuristic to find a balance between - // the maximum number of registers used by a kernel and the parallelism of the kernel. - // If lots of registers are used the parallelism may suffer. But in our case - // NVCC gets this wrong and we want to avoid spilling all the time or else - // the performance is really bad. This essentially tells NVCC to prefer using lots - // of registers over spilling. - // - // The optimal values for block_size and min_block_per_sm were found through testing, - // which are 128-8 or 256-4. - static constexpr int block_size = 128; - static constexpr int min_block_per_sm = 8; - - cudf::size_type const input_size; - std::size_t const path_size; - - std::size_t const num_threads_per_row; - std::size_t const num_blocks; }; /** @@ -1038,9 +1021,7 @@ std::vector> get_json_object( h_path_data, stream, rmm::mr::get_current_device_resource()); thrust::uninitialized_fill( rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0); - - auto const kernel = kernel_launcher{input.size(), json_paths.size()}; - kernel.exec(*d_input_ptr, d_path_data, stream); + kernel_launcher::exec(*d_input_ptr, d_path_data, stream); // Do not use parallel check since we do not have many elements. auto h_has_out_of_bound = cudf::detail::make_host_vector_sync(d_has_out_of_bound, stream); @@ -1112,7 +1093,7 @@ std::vector> get_json_object( h_path_data, stream, rmm::mr::get_current_device_resource()); thrust::uninitialized_fill( rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0); - kernel.exec(*d_input_ptr, d_path_data, stream); + kernel_launcher::exec(*d_input_ptr, d_path_data, stream); // Check out of bound again to make sure everything looks right. h_has_out_of_bound = cudf::detail::make_host_vector_sync(d_has_out_of_bound, stream); @@ -1123,12 +1104,14 @@ std::vector> get_json_object( // If OOB is still detected, there must be something wrong happened. CUDF_EXPECTS(has_no_oob, "Unexpected out-of-bound write in get_json_object kernel."); - for (auto const idx : oob_indices) { - output[idx] = cudf::make_strings_column(input.size(), - std::move(out_offsets_and_sizes[idx].first), - out_char_buffers[idx].release(), - out_null_masks_and_null_counts[idx].second, - std::move(out_null_masks_and_null_counts[idx].first)); + for (std::size_t idx = 0; idx < oob_indices.size(); ++idx) { + auto const out_idx = oob_indices[idx]; + output[out_idx] = + cudf::make_strings_column(input.size(), + std::move(out_offsets_and_sizes[idx].first), + out_char_buffers[idx].release(), + out_null_masks_and_null_counts[idx].second, + std::move(out_null_masks_and_null_counts[idx].first)); } return output; } diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index 9c3ba6a4d3..59d6c2bcb0 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -642,6 +642,29 @@ void getJsonObjectMultiplePathsTest() { } } + @Test + void getJsonObjectMultiplePathsTest_JNIKernelCalledTwice() { + List path0 = Arrays.asList(namedPath("k0")); + List path1 = Arrays.asList(namedPath("k1")); + List path2 = Arrays.asList(); + List> paths = Arrays.asList(path0, path1, path2); + try (ColumnVector jsonCv = ColumnVector.fromStrings("{\"k0\": \"v0\", \"k1\": \"v1\"}", "['\n\n\n\n\n\n\n\n\n\n']"); + ColumnVector expected0 = ColumnVector.fromStrings("v0", null); + ColumnVector expected1 = ColumnVector.fromStrings("v1", null); + ColumnVector expected2 = ColumnVector.fromStrings("{\"k0\":\"v0\",\"k1\":\"v1\"}", "[\"\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\"]")) { + ColumnVector[] output = JSONUtils.getJsonObjectMultiplePaths(jsonCv, paths); + try { + assertColumnsAreEqual(expected0, output[0]); + assertColumnsAreEqual(expected1, output[1]); + assertColumnsAreEqual(expected2, output[2]); + } finally { + for (ColumnVector cv : output) { + cv.close(); + } + } + } + } + private JSONUtils.PathInstructionJni wildcardPath() { return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.WILDCARD, "", -1); }