Skip to content

Commit

Permalink
Use offsetalator in cudf::get_json_object() (#15009)
Browse files Browse the repository at this point in the history
Updates `cudf::get_json_object()` to use the offsetalator to build the output strings column.
It adds a sizes vector to hold the output row lengths which is then converted to offsets using the new `make_offsets_child_column()` utitlity.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #15009
  • Loading branch information
davidwendt authored Feb 14, 2024
1 parent ac4debd commit 3547d41
Showing 1 changed file with 17 additions and 16 deletions.
33 changes: 17 additions & 16 deletions cpp/src/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/json/json.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
Expand Down Expand Up @@ -903,7 +905,8 @@ template <int block_size>
__launch_bounds__(block_size) CUDF_KERNEL
void get_json_object_kernel(column_device_view col,
path_operator const* const commands,
size_type* output_offsets,
size_type* d_sizes,
cudf::detail::input_offsetalator output_offsets,
thrust::optional<char*> out_buf,
thrust::optional<bitmask_type*> out_validity,
thrust::optional<size_type*> out_valid_count,
Expand Down Expand Up @@ -934,7 +937,7 @@ __launch_bounds__(block_size) CUDF_KERNEL

// filled in only during the precompute step. during the compute step, the offsets
// are fed back in so we do -not- want to write them out
if (!out_buf.has_value()) { output_offsets[tid] = static_cast<size_type>(output_size); }
if (!out_buf.has_value()) { d_sizes[tid] = output_size; }

// validity filled in only during the output step
if (out_validity.has_value()) {
Expand Down Expand Up @@ -971,11 +974,6 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c

if (col.is_empty()) return make_empty_column(type_id::STRING);

// allocate output offsets buffer.
auto offsets = cudf::make_fixed_width_column(
data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
cudf::mutable_column_view offsets_view(*offsets);

// if the query is empty, return a string column containing all nulls
if (!std::get<0>(preprocess).has_value()) {
return std::make_unique<column>(
Expand All @@ -986,6 +984,11 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
col.size()); // null count
}

// compute output sizes
auto sizes =
rmm::device_uvector<size_type>(col.size(), stream, rmm::mr::get_current_device_resource());
auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(col.offsets());

constexpr int block_size = 512;
cudf::detail::grid_1d const grid{col.size(), block_size};
auto cdv = column_device_view::create(col.parent(), stream);
Expand All @@ -994,20 +997,17 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*cdv,
std::get<0>(preprocess).value().data(),
offsets_view.head<size_type>(),
sizes.data(),
d_offsets,
thrust::nullopt,
thrust::nullopt,
thrust::nullopt,
options);

// convert sizes to offsets
thrust::exclusive_scan(rmm::exec_policy(stream),
offsets_view.head<size_type>(),
offsets_view.head<size_type>() + col.size() + 1,
offsets_view.head<size_type>(),
0);
size_type const output_size =
cudf::detail::get_value<size_type>(offsets_view, col.size(), stream);
auto [offsets, output_size] =
cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// allocate output string column
rmm::device_uvector<char> chars(output_size, stream, mr);
Expand All @@ -1024,7 +1024,8 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*cdv,
std::get<0>(preprocess).value().data(),
offsets_view.head<size_type>(),
sizes.data(),
d_offsets,
chars.data(),
static_cast<bitmask_type*>(validity.data()),
d_valid_count.data(),
Expand Down

0 comments on commit 3547d41

Please sign in to comment.