diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 60cbfbc0dae..288a5690282 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -417,6 +418,7 @@ struct bitfield_block { * @param null_mask Null mask * @param null_count_data pointer to store null count * @param options Settings for controlling string processing behavior + * @param d_sizes Output size of each row * @param d_offsets Offsets to identify where to store the results for each string * @param d_chars Character array to store the characters of strings */ @@ -427,7 +429,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, bitmask_type* null_mask, size_type* null_count_data, cudf::io::parse_options_view const options, - size_type* d_offsets, + size_type* d_sizes, + cudf::detail::input_offsetalator d_offsets, char* d_chars) { constexpr auto BLOCK_SIZE = @@ -455,7 +458,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, istring = get_next_string()) { // skip nulls if (null_mask != nullptr && not bit_is_set(null_mask, istring)) { - if (!d_chars && lane == 0) d_offsets[istring] = 0; + if (!d_chars && lane == 0) { d_sizes[istring] = 0; } continue; // gride-stride return; } @@ -476,7 +479,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, if (lane == 0) { clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[istring] = 0; + if (!d_chars) { d_sizes[istring] = 0; } } continue; // gride-stride return; } @@ -491,7 +494,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, // Copy literal/numeric value if (not is_string_value) { if (!d_chars) { - if (lane == 0) { d_offsets[istring] = in_end - in_begin; } + if (lane == 0) { d_sizes[istring] = in_end - in_begin; } } else { for (thread_index_type char_index = lane; char_index < (in_end - in_begin); char_index += BLOCK_SIZE) { @@ -621,8 +624,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, clear_bit(null_mask, istring); atomicAdd(null_count_data, 1); } - last_offset = 0; - d_offsets[istring] = 0; + last_offset = 0; + d_sizes[istring] = 0; } if constexpr (!is_warp) { __syncthreads(); } break; // gride-stride return; @@ -729,7 +732,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, } } } // char for-loop - if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; } + if (!d_chars && lane == 0) { d_sizes[istring] = last_offset; } } // grid-stride for-loop } @@ -739,13 +742,14 @@ struct string_parse { bitmask_type* null_mask; size_type* null_count_data; cudf::io::parse_options_view const options; - size_type* d_offsets{}; + size_type* d_sizes{}; + cudf::detail::input_offsetalator d_offsets; char* d_chars{}; __device__ void operator()(size_type idx) { if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } auto const in_begin = str_tuples[idx].first; @@ -761,7 +765,7 @@ struct string_parse { if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); atomicAdd(null_count_data, 1); - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } return; } } @@ -773,9 +777,9 @@ struct string_parse { clear_bit(null_mask, idx); atomicAdd(null_count_data, 1); } - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_sizes[idx] = 0; } } else { - if (!d_chars) d_offsets[idx] = str_process_info.bytes; + if (!d_chars) { d_sizes[idx] = str_process_info.bytes; } } } }; @@ -811,13 +815,12 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, size_type{0}, thrust::maximum{}); - auto offsets = cudf::make_numeric_column( - data_type{type_to_id()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr); - auto d_offsets = offsets->mutable_view().data(); + auto sizes = rmm::device_uvector(col_size, stream); + auto d_sizes = sizes.data(); auto null_count_data = d_null_count.data(); auto single_thread_fn = string_parse{ - str_tuples, static_cast(null_mask.data()), null_count_data, options, d_offsets}; + str_tuples, static_cast(null_mask.data()), null_count_data, options, d_sizes}; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, @@ -838,7 +841,8 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, - d_offsets, + d_sizes, + cudf::detail::input_offsetalator{}, nullptr); } @@ -853,20 +857,22 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, - d_offsets, + d_sizes, + cudf::detail::input_offsetalator{}, nullptr); } - auto const bytes = - cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream); - CUDF_EXPECTS(bytes <= std::numeric_limits::max(), - "Size of output exceeds the column size limit", - std::overflow_error); + + auto [offsets, bytes] = + cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr); + auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); // CHARS column rmm::device_uvector chars(bytes, stream, mr); auto d_chars = chars.data(); - single_thread_fn.d_chars = d_chars; + single_thread_fn.d_chars = d_chars; + single_thread_fn.d_offsets = d_offsets; + thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), col_size, @@ -882,6 +888,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, + d_sizes, d_offsets, d_chars); } @@ -897,6 +904,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, static_cast(null_mask.data()), null_count_data, options, + d_sizes, d_offsets, d_chars); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 5d790e73246..57aa2721756 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2374,7 +2374,6 @@ TEST_F(JsonReaderTest, MapTypes) EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type"; i++; } - std::cout << "\n"; }; // json