Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Use offsetalator in cudf::io::json::detail::parse_string #15900

Merged
merged 1 commit into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 32 additions & 24 deletions cpp/src/io/utilities/data_casting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/detail/strings_children.cuh>
Expand Down Expand Up @@ -417,6 +418,7 @@ struct bitfield_block {
* @param null_mask Null mask
* @param null_count_data Pointer to store null count
* @param options Settings for controlling string processing behavior
* @param d_sizes Output size of each row
* @param d_offsets Offsets to identify where to store the results for each string
* @param d_chars Character array to store the characters of strings
*/
Expand All @@ -427,7 +429,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
bitmask_type* null_mask,
size_type* null_count_data,
cudf::io::parse_options_view const options,
size_type* d_offsets,
size_type* d_sizes,
cudf::detail::input_offsetalator d_offsets,
char* d_chars)
{
constexpr auto BLOCK_SIZE =
Expand Down Expand Up @@ -455,7 +458,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
istring = get_next_string()) {
// skip nulls
if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
if (!d_chars && lane == 0) d_offsets[istring] = 0;
if (!d_chars && lane == 0) { d_sizes[istring] = 0; }
continue; // grid-stride return;
}

Expand All @@ -476,7 +479,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
if (lane == 0) {
clear_bit(null_mask, istring);
atomicAdd(null_count_data, 1);
if (!d_chars) d_offsets[istring] = 0;
if (!d_chars) { d_sizes[istring] = 0; }
}
continue; // grid-stride return;
}
Expand All @@ -491,7 +494,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
// Copy literal/numeric value
if (not is_string_value) {
if (!d_chars) {
if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
if (lane == 0) { d_sizes[istring] = in_end - in_begin; }
} else {
for (thread_index_type char_index = lane; char_index < (in_end - in_begin);
char_index += BLOCK_SIZE) {
Expand Down Expand Up @@ -621,8 +624,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
clear_bit(null_mask, istring);
atomicAdd(null_count_data, 1);
}
last_offset = 0;
d_offsets[istring] = 0;
last_offset = 0;
d_sizes[istring] = 0;
}
if constexpr (!is_warp) { __syncthreads(); }
break; // grid-stride return;
Expand Down Expand Up @@ -729,7 +732,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
}
}
} // char for-loop
if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
if (!d_chars && lane == 0) { d_sizes[istring] = last_offset; }
} // grid-stride for-loop
}

Expand All @@ -739,13 +742,14 @@ struct string_parse {
bitmask_type* null_mask;
size_type* null_count_data;
cudf::io::parse_options_view const options;
size_type* d_offsets{};
size_type* d_sizes{};
cudf::detail::input_offsetalator d_offsets;
char* d_chars{};

__device__ void operator()(size_type idx)
{
if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
auto const in_begin = str_tuples[idx].first;
Expand All @@ -761,7 +765,7 @@ struct string_parse {
if (is_null_literal && null_mask != nullptr) {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
return;
}
}
Expand All @@ -773,9 +777,9 @@ struct string_parse {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
}
if (!d_chars) d_offsets[idx] = 0;
if (!d_chars) { d_sizes[idx] = 0; }
} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
if (!d_chars) { d_sizes[idx] = str_process_info.bytes; }
}
}
};
Expand Down Expand Up @@ -811,13 +815,12 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
size_type{0},
thrust::maximum<size_type>{});

auto offsets = cudf::make_numeric_column(
data_type{type_to_id<size_type>()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
auto d_offsets = offsets->mutable_view().data<size_type>();
auto sizes = rmm::device_uvector<size_type>(col_size, stream);
auto d_sizes = sizes.data();
auto null_count_data = d_null_count.data();

auto single_thread_fn = string_parse<decltype(str_tuples)>{
str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_offsets};
str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_sizes};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
Expand All @@ -838,7 +841,8 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_offsets,
d_sizes,
cudf::detail::input_offsetalator{},
nullptr);
}

Expand All @@ -853,20 +857,22 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_offsets,
d_sizes,
cudf::detail::input_offsetalator{},
nullptr);
}
auto const bytes =
cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
"Size of output exceeds the column size limit",
std::overflow_error);

auto [offsets, bytes] =
cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// CHARS column
rmm::device_uvector<char> chars(bytes, stream, mr);
auto d_chars = chars.data();

single_thread_fn.d_chars = d_chars;
single_thread_fn.d_chars = d_chars;
single_thread_fn.d_offsets = d_offsets;

thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
Expand All @@ -882,6 +888,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_sizes,
d_offsets,
d_chars);
}
Expand All @@ -897,6 +904,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
static_cast<bitmask_type*>(null_mask.data()),
null_count_data,
options,
d_sizes,
d_offsets,
d_chars);
}
Expand Down
1 change: 0 additions & 1 deletion cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2374,7 +2374,6 @@ TEST_F(JsonReaderTest, MapTypes)
EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
i++;
}
std::cout << "\n";
};

// json
Expand Down
Loading