Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add null-count parameter to json experimental parse_data utility #13107

Merged
merged 14 commits (source and target branch names were not captured in this extract)
Apr 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions cpp/include/cudf/io/detail/data_casting.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ template <typename str_tuple_it>
struct string_parse {
str_tuple_it str_tuples;
bitmask_type* null_mask;
size_type* null_count_data;
cudf::io::parse_options_view const options;
size_type* d_offsets{};
char* d_chars{};
Expand All @@ -319,14 +320,18 @@ struct string_parse {
serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
if (is_null_literal && null_mask != nullptr) {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
if (!d_chars) d_offsets[idx] = 0;
return;
}

char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
if (null_mask != nullptr) clear_bit(null_mask, idx);
if (null_mask != nullptr) {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
}
if (!d_chars) d_offsets[idx] = 0;
} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
Expand All @@ -350,36 +355,44 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
size_type col_size,
data_type col_type,
B&& null_mask,
size_type null_count,
cudf::io::parse_options_view const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();

auto d_null_count = rmm::device_scalar<size_type>(null_count, stream);
auto null_count_data = d_null_count.data();

if (col_type == cudf::data_type{cudf::type_id::STRING}) {
// this utility calls the functor to build the offsets and chars columns
// this utility calls the functor to build the offsets and chars columns;
// the bitmask and null count may be updated by parse failures
auto [offsets, chars] = cudf::strings::detail::make_strings_children(
string_parse<decltype(str_tuples)>{
str_tuples, static_cast<bitmask_type*>(null_mask.data()), options},
str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
col_size,
stream,
mr);

auto null_count =
cudf::detail::null_count(static_cast<bitmask_type*>(null_mask.data()), 0, col_size, stream);
return make_strings_column(
col_size, std::move(offsets), std::move(chars), null_count, std::move(null_mask));
return make_strings_column(col_size,
std::move(offsets),
std::move(chars),
d_null_count.value(stream),
std::move(null_mask));
}

auto out_col = make_fixed_width_column(
col_type, col_size, std::move(null_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
auto out_col =
make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr);
auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream);

// use existing code (`ConvertFunctor`) to convert values
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
[str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) {
[str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__(
size_type row) {
if (col.is_null(row)) { return; }
auto const in = str_tuples[row];

Expand All @@ -388,6 +401,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,

if (is_null_literal) {
col.set_null(row);
atomicAdd(null_count_data, 1);
return;
}

Expand All @@ -403,9 +417,14 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
col_type,
options,
false);
if (not is_parsed) { col.set_null(row); }
if (not is_parsed) {
col.set_null(row);
atomicAdd(null_count_data, 1);
}
});

out_col->set_null_count(d_null_count.value(stream));

return out_col;
}

Expand Down
9 changes: 6 additions & 3 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,8 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
auto d_column_names = experimental::detail::parse_data(string_views.begin(),
num_strings,
data_type{type_id::STRING},
rmm::device_buffer{0, stream},
rmm::device_buffer{},
0,
options_view,
stream,
rmm::mr::get_current_device_resource());
Expand Down Expand Up @@ -790,12 +791,14 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
target_type = cudf::io::detail::infer_data_type(
options.json_view(), d_input, string_ranges_it, col_size, stream);
}
validity_size_check(json_col);

auto [result_bitmask, null_count] = make_validity(json_col);
// Convert strings to the inferred data type
auto col = experimental::detail::parse_data(string_spans_it,
col_size,
target_type,
json_col.validity.release(),
std::move(result_bitmask),
null_count,
options.view(),
stream,
mr);
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1656,11 +1656,14 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream);
}

auto [result_bitmask, null_count] = make_validity(json_col);

// Convert strings to the inferred data type
auto col = experimental::detail::parse_data(string_spans_it,
col_size,
target_type,
make_validity(json_col).first,
std::move(result_bitmask),
null_count,
parsing_options(options).view(),
stream,
mr);
Expand Down
30 changes: 24 additions & 6 deletions cpp/tests/io/json_type_cast_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,14 @@ TEST_F(JSONTypeCastTest, String)
auto null_mask =
std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));

auto str_col = cudf::io::json::experimental::detail::parse_data(
svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr);
auto str_col = cudf::io::json::experimental::detail::parse_data(svs.data(),
svs.size(),
type,
std::move(null_mask),
0,
default_json_options().view(),
stream,
mr);

auto out_valids =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; });
Expand Down Expand Up @@ -109,8 +115,14 @@ TEST_F(JSONTypeCastTest, Int)
auto null_mask =
std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));

auto col = cudf::io::json::experimental::detail::parse_data(
svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr);
auto col = cudf::io::json::experimental::detail::parse_data(svs.data(),
svs.size(),
type,
std::move(null_mask),
0,
default_json_options().view(),
stream,
mr);

auto expected =
cudf::test::fixed_width_column_wrapper<int64_t>{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}};
Expand Down Expand Up @@ -146,8 +158,14 @@ TEST_F(JSONTypeCastTest, StringEscapes)
auto null_mask =
std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));

auto col = cudf::io::json::experimental::detail::parse_data(
svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr);
auto col = cudf::io::json::experimental::detail::parse_data(svs.data(),
svs.size(),
type,
std::move(null_mask),
0,
default_json_options().view(),
stream,
mr);

auto expected = cudf::test::strings_column_wrapper{
{"🚀", "A🚀AA", "", "", "", "\\", "➩", "", "\"\\/\b\f\n\r\t"},
Expand Down