Skip to content

Commit

Permalink
Add null-count parameter to json experimental parse_data utility (#13107
Browse files Browse the repository at this point in the history
)

Add `null_count` parameter to the `cudf::io::json::experimental::detail::parse_data` function, which already accepts a `null_mask`. Normally, the callers already know the count. This function can use the parameter to help build the output column.

Found while working on #13081
Contributes to: #11968

Authors:
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)

URL: #13107
  • Loading branch information
davidwendt authored Apr 17, 2023
1 parent a6fb6a2 commit 7c3a34e
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 21 deletions.
41 changes: 30 additions & 11 deletions cpp/include/cudf/io/detail/data_casting.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ template <typename str_tuple_it>
struct string_parse {
str_tuple_it str_tuples;
bitmask_type* null_mask;
size_type* null_count_data;
cudf::io::parse_options_view const options;
size_type* d_offsets{};
char* d_chars{};
Expand All @@ -319,14 +320,18 @@ struct string_parse {
serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
if (is_null_literal && null_mask != nullptr) {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
if (!d_chars) d_offsets[idx] = 0;
return;
}

char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
if (null_mask != nullptr) clear_bit(null_mask, idx);
if (null_mask != nullptr) {
clear_bit(null_mask, idx);
atomicAdd(null_count_data, 1);
}
if (!d_chars) d_offsets[idx] = 0;
} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
Expand All @@ -350,36 +355,44 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
size_type col_size,
data_type col_type,
B&& null_mask,
size_type null_count,
cudf::io::parse_options_view const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();

auto d_null_count = rmm::device_scalar<size_type>(null_count, stream);
auto null_count_data = d_null_count.data();

if (col_type == cudf::data_type{cudf::type_id::STRING}) {
// this utility calls the functor to build the offsets and chars columns
// this utility calls the functor to build the offsets and chars columns;
// the bitmask and null count may be updated by parse failures
auto [offsets, chars] = cudf::strings::detail::make_strings_children(
string_parse<decltype(str_tuples)>{
str_tuples, static_cast<bitmask_type*>(null_mask.data()), options},
str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options},
col_size,
stream,
mr);

auto null_count =
cudf::detail::null_count(static_cast<bitmask_type*>(null_mask.data()), 0, col_size, stream);
return make_strings_column(
col_size, std::move(offsets), std::move(chars), null_count, std::move(null_mask));
return make_strings_column(col_size,
std::move(offsets),
std::move(chars),
d_null_count.value(stream),
std::move(null_mask));
}

auto out_col = make_fixed_width_column(
col_type, col_size, std::move(null_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr);
auto out_col =
make_fixed_width_column(col_type, col_size, std::move(null_mask), null_count, stream, mr);
auto output_dv_ptr = mutable_column_device_view::create(*out_col, stream);

// use existing code (`ConvertFunctor`) to convert values
thrust::for_each_n(
rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
col_size,
[str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) {
[str_tuples, col = *output_dv_ptr, options, col_type, null_count_data] __device__(
size_type row) {
if (col.is_null(row)) { return; }
auto const in = str_tuples[row];

Expand All @@ -388,6 +401,7 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,

if (is_null_literal) {
col.set_null(row);
atomicAdd(null_count_data, 1);
return;
}

Expand All @@ -403,9 +417,14 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
col_type,
options,
false);
if (not is_parsed) { col.set_null(row); }
if (not is_parsed) {
col.set_null(row);
atomicAdd(null_count_data, 1);
}
});

out_col->set_null_count(d_null_count.value(stream));

return out_col;
}

Expand Down
9 changes: 6 additions & 3 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,8 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
auto d_column_names = experimental::detail::parse_data(string_views.begin(),
num_strings,
data_type{type_id::STRING},
rmm::device_buffer{0, stream},
rmm::device_buffer{},
0,
options_view,
stream,
rmm::mr::get_current_device_resource());
Expand Down Expand Up @@ -790,12 +791,14 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
target_type = cudf::io::detail::infer_data_type(
options.json_view(), d_input, string_ranges_it, col_size, stream);
}
validity_size_check(json_col);

auto [result_bitmask, null_count] = make_validity(json_col);
// Convert strings to the inferred data type
auto col = experimental::detail::parse_data(string_spans_it,
col_size,
target_type,
json_col.validity.release(),
std::move(result_bitmask),
null_count,
options.view(),
stream,
mr);
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1656,11 +1656,14 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream);
}

auto [result_bitmask, null_count] = make_validity(json_col);

// Convert strings to the inferred data type
auto col = experimental::detail::parse_data(string_spans_it,
col_size,
target_type,
make_validity(json_col).first,
std::move(result_bitmask),
null_count,
parsing_options(options).view(),
stream,
mr);
Expand Down
30 changes: 24 additions & 6 deletions cpp/tests/io/json_type_cast_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,14 @@ TEST_F(JSONTypeCastTest, String)
auto null_mask =
std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));

auto str_col = cudf::io::json::experimental::detail::parse_data(
svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr);
auto str_col = cudf::io::json::experimental::detail::parse_data(svs.data(),
svs.size(),
type,
std::move(null_mask),
0,
default_json_options().view(),
stream,
mr);

auto out_valids =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 2 and i != 4; });
Expand Down Expand Up @@ -109,8 +115,14 @@ TEST_F(JSONTypeCastTest, Int)
auto null_mask =
std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));

auto col = cudf::io::json::experimental::detail::parse_data(
svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr);
auto col = cudf::io::json::experimental::detail::parse_data(svs.data(),
svs.size(),
type,
std::move(null_mask),
0,
default_json_options().view(),
stream,
mr);

auto expected =
cudf::test::fixed_width_column_wrapper<int64_t>{{1, 2, 3, 1, 5, 0}, {1, 0, 1, 1, 1, 1}};
Expand Down Expand Up @@ -146,8 +158,14 @@ TEST_F(JSONTypeCastTest, StringEscapes)
auto null_mask =
std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + d_column->size()));

auto col = cudf::io::json::experimental::detail::parse_data(
svs.data(), svs.size(), type, std::move(null_mask), default_json_options().view(), stream, mr);
auto col = cudf::io::json::experimental::detail::parse_data(svs.data(),
svs.size(),
type,
std::move(null_mask),
0,
default_json_options().view(),
stream,
mr);

auto expected = cudf::test::strings_column_wrapper{
{"🚀", "A🚀AA", "", "", "", "\\", "", "", "\"\\/\b\f\n\r\t"},
Expand Down

0 comments on commit 7c3a34e

Please sign in to comment.