Skip to content

Commit

Permalink
Fix all null list column with missing child column in JSON reader (#1…
Browse files Browse the repository at this point in the history
  • Loading branch information
karthikeyann authored Dec 6, 2024
1 parent cbeefd8 commit 14b4321
Show file tree
Hide file tree
Showing 5 changed files with 317 additions and 86 deletions.
126 changes: 86 additions & 40 deletions cpp/src/io/json/host_tree_algorithms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -222,18 +222,19 @@ struct json_column_data {
using hashmap_of_device_columns =
std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;

std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
device_json_column& root,
host_span<uint8_t const> is_str_column_all_nulls,
tree_meta_t& d_column_tree,
device_span<NodeIndexT const> d_unique_col_ids,
device_span<size_type const> d_max_row_offsets,
std::vector<std::string> const& column_names,
NodeIndexT row_array_parent_col_id,
bool is_array_of_arrays,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
/**
 * @brief Forward declaration of build_tree; the definition appears later in this file.
 *
 * The returned tuple holds, in order:
 *  - the `is_pruned` host vector of per-column flags (consumed by the caller as `ignore_vals`),
 *  - the `is_mixed_pruned` host vector of per-column flags marking pruned list children, and
 *  - the map from NodeIndexT to device_json_column reference.
 * For an empty file both host vectors have size zero and the map is empty.
 */
std::
tuple<cudf::detail::host_vector<bool>, cudf::detail::host_vector<bool>, hashmap_of_device_columns>
build_tree(device_json_column& root,
host_span<uint8_t const> is_str_column_all_nulls,
tree_meta_t& d_column_tree,
device_span<NodeIndexT const> d_unique_col_ids,
device_span<size_type const> d_max_row_offsets,
std::vector<std::string> const& column_names,
NodeIndexT row_array_parent_col_id,
bool is_array_of_arrays,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

void scatter_offsets(tree_meta_t const& tree,
device_span<NodeIndexT const> col_ids,
Expand All @@ -242,6 +243,7 @@ void scatter_offsets(tree_meta_t const& tree,
device_span<size_type> sorted_col_ids, // Reuse this for parent_col_ids
tree_meta_t const& d_column_tree,
host_span<const bool> ignore_vals,
host_span<const bool> is_mixed,
hashmap_of_device_columns const& columns,
rmm::cuda_stream_view stream);

Expand Down Expand Up @@ -363,17 +365,17 @@ void make_device_json_column(device_span<SymbolT const> input,
}
return std::vector<uint8_t>();
}();
auto const [ignore_vals, columns] = build_tree(root,
is_str_column_all_nulls,
d_column_tree,
d_unique_col_ids,
d_max_row_offsets,
column_names,
row_array_parent_col_id,
is_array_of_arrays,
options,
stream,
mr);
auto const [ignore_vals, is_mixed_pruned, columns] = build_tree(root,
is_str_column_all_nulls,
d_column_tree,
d_unique_col_ids,
d_max_row_offsets,
column_names,
row_array_parent_col_id,
is_array_of_arrays,
options,
stream,
mr);
if (ignore_vals.empty()) return;
scatter_offsets(tree,
col_ids,
Expand All @@ -382,22 +384,24 @@ void make_device_json_column(device_span<SymbolT const> input,
sorted_col_ids,
d_column_tree,
ignore_vals,
is_mixed_pruned,
columns,
stream);
}

std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
device_json_column& root,
host_span<uint8_t const> is_str_column_all_nulls,
tree_meta_t& d_column_tree,
device_span<NodeIndexT const> d_unique_col_ids,
device_span<size_type const> d_max_row_offsets,
std::vector<std::string> const& column_names,
NodeIndexT row_array_parent_col_id,
bool is_array_of_arrays,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
std::
tuple<cudf::detail::host_vector<bool>, cudf::detail::host_vector<bool>, hashmap_of_device_columns>
build_tree(device_json_column& root,
host_span<uint8_t const> is_str_column_all_nulls,
tree_meta_t& d_column_tree,
device_span<NodeIndexT const> d_unique_col_ids,
device_span<size_type const> d_max_row_offsets,
std::vector<std::string> const& column_names,
NodeIndexT row_array_parent_col_id,
bool is_array_of_arrays,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
bool const is_enabled_lines = options.is_enabled_lines();
bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
Expand Down Expand Up @@ -488,7 +492,9 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
// NoPruning: iterate through schema and enforce type.

if (adj[parent_node_sentinel].empty())
return {cudf::detail::make_host_vector<bool>(0, stream), {}}; // for empty file
return {cudf::detail::make_host_vector<bool>(0, stream),
cudf::detail::make_host_vector<bool>(0, stream),
{}}; // for empty file
CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1");
auto expected_types = cudf::detail::make_host_vector<NodeT>(num_columns, stream);
std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES);
Expand Down Expand Up @@ -551,11 +557,14 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
auto list_child = schema.child_types.at(this_list_child_name);
for (auto const& child_id : child_ids)
mark_is_pruned(child_id, list_child);
// TODO: Store null map of non-target types for list children to mark list entry as null.
}
};
if (is_array_of_arrays) {
if (adj[adj[parent_node_sentinel][0]].empty())
return {cudf::detail::make_host_vector<bool>(0, stream), {}};
return {cudf::detail::make_host_vector<bool>(0, stream),
cudf::detail::make_host_vector<bool>(0, stream),
{}};
auto root_list_col_id =
is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0];
// mark root and row array col_id as not pruned.
Expand Down Expand Up @@ -647,8 +656,12 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
? adj[parent_node_sentinel][0]
: (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);

// For list children that are pruned mixed types, nullify the parent list row.
auto is_mixed_pruned = cudf::detail::make_host_vector<bool>(num_columns, stream);
std::fill_n(is_mixed_pruned.begin(), num_columns, false);
auto handle_mixed_types = [&column_categories,
&is_str_column_all_nulls,
&is_mixed_pruned,
&is_pruned,
&expected_types,
&is_enabled_mixed_types_as_string,
Expand Down Expand Up @@ -794,6 +807,14 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
"list child column insertion failed, duplicate column name in the parent");
ref.get().column_order.emplace_back(list_child_name);
auto this_ref = std::ref(ref.get().child_columns.at(list_child_name));
if (options.is_enabled_experimental()) {
for (auto const& child_id : child_ids) {
if (is_pruned[child_id]) {
// Record this child_id so that the parent list row can be nullified for mixed types.
is_mixed_pruned[child_id] = is_pruned[child_id];
}
}
}
// Mixed type handling
handle_mixed_types(child_ids);
if (child_ids.empty()) {
Expand Down Expand Up @@ -829,7 +850,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
[](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; });
cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, expected_types, stream);

return {is_pruned, columns};
return {is_pruned, is_mixed_pruned, columns};
}

void scatter_offsets(tree_meta_t const& tree,
Expand All @@ -839,6 +860,7 @@ void scatter_offsets(tree_meta_t const& tree,
device_span<size_type> sorted_col_ids, // Reuse this for parent_col_ids
tree_meta_t const& d_column_tree,
host_span<const bool> ignore_vals,
host_span<const bool> is_mixed_pruned,
hashmap_of_device_columns const& columns,
rmm::cuda_stream_view stream)
{
Expand All @@ -857,6 +879,8 @@ void scatter_offsets(tree_meta_t const& tree,

auto d_ignore_vals = cudf::detail::make_device_uvector_async(
ignore_vals, stream, cudf::get_current_device_resource_ref());
auto d_is_mixed_pruned = cudf::detail::make_device_uvector_async(
is_mixed_pruned, stream, cudf::get_current_device_resource_ref());
auto d_columns_data = cudf::detail::make_device_uvector_async(
columns_data, stream, cudf::get_current_device_resource_ref());

Expand Down Expand Up @@ -921,9 +945,31 @@ void scatter_offsets(tree_meta_t const& tree,
column_categories[col_ids[parent_node_id]] == NC_LIST and
(!d_ignore_vals[col_ids[parent_node_id]]);
});
// For children of a list that are in ignore_vals, find their parent node id and clear the
// corresponding bit in the parent's null mask, i.e. set mixed-type list rows to null.
auto const num_list_children = thrust::distance(
thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end);
thrust::for_each_n(
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator<size_type>(0),
num_list_children,
[node_ids = node_ids.begin(),
parent_node_ids = tree.parent_node_ids.begin(),
column_categories = d_column_tree.node_categories.begin(),
col_ids = col_ids.begin(),
row_offsets = row_offsets.begin(),
d_is_mixed_pruned = d_is_mixed_pruned.begin(),
d_ignore_vals = d_ignore_vals.begin(),
d_columns_data = d_columns_data.begin()] __device__(size_type i) {
auto const node_id = node_ids[i];
auto const parent_node_id = parent_node_ids[node_id];
if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return;
if (column_categories[col_ids[parent_node_id]] == NC_LIST and
d_is_mixed_pruned[col_ids[node_id]]) {
clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]);
}
});

auto const num_list_children =
list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
parent_col_ids.begin(),
parent_col_ids.begin() + num_list_children,
Expand Down
67 changes: 35 additions & 32 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -464,46 +464,49 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
column_names.emplace_back(
json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first);

// Note: json_col modified here, reuse the memory
// If the child is not present, the null mask is still set correctly, but offsets are zero and
// children are empty. Note: json_col is modified here to reuse its memory.
auto offsets_column = std::make_unique<column>(data_type{type_id::INT32},
num_rows + 1,
json_col.child_offsets.release(),
rmm::device_buffer{},
0);
// Create children column
auto child_schema_element =
json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
auto [child_column, names] =
json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
? std::pair<std::unique_ptr<column>,
// EMPTY type could not be used because gather throws an exception on EMPTY type.
std::vector<column_name_info>>{std::make_unique<column>(
data_type{type_id::INT8},
0,
rmm::device_buffer{},
rmm::device_buffer{},
0),
std::vector<column_name_info>{}}
: device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
d_input,
options,
prune_columns,
child_schema_element,
stream,
mr);
auto child_schema_element = get_list_child_schema();
auto [child_column, names] = [&]() {
if (json_col.child_columns.empty()) {
// EMPTY type could not be used because gather throws an exception on EMPTY type.
auto empty_col = make_empty_column(
child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), stream, mr);
auto children_metadata = std::vector<column_name_info>{
make_column_name_info(
child_schema_element.value_or(schema_element{data_type{type_id::INT8}}),
list_child_name)
.children};

return std::pair<std::unique_ptr<column>, std::vector<column_name_info>>{
std::move(empty_col), children_metadata};
}
return device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
d_input,
options,
prune_columns,
child_schema_element,
stream,
mr);
}();
column_names.back().children = names;
auto [result_bitmask, null_count] = make_validity(json_col);
auto ret_col = make_lists_column(num_rows,
std::move(offsets_column),
std::move(child_column),
0,
rmm::device_buffer{0, stream, mr},
stream,
mr);
// The null_mask is set after creating the list column in order to skip the
// purge_nonempty_nulls and null validation applied in the make_lists_column factory, which is
// not needed for JSON: a parent column cannot be null when its children are non-empty in JSON.
if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); }
auto ret_col = make_lists_column(
num_rows,
std::move(offsets_column),
std::move(child_column),
null_count,
null_count == 0 ? rmm::device_buffer{0, stream, mr} : std::move(result_bitmask),
stream,
mr);
// Since some rows in the child column may need to be nullified due to mixed types, we cannot
// skip the purge_nonempty_nulls call in the make_lists_column factory.
return {std::move(ret_col), std::move(column_names)};
}
default: CUDF_FAIL("Unsupported column type"); break;
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,18 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Create empty column of a given nested schema
*
* @param schema The schema of the column to create
* @param stream The CUDA stream to which kernels are dispatched
* @param mr resource with which to allocate
* @return The empty column
*/
std::unique_ptr<column> make_empty_column(schema_element const& schema,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
* @brief Create all null column of a given nested schema
*
Expand Down
Loading

0 comments on commit 14b4321

Please sign in to comment.