Skip to content

Commit

Permalink
remove host_parse_nested_json impl
Browse files Browse the repository at this point in the history
  • Loading branch information
vuule committed Jun 28, 2024
1 parent fb12d98 commit 7c43a94
Showing 1 changed file with 0 additions and 125 deletions.
125 changes: 0 additions & 125 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2244,131 +2244,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
return {};
}

/**
 * @brief Parses nested JSON into a cudf table, building the intermediate column tree on the host.
 *
 * The input is expected to describe an array of (possibly nested) objects. Each key of the
 * top-level objects becomes one output column, emitted in the order recorded in the root struct's
 * `column_order`.
 *
 * @param d_input Device span holding the JSON input characters
 * @param options Reader options; `is_enabled_lines()` selects JSON-lines handling and
 * `get_dtypes()` supplies the optional user-requested column types
 * @param stream CUDA stream forwarded to the helpers called here
 * @param mr Device memory resource forwarded to the helpers called here
 * @return The parsed table with its column-name metadata. A table with no columns is returned
 * when the document holds no value or holds an empty list.
 * @throws The CUDF_EXPECTS failure if the parsed data is not a list with exactly one struct child,
 * or if a name in the struct's column order has no matching child column
 */
table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,
                                           cudf::io::json_reader_options const& options,
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
{
  // Range of orchestrating/encapsulating function
  CUDF_FUNC_RANGE();

  // Host-side copy of the input, consumed by make_json_column below.
  // NOTE(review): the helper name says "async"; presumably make_json_column synchronizes `stream`
  // before reading h_input - confirm.
  auto const h_input = cudf::detail::make_std_vector_async(d_input, stream);

  auto const new_line_delimited_json = options.is_enabled_lines();

  // Get internal JSON column
  json_column root_column{};
  std::stack<tree_node> data_path{};

  constexpr uint32_t row_offset_zero            = 0;
  constexpr uint32_t token_begin_offset_zero    = 0;
  constexpr uint32_t token_end_offset_zero      = 0;
  constexpr uint32_t node_init_child_count_zero = 0;

  // Whether the tokenizer stage should keep quote characters for string values
  // If the tokenizer keeps the quote characters, they may be stripped during type casting
  constexpr bool include_quote_chars = true;

  // We initialize the very root node and root column, which represent the JSON document being
  // parsed. That root node is a list node and that root column is a list column. The column has the
  // root node as its only row. The values parsed from the JSON input will be treated as follows:
  // (1) For JSON lines: we expect to find a list of JSON values that all
  // will be inserted into this root list column. (2) For regular JSON: we expect to have only a
  // single value (list, struct, string, number, literal) that will be inserted into this root
  // column.
  root_column.append_row(
    row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1);

  // Push the root node onto the stack for the data path
  data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero});

  make_json_column(
    root_column, data_path, h_input, d_input, options, include_quote_chars, stream, mr);

  // Regular JSON reads the document's single value from the root's first child. If nothing was
  // inserted there is no child to dereference, so report an empty table (the same result the
  // JSON-lines path produces for an empty root list) instead of dereferencing begin() == end().
  if (not new_line_delimited_json and root_column.child_columns.empty()) {
    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
  }

  // data_root refers to the root column of the data represented by the given JSON string
  auto const& data_root =
    new_line_delimited_json ? root_column : root_column.child_columns.begin()->second;

  // Zero row entries
  if (data_root.type == json_col_t::ListColumn && data_root.child_columns.empty()) {
    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
  }

  // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
  auto constexpr single_child_col_count = 1;
  CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and
                 data_root.child_columns.size() == single_child_col_count and
                 data_root.child_columns.begin()->second.type == json_col_t::StructColumn,
               "Currently the nested JSON parser only supports an array of (nested) objects");

  // Slice off the root list column, which has only a single row that contains all the structs
  auto const& root_struct_col = data_root.child_columns.begin()->second;

  // Initialize meta data to be populated while recursing through the tree of columns
  std::vector<std::unique_ptr<column>> out_columns;
  std::vector<column_name_info> out_column_names;

  // Iterate over the struct's child columns and convert to cudf column
  size_type column_index = 0;
  for (auto const& col_name : root_struct_col.column_order) {
    // Every name in column_order must have a child column; check before dereferencing
    auto const child_it = root_struct_col.child_columns.find(col_name);
    CUDF_EXPECTS(child_it != root_struct_col.child_columns.end(),
                 "Column order refers to a column that is missing from the parsed JSON tree");
    auto const& json_col = child_it->second;

    // Insert this columns name into the schema
    out_column_names.emplace_back(col_name);

    // Resolve the user-requested type for this column. The visitors run synchronously inside
    // std::visit, so col_name is captured by reference to avoid a string copy per column, and
    // each map is searched once instead of twice.
    std::optional<schema_element> child_schema_element = std::visit(
      cudf::detail::visitor_overload{
        [column_index](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
          auto ret = (static_cast<std::size_t>(column_index) < user_dtypes.size())
                       ? std::optional<schema_element>{{user_dtypes[column_index]}}
                       : std::optional<schema_element>{};
#ifdef NJP_DEBUG_PRINT
          std::cout << "Column by index: #" << column_index << ", type id: "
                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
                    << "\n";
#endif
          return ret;
        },
        [&col_name](
          std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
          auto const dtype_it = user_dtypes.find(col_name);
          auto ret            = (dtype_it != std::end(user_dtypes))
                                  ? std::optional<schema_element>{{dtype_it->second}}
                                  : std::optional<schema_element>{};
#ifdef NJP_DEBUG_PRINT
          std::cout << "Column by flat name: '" << col_name << "', type id: "
                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
                    << "\n";
#endif
          return ret;
        },
        [&col_name](std::map<std::string, schema_element> const& user_dtypes)
          -> std::optional<schema_element> {
          auto const schema_it = user_dtypes.find(col_name);
          auto ret             = (schema_it != std::end(user_dtypes))
                                   ? schema_it->second
                                   : std::optional<schema_element>{};
#ifdef NJP_DEBUG_PRINT
          std::cout << "Column by nested name: #" << col_name << ", type id: "
                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
                    << "\n";
#endif
          return ret;
        }},
      options.get_dtypes());

    // Get this JSON column's cudf column and schema info
    auto [cudf_col, col_name_info] =
      json_column_to_cudf_column(json_col, d_input, options, child_schema_element, stream, mr);
    out_column_names.back().children = std::move(col_name_info);
    out_columns.emplace_back(std::move(cudf_col));

    column_index++;
  }

  // The name vector is not used after this point, so hand it to the metadata without a deep copy
  return table_with_metadata{std::make_unique<table>(std::move(out_columns)),
                             {std::move(out_column_names)}};
}

} // namespace detail
} // namespace cudf::io::json

Expand Down

0 comments on commit 7c43a94

Please sign in to comment.