diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 3e136e947cf..6798557e14e 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -282,8 +282,8 @@ class json_reader_options { /** * @brief Whether to enable experimental features. * - * When set to true, new experimental features such new column tree construction - * will be enabled. + * When set to true, experimental features, such as the new column tree construction, + * utf-8 matching of field names will be enabled. * @return true if experimental features are enabled */ [[nodiscard]] bool is_enabled_experimental() const { return _experimental; } @@ -467,8 +467,8 @@ class json_reader_options { /** * @brief Set whether to enable experimental features. * - * When set to true, new experimental features such new column tree construction - * will be enabled. + * When set to true, experimental features, such as the new column tree construction, + * utf-8 matching of field names will be enabled. * * @param val Boolean value to enable/disable experimental features */ @@ -719,8 +719,8 @@ class json_reader_options_builder { /** * @brief Set whether to enable experimental features. * - * When set to true, new experimental features such new column tree construction - * will be enabled. + * When set to true, experimental features, such as the new column tree construction, + * utf-8 matching of field names will be enabled. * * @param val Boolean value to enable/disable experimental features * @return this for chaining diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 23e78762ed2..f82cf0d0b2f 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -60,11 +60,10 @@ namespace cudf::io::json::detail { */ rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, tree_meta_t const& d_tree, - device_span col_ids, + device_span col_ids, size_type const num_columns, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto [level2_nodes, level2_indices] = get_array_children_indices( row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); @@ -92,7 +91,6 @@ std::vector copy_strings_to_host_sync( device_span node_range_end, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); rmm::device_uvector string_offsets(num_strings, stream); rmm::device_uvector string_lengths(num_strings, stream); @@ -163,7 +161,7 @@ std::vector copy_strings_to_host_sync( rmm::device_uvector is_all_nulls_each_column(device_span input, tree_meta_t const& d_column_tree, tree_meta_t const& tree, - device_span col_ids, + device_span col_ids, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { @@ -195,7 +193,7 @@ rmm::device_uvector is_all_nulls_each_column(device_span return is_all_nulls; } -NodeIndexT get_row_array_parent_col_id(device_span col_ids, +NodeIndexT get_row_array_parent_col_id(device_span col_ids, bool is_enabled_lines, rmm::cuda_stream_view stream) { @@ -223,29 +221,30 @@ struct json_column_data { bitmask_type* validity; }; -std::pair, - std::unordered_map>> -build_tree(device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets( - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, tree_meta_t& d_column_tree, - host_span ignore_vals, - std::unordered_map>& columns, - rmm::cuda_stream_view stream); + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); /** * @brief Constructs `d_json_column` from node tree representation @@ -267,22 +266,20 @@ void scatter_offsets( * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + // make a copy + auto sorted_col_ids = cudf::detail::make_device_uvector_async( + col_ids, stream, cudf::get_current_device_resource_ref()); // sort by {col_id} on {node_ids} stable rmm::device_uvector node_ids(col_ids.size(), stream); @@ -318,7 +315,7 @@ void make_device_json_column(device_span input, cudf::detail::make_host_vector_sync(values_column_indices, stream); std::transform(unique_col_ids.begin(), unique_col_ids.end(), - column_names.begin(), + column_names.cbegin(), column_names.begin(), [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( auto col_id, auto name) mutable { @@ -335,17 +332,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto const [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); scatter_offsets(tree, col_ids, @@ -358,21 +355,19 @@ void make_device_json_column(device_span input, stream); } -std::pair, - std::unordered_map>> -build_tree(device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); auto column_categories = @@ -442,7 +437,7 @@ build_tree(device_json_column& root, }); // use hash map because we may skip field name's col_ids - std::unordered_map> columns; + hashmap_of_device_columns columns; // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity @@ -667,18 +662,16 @@ build_tree(device_json_column& root, return {ignore_vals, columns}; } -void scatter_offsets( - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t& d_column_tree, - host_span ignore_vals, - std::unordered_map>& columns, - rmm::cuda_stream_view stream) +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto const num_nodes = col_ids.size(); auto const num_columns = d_column_tree.node_categories.size(); // move columns data to device. @@ -841,19 +834,18 @@ std::map unified_schema(cudf::io::json_reader_optio options.get_dtypes()); } -std::pair, - std::unordered_map>> -build_tree2(device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Constructs `d_json_column` from node tree representation @@ -875,22 +867,20 @@ build_tree2(device_json_column& root, * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + // make a copy + auto sorted_col_ids = cudf::detail::make_device_uvector_async( + col_ids, stream, cudf::get_current_device_resource_ref()); // sort by {col_id} on {node_ids} stable rmm::device_uvector node_ids(col_ids.size(), stream); @@ -927,7 +917,7 @@ void make_device_json_column(device_span input, cudf::detail::make_host_vector_sync(values_column_indices, stream); std::transform(unique_col_ids.begin(), unique_col_ids.end(), - column_names.begin(), + column_names.cbegin(), column_names.begin(), [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( auto col_id, auto name) mutable { @@ -944,17 +934,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto [ignore_vals, columns] = build_tree2(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto const [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); if (ignore_vals.empty()) return; scatter_offsets(tree, col_ids, @@ -967,21 +957,19 @@ void make_device_json_column(device_span input, stream); } -std::pair, - std::unordered_map>> -build_tree2(device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); @@ -1069,7 +1057,8 @@ build_tree2(device_json_column& root, if (adj[parent_node_sentinel].empty()) return {}; // for empty file CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1"); - std::vector expected_types(num_columns, NUM_NODE_CLASSES); + auto expected_types = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); auto lookup_names = [&column_names](auto child_ids, auto name) { for (auto const& child_id : child_ids) { @@ -1389,9 +1378,9 @@ build_tree2(device_json_column& root, if (columns.count(i)) { columns.at(i).get().forced_as_string_column = true; } } } - std::transform(expected_types.begin(), - expected_types.end(), - column_categories.begin(), + std::transform(expected_types.cbegin(), + expected_types.cend(), + column_categories.cbegin(), expected_types.begin(), [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; }); cudaMemcpyAsync(d_column_tree.node_categories.begin(), diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 389a989b206..f1c79b15bb6 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -105,11 +105,11 @@ void print_tree(host_span input, * max row offsets of columns */ std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, +reduce_to_column_tree(tree_meta_t const& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream) @@ -313,7 +313,7 @@ std::pair, std::vector> device_json_co // Note: json_col modified here, moves this memory }; - auto get_child_schema = [schema](auto child_name) -> std::optional { + auto get_child_schema = [&schema](auto child_name) -> std::optional { if (schema.has_value()) { auto const result = schema.value().child_types.find(child_name); if (result != std::end(schema.value().child_types)) { return result->second; } @@ -321,7 +321,7 @@ std::pair, std::vector> device_json_co return {}; }; - auto get_list_child_schema = [schema]() -> std::optional { + auto get_list_child_schema = [&schema]() -> std::optional { if (schema.has_value()) { if (schema.value().child_types.size() > 0) return schema.value().child_types.begin()->second; } @@ -481,6 +481,16 @@ std::pair, std::vector> device_json_co } } +template +auto make_device_json_column_dispatch(bool experimental, Args&&... args) +{ + if (experimental) { + return experimental::make_device_json_column(std::forward(args)...); + } else { + return make_device_json_column(std::forward(args)...); + } +} + table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -538,27 +548,16 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - if (options.is_enabled_experimental()) { - experimental::make_device_json_column(d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); - } else { - make_device_json_column(d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); - } + make_device_json_column_dispatch(options.is_enabled_experimental(), + d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3821fa396e8..306597f09d1 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -315,11 +315,11 @@ get_array_children_indices(TreeDepthT row_array_children_level, * max row offsets of columns */ std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, +reduce_to_column_tree(tree_meta_t const& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); @@ -343,9 +343,9 @@ reduce_to_column_tree(tree_meta_t& tree, * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, @@ -353,10 +353,13 @@ void make_device_json_column(device_span input, rmm::device_async_resource_ref mr); namespace experimental { +/** + * @copydoc cudf::io::json::detail::make_device_json_column + */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options,