Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restructure JSON code to correctly reflect legacy/experimental status #13757

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -371,12 +371,12 @@ add_library(
src/io/csv/writer_impl.cu
src/io/functions.cpp
src/io/json/json_column.cu
src/io/json/json_gpu.cu
src/io/json/legacy/json_gpu.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
src/io/json/reader_impl.cu
src/io/json/experimental/byte_range_info.cu
src/io/json/experimental/read_json.cu
src/io/json/read_json.cu
src/io/json/byte_range_info.cu
src/io/json/legacy/reader_impl.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/cudf/io/detail/data_casting.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

#include <memory>

namespace cudf::io::json::experimental::detail {
namespace cudf::io::json::detail {

// Unicode code point escape sequence
static constexpr char UNICODE_SEQ = 0x7F;
Expand Down Expand Up @@ -428,4 +428,4 @@ std::unique_ptr<column> parse_data(str_tuple_it str_tuples,
return out_col;
}

} // namespace cudf::io::json::experimental::detail
} // namespace cudf::io::json::detail
2 changes: 1 addition & 1 deletion cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ namespace cudf::io::json::detail {
*
* @return cudf::table object that contains the array of cudf::column.
*/
table_with_metadata read_json(std::vector<std::unique_ptr<cudf::io::datasource>>& sources,
table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of scope for this PR, but a host_span<unique_ptr> feels off when it comes to ownership semantics. This pattern seems prevalent in cuIO though. Also I don't know if there's a nice idiomatic way to write this otherwise though since most implicit conversions won't quite work and there's limited interop between implicit conversions and template deduction.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, really sends a mixed message :D
Surely we can lose the unique_ptrs somewhere along the call chain...

json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,7 +21,7 @@
#include <rmm/exec_policy.hpp>
#include <thrust/find.h>

namespace cudf::io::detail::json::experimental {
namespace cudf::io::json::detail {

// Extract the first character position in the string.
size_type find_first_delimiter(device_span<char const> d_data,
Expand All @@ -33,4 +33,4 @@ size_type find_first_delimiter(device_span<char const> d_data,
return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1;
}

} // namespace cudf::io::detail::json::experimental
} // namespace cudf::io::json::detail
38 changes: 18 additions & 20 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@
#include <algorithm>
#include <cstdint>

namespace cudf::io::json {
namespace detail {
namespace cudf::io::json::detail {

// DEBUG prints
auto to_cat = [](auto v) -> std::string {
Expand Down Expand Up @@ -348,14 +347,14 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
cudf::io::parse_options_view options_view{};
options_view.quotechar = '\0'; // no quotes
options_view.keepquotes = true;
auto d_column_names = experimental::detail::parse_data(string_views.begin(),
num_strings,
data_type{type_id::STRING},
rmm::device_buffer{},
0,
options_view,
stream,
rmm::mr::get_current_device_resource());
auto d_column_names = parse_data(string_views.begin(),
num_strings,
data_type{type_id::STRING},
rmm::device_buffer{},
0,
options_view,
stream,
rmm::mr::get_current_device_resource());
auto to_host = [](auto const& col) {
if (col.is_empty()) return std::vector<std::string>{};
auto const scv = cudf::strings_column_view(col);
Expand Down Expand Up @@ -796,14 +795,14 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co

auto [result_bitmask, null_count] = make_validity(json_col);
// Convert strings to the inferred data type
auto col = experimental::detail::parse_data(string_spans_it,
col_size,
target_type,
std::move(result_bitmask),
null_count,
options.view(),
stream,
mr);
auto col = parse_data(string_spans_it,
col_size,
target_type,
std::move(result_bitmask),
null_count,
options.view(),
stream,
mr);

// Reset nullable if we do not have nulls
// This is to match the existing JSON reader's behaviour:
Expand Down Expand Up @@ -1044,5 +1043,4 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
}

} // namespace detail
} // namespace cudf::io::json
} // namespace cudf::io::json::detail
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,7 @@

using cudf::device_span;

namespace cudf {
namespace io {
namespace json {
namespace gpu {
using namespace ::cudf;
namespace cudf::io::json::detail::legacy {

namespace {
/**
Expand Down Expand Up @@ -515,7 +511,7 @@ __global__ void collect_keys_info_kernel(parse_options_view const options,
} // namespace

/**
* @copydoc cudf::io::json::gpu::convert_json_to_columns
* @copydoc cudf::io::json::detail::legacy::convert_json_to_columns
*/
void convert_json_to_columns(parse_options_view const& opts,
device_span<char const> const data,
Expand Down Expand Up @@ -547,7 +543,7 @@ void convert_json_to_columns(parse_options_view const& opts,
}

/**
* @copydoc cudf::io::gpu::detect_data_types
* @copydoc cudf::io::json::detail::legacy::detect_data_types
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the namespace was incorrect here AFAICT

*/

std::vector<cudf::io::column_type_histogram> detect_data_types(
Expand Down Expand Up @@ -592,7 +588,7 @@ std::vector<cudf::io::column_type_histogram> detect_data_types(
}

/**
* @copydoc cudf::io::json::gpu::gpu_collect_keys_info
* @copydoc cudf::io::json::detail::legacy::collect_keys_info
*/
void collect_keys_info(parse_options_view const& options,
device_span<char const> const data,
Expand All @@ -615,7 +611,4 @@ void collect_keys_info(parse_options_view const& options,
CUDF_CHECK_CUDA(stream.value());
}

} // namespace gpu
} // namespace json
} // namespace io
} // namespace cudf
} // namespace cudf::io::json::detail::legacy
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -31,10 +31,7 @@

using cudf::device_span;

namespace cudf {
namespace io {
namespace json {
namespace gpu {
namespace cudf::io::json::detail::legacy {

using col_map_type = concurrent_unordered_map<uint32_t, cudf::size_type>;
/**
Expand Down Expand Up @@ -100,7 +97,4 @@ void collect_keys_info(parse_options_view const& options,
thrust::optional<mutable_table_device_view> keys_info,
rmm::cuda_stream_view stream);

} // namespace gpu
} // namespace json
} // namespace io
} // namespace cudf
} // namespace cudf::io::json::detail::legacy
33 changes: 33 additions & 0 deletions cpp/src/io/json/legacy/read_json.hpp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need this new header?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we now call legacy::read_json from read_json.cu. It does make me wonder if we still need the other read_json.hpp.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checked, the other header is needed for tests; keeping the files as they are.

Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/mr/memory_resource.h>

#include <memory>
#include <vector>

namespace cudf::io::json::detail::legacy {

table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
json_reader_options const& reader_opts,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace cudf::io::json::detail::legacy
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

#include "json_gpu.hpp"

#include "experimental/read_json.hpp"

#include <hash/concurrent_unordered_map.cuh>

#include <io/comp/io_uncomp.hpp>
Expand Down Expand Up @@ -56,9 +54,8 @@

using cudf::host_span;

namespace cudf::io::json::detail {
namespace cudf::io::json::detail::legacy {

using col_map_type = cudf::io::json::gpu::col_map_type;
using col_map_ptr_type = std::unique_ptr<col_map_type, std::function<void(col_map_type*)>>;

/**
Expand Down Expand Up @@ -129,8 +126,7 @@ std::unique_ptr<table> create_json_keys_info_table(parse_options_view const& par
{
// Count keys
rmm::device_scalar<unsigned long long int> key_counter(0, stream);
cudf::io::json::gpu::collect_keys_info(
parse_opts, data, row_offsets, key_counter.data(), {}, stream);
collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {}, stream);

// Allocate columns to store hash value, length, and offset of each JSON object key in the input
auto const num_keys = key_counter.value(stream);
Expand All @@ -148,8 +144,7 @@ std::unique_ptr<table> create_json_keys_info_table(parse_options_view const& par
// Reset the key counter - now used for indexing
key_counter.set_value_to_zero_async(stream);
// Fill the allocated columns
cudf::io::json::gpu::collect_keys_info(
parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream);
collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream);
return info_table;
}

Expand Down Expand Up @@ -213,7 +208,7 @@ std::pair<std::vector<std::string>, col_map_ptr_type> get_json_object_keys_hashe
create_col_names_hash_map(sorted_info->get_column(2).view(), stream)};
}

std::vector<uint8_t> ingest_raw_input(std::vector<std::unique_ptr<datasource>> const& sources,
std::vector<uint8_t> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
compression_type compression,
size_t range_offset,
size_t range_size,
Expand Down Expand Up @@ -447,7 +442,7 @@ std::vector<data_type> get_data_types(json_reader_options const& reader_opts,
auto const num_columns = column_names.size();
auto const do_set_null_count = column_map->capacity() > 0;

auto const h_column_infos = cudf::io::json::gpu::detect_data_types(
auto const h_column_infos = detect_data_types(
parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream);

auto get_type_id = [&](auto const& cinfo) {
Expand Down Expand Up @@ -523,7 +518,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
num_columns, stream, rmm::mr::get_current_device_resource());

cudf::io::json::gpu::convert_json_to_columns(
convert_json_to_columns(
parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream);

stream.synchronize();
Expand Down Expand Up @@ -591,16 +586,11 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
*
* @return Table and its metadata
*/
table_with_metadata read_json(std::vector<std::unique_ptr<datasource>>& sources,
table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
json_reader_options const& reader_opts,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
if (not reader_opts.is_enabled_legacy()) {
return cudf::io::detail::json::experimental::read_json(sources, reader_opts, stream, mr);
}

CUDF_EXPECTS(not sources.empty(), "No sources were defined");
CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE,
"Multiple compressed inputs are not supported");
Expand Down Expand Up @@ -664,4 +654,4 @@ table_with_metadata read_json(std::vector<std::unique_ptr<datasource>>& sources,
mr);
}

} // namespace cudf::io::json::detail
} // namespace cudf::io::json::detail::legacy
16 changes: 8 additions & 8 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1993,14 +1993,14 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
auto [result_bitmask, null_count] = make_validity(json_col);

// Convert strings to the inferred data type
auto col = experimental::detail::parse_data(string_spans_it,
col_size,
target_type,
std::move(result_bitmask),
null_count,
parsing_options(options).view(),
stream,
mr);
auto col = parse_data(string_spans_it,
col_size,
target_type,
std::move(result_bitmask),
null_count,
parsing_options(options).view(),
stream,
mr);

// Reset nullable if we do not have nulls
// This is to match the existing JSON reader's behaviour:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "read_json.hpp"

#include <io/comp/io_uncomp.hpp>
#include <io/json/legacy/read_json.hpp>
#include <io/json/nested_json.hpp>

#include <cudf/detail/nvtx/ranges.hpp>
Expand All @@ -30,7 +31,7 @@

#include <numeric>

namespace cudf::io::detail::json::experimental {
namespace cudf::io::json::detail {

size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
size_t range_offset,
Expand All @@ -44,7 +45,7 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
});
}

rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>> const& sources,
rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
compression_type compression,
size_t range_offset,
size_t range_size,
Expand Down Expand Up @@ -197,6 +198,11 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();

if (reader_opts.is_enabled_legacy()) {
return cudf::io::json::detail::legacy::read_json(sources, reader_opts, stream, mr);
}

if (not should_load_whole_source(reader_opts)) {
CUDF_EXPECTS(reader_opts.is_enabled_lines(),
"Specifying a byte range is supported only for JSON Lines");
Expand All @@ -213,8 +219,8 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,

auto const buffer = get_record_range_raw_input(sources, reader_opts, stream);

return cudf::io::json::detail::device_parse_nested_json(buffer, reader_opts, stream, mr);
return device_parse_nested_json(buffer, reader_opts, stream, mr);
// For debug purposes, use host_parse_nested_json()
}

} // namespace cudf::io::detail::json::experimental
} // namespace cudf::io::json::detail
Loading