diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1e85ae8b1b7..9a9b5d1e5ed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -343,6 +343,7 @@ add_library( src/io/json/reader_impl.cu src/io/json/experimental/byte_range_info.cu src/io/json/experimental/read_json.cpp + src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu src/io/orc/orc.cpp diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 6e71fd4b17d..9091745cd2e 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -93,6 +93,27 @@ __device__ __forceinline__ char get_escape_char(char escaped_char) } } +/** + * @brief Returns the escaped characters for a given character. + * + * @param escaped_char The character to escape. + * @return The escaped characters for a given character. + */ +__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char) +{ + switch (escaped_char) { + case '"': return {'\\', '"'}; + case '\\': return {'\\', '\\'}; + case '/': return {'\\', '/'}; + case '\b': return {'\\', 'b'}; + case '\f': return {'\\', 'f'}; + case '\n': return {'\\', 'n'}; + case '\r': return {'\\', 'r'}; + case '\t': return {'\\', 't'}; + // case 'u': return UNICODE_SEQ; + default: return {'\0', escaped_char}; + } +} /** * @brief Parses the hex value from the four hex digits of a unicode code point escape sequence \uXXXX. @@ -162,8 +183,10 @@ process_string(in_iterator_t in_begin, int32_t bytes = 0; const auto num_in_chars = thrust::distance(in_begin, in_end); // String values are indicated by keeping the quote character - bool const is_string_value = num_in_chars >= 2LL && (*in_begin == options.quotechar) && - (*thrust::prev(in_end) == options.quotechar); + bool const is_string_value = + num_in_chars >= 2LL && + (options.quotechar == '\0' || + (*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar)); // Copy literal/numeric value if (not is_string_value) { @@ -282,7 +305,7 @@ struct string_parse { __device__ void operator()(size_type idx) { - if (not bit_is_set(null_mask, idx)) { + if (null_mask != nullptr && not bit_is_set(null_mask, idx)) { if (!d_chars) d_offsets[idx] = 0; return; } @@ -294,7 +317,7 @@ struct string_parse { auto const is_null_literal = (!d_chars) && serialized_trie_contains(options.trie_na, {in_begin, static_cast<size_t>(num_in_chars)}); - if (is_null_literal) { + if (is_null_literal && null_mask != nullptr) { clear_bit(null_mask, idx); if (!d_chars) d_offsets[idx] = 0; return; } @@ -303,7 +326,7 @@ struct string_parse { char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; auto str_process_info = process_string(in_begin, in_end, d_buffer, options); if (str_process_info.result != data_casting_result::PARSING_SUCCESS) { - clear_bit(null_mask, idx); + if (null_mask != nullptr) clear_bit(null_mask, idx); if (!d_chars) d_offsets[idx] = 0; } else { if (!d_chars) d_offsets[idx] = str_process_info.bytes; diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 42717fe36df..7d2884880e7 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -21,10 +21,7 @@ #include -namespace cudf { -namespace io { -namespace detail { -namespace json { +namespace cudf::io::json::detail { /** * @brief Reads and returns the entire data set. @@ -42,7 +39,18 @@ table_with_metadata read_json( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf +/** + * @brief Write an entire dataset to JSON format. + * + * @param sink Output sink + * @param table The set of columns + * @param options Settings for controlling behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void write_json(data_sink* sink, + table_view const& table, + json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index ffc54618992..4d5fcb0ee9d 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -484,6 +484,351 @@ table_with_metadata read_json( json_reader_options options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** @} */ // end of group + +/** + * @addtogroup io_writers + * @{ + * @file + */ + +/** + * @brief Builder to build options for `write_json()`. + */ +class json_writer_options_builder; + +/** + * @brief Settings to use for `write_json()`. + */ +class json_writer_options { + // Specify the sink to use for writer output + sink_info _sink; + // Set of columns to output + table_view _table; + // string to use for null entries + std::string _na_rep = ""; + // Indicates whether to output nulls as 'null' or exclude the field + bool _include_nulls = false; + // Indicates whether to use JSON lines for records format + bool _lines = false; + // maximum number of rows to write in each chunk (limits memory use) + size_type _rows_per_chunk = std::numeric_limits<size_type>::max(); + // string to use for values != 0 in INT8 types (default 'true') + std::string _true_value = std::string{"true"}; + // string to use for values == 0 in INT8 types (default 'false') + std::string _false_value = std::string{"false"}; + // Names of all columns; if empty, writer will generate column names + std::optional<table_metadata> _metadata; // Optional column names + + /** + * @brief Constructor from sink and table. + * + * @param sink The sink used for writer output + * @param table Table to be written to output + */ + explicit json_writer_options(sink_info const& sink, table_view const& table) + : _sink(sink), _table(table), _rows_per_chunk(table.num_rows()) + { + } + + friend json_writer_options_builder; + + public: + /** + * @brief Default constructor. + * + * This has been added since Cython requires a default constructor to create objects on stack. + */ + explicit json_writer_options() = default; + + /** + * @brief Create builder to create `json_writer_options`. + * + * @param sink The sink used for writer output + * @param table Table to be written to output + * + * @return Builder to build json_writer_options + */ + static json_writer_options_builder builder(sink_info const& sink, table_view const& table); + + /** + * @brief Returns sink used for writer output.
+ * + * @return sink used for writer output + */ + [[nodiscard]] sink_info const& get_sink() const { return _sink; } + + /** + * @brief Returns table that would be written to output. + * + * @return Table that would be written to output + */ + [[nodiscard]] table_view const& get_table() const { return _table; } + + /** + * @brief Returns metadata information. + * + * @return Metadata information + */ + [[nodiscard]] std::optional<table_metadata> const& get_metadata() const { return _metadata; } + + /** + * @brief Returns string used for null entries. + * + * @return string used for null entries + */ + [[nodiscard]] std::string const& get_na_rep() const { return _na_rep; } + + /** + * @brief Whether to output nulls as 'null'. + * + * @return `true` if nulls are output as 'null' + */ + [[nodiscard]] bool is_enabled_include_nulls() const { return _include_nulls; } + + /** + * @brief Whether to use JSON lines for records format. + * + * @return `true` if JSON lines are used for records format + */ + [[nodiscard]] bool is_enabled_lines() const { return _lines; } + + /** + * @brief Returns maximum number of rows to process for each file write. + * + * @return Maximum number of rows to process for each file write + */ + [[nodiscard]] size_type get_rows_per_chunk() const { return _rows_per_chunk; } + + /** + * @brief Returns string used for values != 0 in INT8 types. + * + * @return string used for values != 0 in INT8 types + */ + [[nodiscard]] std::string const& get_true_value() const { return _true_value; } + + /** + * @brief Returns string used for values == 0 in INT8 types. + * + * @return string used for values == 0 in INT8 types + */ + [[nodiscard]] std::string const& get_false_value() const { return _false_value; } + + // Setter + + /** + * @brief Sets table to be written to output. + * + * @param tbl Table for the output + */ + void set_table(table_view tbl) { _table = tbl; } + + /** + * @brief Sets metadata. + * + * @param metadata Associated metadata + */ + void set_metadata(table_metadata metadata) { _metadata = std::move(metadata); } + + /** + * @brief Sets string used for null entries. + * + * @param val String to represent null value + */ + void set_na_rep(std::string val) { _na_rep = std::move(val); } + + /** + * @brief Enables/Disables output of nulls as 'null'. + * + * @param val Boolean value to enable/disable + */ + void enable_include_nulls(bool val) { _include_nulls = val; } + + /** + * @brief Enables/Disables JSON lines for records format. + * + * @param val Boolean value to enable/disable JSON lines + */ + void enable_lines(bool val) { _lines = val; } + + /** + * @brief Sets maximum number of rows to process for each file write. + * + * @param val Number of rows per chunk + */ + void set_rows_per_chunk(size_type val) { _rows_per_chunk = val; } + + /** + * @brief Sets string used for values != 0 in INT8 types. + * + * @param val String to represent values != 0 in INT8 types + */ + void set_true_value(std::string val) { _true_value = std::move(val); } + + /** + * @brief Sets string used for values == 0 in INT8 types. + * + * @param val String to represent values == 0 in INT8 types + */ + void set_false_value(std::string val) { _false_value = std::move(val); } +}; + +/** + * @brief Builder to build options for `write_json()` + */ +class json_writer_options_builder { + json_writer_options options; ///< Options to be built. + + public: + /** + * @brief Default constructor. + * + * This has been added since Cython requires a default constructor to create objects on stack.
+ */ + explicit json_writer_options_builder() = default; + + /** + * @brief Constructor from sink and table. + * + * @param sink The sink used for writer output + * @param table Table to be written to output + */ + explicit json_writer_options_builder(sink_info const& sink, table_view const& table) + : options{sink, table} + { + } + + /** + * @brief Sets table to be written to output. + * + * @param tbl Table for the output + * @return this for chaining + */ + json_writer_options_builder& table(table_view tbl) + { + options._table = tbl; + return *this; + } + + /** + * @brief Sets optional metadata (with column names). + * + * @param metadata Metadata (with column names) + * @return this for chaining + */ + json_writer_options_builder& metadata(table_metadata metadata) + { + options._metadata = std::move(metadata); + return *this; + } + + /** + * @brief Sets string used for null entries. + * + * @param val String to represent null value + * @return this for chaining + */ + json_writer_options_builder& na_rep(std::string val) + { + options._na_rep = std::move(val); + return *this; + }; + + /** + * @brief Enables/Disables output of nulls as 'null'. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + json_writer_options_builder& include_nulls(bool val) + { + options._include_nulls = val; + return *this; + } + + /** + * @brief Enables/Disables JSON lines for records format. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + json_writer_options_builder& lines(bool val) + { + options._lines = val; + return *this; + } + + /** + * @brief Sets maximum number of rows to process for each file write. + * + * @param val Number of rows per chunk + * @return this for chaining + */ + json_writer_options_builder& rows_per_chunk(int val) + { + options._rows_per_chunk = val; + return *this; + } + + /** + * @brief Sets string used for values != 0 in INT8 types. + * + * @param val String to represent values != 0 in INT8 types + * @return this for chaining + */ + json_writer_options_builder& true_value(std::string val) + { + options._true_value = std::move(val); + return *this; + } + + /** + * @brief Sets string used for values == 0 in INT8 types. + * + * @param val String to represent values == 0 in INT8 types + * @return this for chaining + */ + json_writer_options_builder& false_value(std::string val) + { + options._false_value = std::move(val); + return *this; + } + + /** + * @brief Move `json_writer_options` member once it's built. + */ + operator json_writer_options&&() { return std::move(options); } + + /** + * @brief Move `json_writer_options` member once it's built. + * + * This has been added since Cython does not support overloading of conversion operators. + * + * @return Built `json_writer_options` object's r-value reference + */ + json_writer_options&& build() { return std::move(options); } +}; + +/** + * @brief Writes a set of columns to JSON format.
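+ * + * Writes the table as a JSON array of records, or as newline-delimited JSON records when JSON lines output is enabled. Column names are taken from the metadata in `options` when provided; otherwise the writer generates numeric column names ("0", "1", ...).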
+ * + * The following code snippet demonstrates how to write columns to a file: + * @code + * auto destination = cudf::io::sink_info("dataset.json"); + * auto options = cudf::io::json_writer_options::builder(destination, table->view()) + * .na_rep(na) + * .lines(lines) + * .rows_per_chunk(rows_per_chunk); + * + * cudf::io::write_json(options); + * @endcode + * + * @param options Settings for controlling writing behavior + * @param mr Device memory resource to use for device memory allocation + */ +void write_json(json_writer_options const& options, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 2847912a2e6..b8a4d8a8388 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -83,6 +83,13 @@ json_reader_options_builder json_reader_options::builder(source_info const& src) return json_reader_options_builder(src); } +// Returns builder for json_writer_options +json_writer_options_builder json_writer_options::builder(sink_info const& sink, + table_view const& table) +{ + return json_writer_options_builder{sink, table}; +} + // Returns builder for parquet_reader_options parquet_reader_options_builder parquet_reader_options::builder(source_info const& src) { @@ -202,7 +209,20 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - return detail::json::read_json(datasources, options, cudf::get_default_stream(), mr); + return json::detail::read_json(datasources, options, cudf::get_default_stream(), mr); +} + +void write_json(json_writer_options const& options, rmm::mr::device_memory_resource* mr) +{ + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); + + return json::detail::write_json( // + sinks[0].get(), + options.get_table(), + options, + cudf::get_default_stream(), + mr); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index be911eca193..16273b35a11 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -342,8 +342,18 @@ std::vector copy_strings_to_host(device_span input, data + thrust::get<0>(offsets), static_cast<size_type>(thrust::get<1>(offsets) - thrust::get<0>(offsets))); }); - auto d_column_names = cudf::make_strings_column(string_views, stream); - auto to_host = [](auto const& col) { + + cudf::io::parse_options_view options_view{}; + options_view.quotechar = '\0'; // no quotes + options_view.keepquotes = true; + auto d_column_names = experimental::detail::parse_data(string_views.begin(), + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{0, stream}, + options_view, + stream, + rmm::mr::get_current_device_resource()); + auto to_host = [](auto const& col) { if (col.is_empty()) return std::vector<std::string>{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_std_vector_sync( diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 0ff1b1fa340..6e1089796de 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -57,13 +57,7 @@ using cudf::host_span; -namespace cudf { -namespace io { -namespace detail { -namespace json { - -using namespace cudf::io; -using namespace cudf::io::json; +namespace 
cudf::io::json::detail { using col_map_type = cudf::io::json::gpu::col_map_type; using col_map_ptr_type = std::unique_ptr>; @@ -503,7 +497,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, const auto num_records = rec_starts.size(); // alloc output buffers. - std::vector out_buffers; + std::vector out_buffers; for (size_t col = 0; col < num_columns; ++col) { out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr); } @@ -594,7 +588,7 @@ table_with_metadata read_json(std::vector>& sources, { CUDF_FUNC_RANGE(); if (not reader_opts.is_enabled_legacy()) { - return experimental::read_json(sources, reader_opts, stream, mr); + return cudf::io::detail::json::experimental::read_json(sources, reader_opts, stream, mr); } CUDF_EXPECTS(not sources.empty(), "No sources were defined"); @@ -659,7 +653,4 @@ table_with_metadata read_json(std::vector>& sources, mr); } -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu new file mode 100644 index 00000000000..49d035e6cb9 --- /dev/null +++ b/cpp/src/io/json/write_json.cu @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file write_json.cu + * @brief cuDF-IO JSON writer implementation + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::io::json::detail { + +std::unique_ptr make_column_names_column(host_span column_names, + size_type num_columns, + rmm::cuda_stream_view stream); +namespace { + +/** + * @brief Functor to modify a string column for JSON format. + * + * This will convert escape characters and wrap quotes around strings. + */ +struct escape_strings_fn { + column_device_view const d_column; + offset_type* d_offsets{}; + char* d_chars{}; + + __device__ void write_char(char_utf8 chr, char*& d_buffer, offset_type& bytes) + { + if (d_buffer) + d_buffer += cudf::strings::detail::from_char_utf8(chr, d_buffer); + else + bytes += cudf::strings::detail::bytes_in_char_utf8(chr); + } + + __device__ void operator()(size_type idx) + { + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + + auto const d_str = d_column.element(idx); + + // entire string must be double-quoted. + constexpr char_utf8 const quote = '\"'; // wrap quotes + bool constexpr quote_row = true; + + char* d_buffer = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + + if (quote_row) write_char(quote, d_buffer, bytes); + for (auto chr : d_str) { + auto escaped_chars = cudf::io::json::experimental::detail::get_escaped_char(chr); + if (escaped_chars.first == '\0') { + write_char(escaped_chars.second, d_buffer, bytes); + } else { + write_char(escaped_chars.first, d_buffer, bytes); + write_char(escaped_chars.second, d_buffer, bytes); + } + } + if (quote_row) write_char(quote, d_buffer, bytes); + + if (!d_chars) d_offsets[idx] = bytes; + } + + std::unique_ptr get_escaped_strings(column_view const& column_v, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto children = + cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); + + return make_strings_column(column_v.size(), + std::move(children.first), + std::move(children.second), + column_v.null_count(), + cudf::detail::copy_bitmask(column_v, stream, mr)); + } +}; + +// Struct concatenation. +struct concat_structs_base { + table_device_view const d_table; + column_device_view const d_column_names; + string_view const row_prefix; //{ + string_view const row_suffix; //} or }\n for json-lines + string_view const d_col_separator; //: + string_view const d_val_separator; //, + string_scalar_device_view const d_narep; // null + bool include_nulls = false; + offset_type* d_offsets{}; + char* d_chars{}; + + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. + * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void operator()(size_type idx) + { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + + if (d_buffer) d_buffer = strings::detail::copy_string(d_buffer, row_prefix); + bytes += row_prefix.size_bytes(); + + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { + auto const col_idx = thrust::distance(d_table.begin(), itr); + auto const d_column = *itr; + bool const is_null_element = d_column.is_null(idx); + bool const include_element = (include_nulls == true || !is_null_element); + + if (write_separator && include_element) { + if (d_buffer) d_buffer = strings::detail::copy_string(d_buffer, d_val_separator); + bytes += d_val_separator.size_bytes(); + write_separator = false; + } + + // column_name: + if (include_element && !d_column_names.is_null(col_idx)) { + auto const d_name = d_column_names.element(col_idx); + if (d_buffer) d_buffer = strings::detail::copy_string(d_buffer, d_name); + bytes += d_name.size_bytes(); + if (d_buffer) d_buffer = strings::detail::copy_string(d_buffer, d_col_separator); + bytes += d_col_separator.size_bytes(); + } + + // write out column's row data (or narep if the row is null) + if (include_element) { + auto const d_str = is_null_element ? 
d_narep.value() : d_column.element<string_view>(idx); + if (d_buffer) d_buffer = strings::detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + } + + write_separator = write_separator || include_element; + } + if (d_buffer) { + d_buffer = strings::detail::copy_string(d_buffer, row_suffix); + } else { + d_offsets[idx] = bytes + row_suffix.size_bytes(); + } + } +}; + +/** + * @brief Concatenate the strings from each row of the given table as structs in JSON string + * + * Each row will be a struct with field names as column names and values from each column in the table. + * + * @param strings_columns Table of strings columns + * @param column_names Column of names for each column in the table + * @param row_prefix Prepend this string to each row + * @param row_suffix Append this string to each row + * @param column_name_separator Separator between column name and value + * @param value_separator Separator between values + * @param narep Null-string replacement + * @param include_nulls Include null string entries in the output + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation. + * @return New strings column of JSON structs in each row + */ +std::unique_ptr<column> struct_to_strings(table_view const& strings_columns, + column_view const& column_names, + string_view const row_prefix, + string_view const row_suffix, + string_view const column_name_separator, + string_view const value_separator, + string_scalar const& narep, + bool include_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(column_names.type().id() == type_id::STRING, "Column names must be of type string"); + auto const num_columns = strings_columns.num_columns(); + CUDF_EXPECTS(num_columns == column_names.size(), + "Number of column names should be equal to number of columns in the table"); + auto const strings_count = strings_columns.num_rows(); + if (strings_count == 0) // empty begets empty + return make_empty_column(type_id::STRING); + // check all columns are of type string + CUDF_EXPECTS(std::all_of(strings_columns.begin(), + strings_columns.end(), + [](auto const& c) { return c.type().id() == type_id::STRING; }), + "All columns must be of type string"); + + // Create device views from the strings columns. + auto d_table = table_device_view::create(strings_columns, stream); + auto d_column_names = column_device_view::create(column_names, stream); + auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep)); + concat_structs_base fn{*d_table, + *d_column_names, + row_prefix, + row_suffix, + column_name_separator, + value_separator, + d_narep, + include_nulls}; + auto children = strings::detail::make_strings_children(fn, strings_count, stream, mr); + + // create resulting null mask + auto [null_mask, null_count] = cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + [d_table = *d_table, d_narep] __device__(size_type idx) { + if (d_narep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); + }, + stream, + mr); + + return make_strings_column(strings_count, + std::move(children.first), + std::move(children.second), + null_count, + std::move(null_mask)); +} + +/** + * @brief Functor to convert a column to string representation for JSON format.
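+ * + * Numeric, boolean, fixed-point, and timestamp columns are converted with the corresponding `cudf::strings::detail::from_*` converters, durations reuse the CSV writer's duration formatting, string columns are escaped and quoted, and LIST and STRUCT columns are converted recursively before their rows are assembled by `struct_to_strings`.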
+ */ +struct column_to_strings_fn { + /** + * @brief Returns true if the specified type is not supported by the JSON writer. + */ + template + constexpr static bool is_not_handled() + { + // Note: the case (not std::is_same_v) is already covered by is_integral) + return not((std::is_same_v) || + (std::is_integral_v) || (std::is_floating_point_v) || + (cudf::is_fixed_point()) || (cudf::is_timestamp()) || + (cudf::is_duration())); + } + + explicit column_to_strings_fn(json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : options_(options), stream_(stream), mr_(mr), narep(options.get_na_rep()) + { + } + + // unsupported type of column: + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const&) const + { + CUDF_FAIL("Unsupported column type."); + } + + // Note: `null` replacement with `na_rep` deferred to `concatenate()` + // instead of column-wise; might be faster. + + // bools: + template + std::enable_if_t, std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_booleans( + column, options_.get_true_value(), options_.get_false_value(), stream_, mr_); + } + + // strings: + template + std::enable_if_t, std::unique_ptr> + operator()(column_view const& column_v) const + { + auto d_column = column_device_view::create(column_v, stream_); + return escape_strings_fn{*d_column}.get_escaped_strings(column_v, stream_, mr_); + } + + // ints: + template + std::enable_if_t && !std::is_same_v, + std::unique_ptr> + operator()(column_view const& column) const + { + return cudf::strings::detail::from_integers(column, stream_, mr_); + } + + // floats: + template + std::enable_if_t, std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_floats(column, stream_, mr_); + } + + // fixed point: + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_fixed_point(column, stream_, mr_); + } + + // timestamps: + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + std::string format = [&]() { + if (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%SZ"}; + } else if (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; + } else if (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; + } else if (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; + } else { + return std::string{"%Y-%m-%d"}; + } + }(); + + // Since format uses ":", we need to add quotes to the format + format = "\"" + format + "\""; + + return cudf::strings::detail::from_timestamps( + column, + format, + strings_column_view(column_view{data_type{type_id::STRING}, 0, nullptr}), + stream_, + mr_); + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + auto duration_string = cudf::io::detail::csv::pandas_format_durations(column, stream_, mr_); + auto quotes = make_column_from_scalar(string_scalar{"\""}, column.size(), stream_, mr_); + return cudf::strings::detail::concatenate( + table_view{{quotes->view(), duration_string->view(), quotes->view()}}, + string_scalar(""), + string_scalar("", false), + strings::separator_on_nulls::YES, + stream_, + mr_); + } + + // lists: + template + std::enable_if_t, std::unique_ptr> + operator()(column_view const& column, host_span children_names) const + { + auto child_view = 
lists_column_view(column).get_sliced_child(stream_); + auto constexpr child_index = lists_column_view::child_column_index; + auto list_string = [&]() { + auto child_string = [&]() { + if (child_view.type().id() == type_id::STRUCT) { + return (*this).template operator()( + child_view, + children_names.size() > child_index ? children_names[child_index].children + : std::vector{}); + } else if (child_view.type().id() == type_id::LIST) { + return (*this).template operator()( + child_view, + children_names.size() > child_index ? children_names[child_index].children + : std::vector{}); + } else { + return cudf::type_dispatcher(child_view.type(), *this, child_view); + } + }(); + auto const list_child_string = + column_view(column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + {lists_column_view(column).offsets(), child_string->view()}); + return strings::detail::join_list_elements(lists_column_view(list_child_string), + string_scalar{","}, + narep, + strings::separator_on_nulls::YES, + strings::output_if_empty_list::EMPTY_STRING, + stream_, + mr_); + }(); + // create column with "[", "]" to wrap around list string + auto prepend = make_column_from_scalar(string_scalar{"["}, column.size(), stream_, mr_); + auto append = make_column_from_scalar(string_scalar{"]"}, column.size(), stream_, mr_); + return cudf::strings::detail::concatenate( + table_view{{prepend->view(), list_string->view(), append->view()}}, + string_scalar(""), + string_scalar("", false), + strings::separator_on_nulls::YES, + stream_, + mr_); + } + + // structs: + template + std::enable_if_t, std::unique_ptr> + operator()(column_view const& column, host_span children_names) const + { + auto col_string = operator()(column.child_begin(), column.child_end(), children_names); + col_string->set_null_mask(cudf::detail::copy_bitmask(column, stream_, mr_), + column.null_count()); + return col_string; + } + + // Table: + template + std::unique_ptr operator()(column_iterator column_begin, + column_iterator column_end, + host_span children_names) const + { + auto const num_columns = std::distance(column_begin, column_end); + auto column_names = make_column_names_column(children_names, num_columns, stream_); + auto column_names_view = column_names->view(); + std::vector> str_column_vec; + + // populate vector of string-converted columns: + // + auto i_col_begin = + thrust::make_zip_iterator(thrust::counting_iterator(0), column_begin); + std::transform(i_col_begin, + i_col_begin + num_columns, + std::back_inserter(str_column_vec), + [this, &children_names](auto const& i_current_col) { + auto const i = thrust::get<0>(i_current_col); + auto const& current_col = thrust::get<1>(i_current_col); + // Struct needs children's column names + if (current_col.type().id() == type_id::STRUCT) { + return (*this).template operator()( + current_col, + children_names.size() > i ? children_names[i].children + : std::vector{}); + } else if (current_col.type().id() == type_id::LIST) { + return (*this).template operator()( + current_col, + children_names.size() > i ? 
children_names[i].children + : std::vector{}); + } else { + return cudf::type_dispatcher(current_col.type(), *this, current_col); + } + }); + + // create string table view from str_column_vec: + // + auto str_table_ptr = std::make_unique(std::move(str_column_vec)); + auto str_table_view = str_table_ptr->view(); + + // concatenate columns in each row into one big string column + // (using null representation and delimiter): + // + return struct_to_strings(str_table_view, + column_names_view, + row_begin_wrap.value(stream_), + row_end_wrap.value(stream_), + column_seperator.value(stream_), + value_seperator.value(stream_), + narep, + options_.is_enabled_include_nulls(), + stream_, + rmm::mr::get_current_device_resource()); + } + + private: + json_writer_options const& options_; + rmm::cuda_stream_view stream_; + rmm::mr::device_memory_resource* mr_; + string_scalar const column_seperator{":"}; + string_scalar const value_seperator{","}; + string_scalar const row_begin_wrap{"{"}; + string_scalar const row_end_wrap{"}"}; + string_scalar const narep; +}; + +} // namespace + +std::unique_ptr make_strings_column_from_host(host_span host_strings, + rmm::cuda_stream_view stream) +{ + std::string const host_chars = + std::accumulate(host_strings.begin(), host_strings.end(), std::string("")); + auto d_chars = cudf::detail::make_device_uvector_async(host_chars, stream); + std::vector offsets(host_strings.size() + 1, 0); + std::transform_inclusive_scan(host_strings.begin(), + host_strings.end(), + offsets.begin() + 1, + std::plus{}, + [](auto& str) { return str.size(); }); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream); + return cudf::make_strings_column( + host_strings.size(), std::move(d_offsets), std::move(d_chars), {}, 0); +} + +std::unique_ptr make_column_names_column(host_span column_names, + size_type num_columns, + rmm::cuda_stream_view stream) +{ + std::vector unescaped_column_names; + if (column_names.empty()) { + std::generate_n(std::back_inserter(unescaped_column_names), num_columns, [v = 0]() mutable { + return std::to_string(v++); + }); + } else { + std::transform(column_names.begin(), + column_names.end(), + std::back_inserter(unescaped_column_names), + [](column_name_info const& name_info) { return name_info.name; }); + } + auto unescaped_string_col = make_strings_column_from_host(unescaped_column_names, stream); + auto d_column = column_device_view::create(*unescaped_string_col, stream); + return escape_strings_fn{*d_column}.get_escaped_strings( + *unescaped_string_col, stream, rmm::mr::get_current_device_resource()); +} + +void write_chunked(data_sink* out_sink, + strings_column_view const& str_column_view, + std::string const& line_terminator, + json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); + + string_scalar d_line_terminator{line_terminator}; + auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, + d_line_terminator, + string_scalar("", false), + stream, + rmm::mr::get_current_device_resource()); + strings_column_view strings_column{p_str_col_w_nl->view()}; + + auto total_num_bytes = strings_column.chars_size(); + char const* ptr_all_bytes = strings_column.chars_begin(); + + if (out_sink->is_device_write_preferred(total_num_bytes)) { + // Direct write from device memory + out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); + } else { + // copy the bytes to host to write them out + 
auto const h_bytes = cudf::detail::make_host_vector_sync( + device_span(ptr_all_bytes, total_num_bytes), stream); + + out_sink->host_write(h_bytes.data(), total_num_bytes); + } + + // Needs newline at the end, to separate from next chunk + if (options.is_enabled_lines()) { + if (out_sink->is_device_write_preferred(d_line_terminator.size())) { + out_sink->device_write(d_line_terminator.data(), d_line_terminator.size(), stream); + } else { + out_sink->host_write(line_terminator.data(), line_terminator.size()); + } + } +} + +void write_json(data_sink* out_sink, + table_view const& table, + json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector user_column_names = [&]() { + auto const& metadata = options.get_metadata(); + if (metadata.has_value() and not metadata->schema_info.empty()) { + return metadata->schema_info; + } else { + std::vector names; + // generate strings 0 to table.num_columns() + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(table.num_columns()), + std::back_inserter(names), + [](auto i) { return column_name_info{std::to_string(i)}; }); + return names; + } + }(); + auto const line_terminator = std::string(options.is_enabled_lines() ? "\n" : ","); + string_scalar d_line_terminator{line_terminator}; + + // write header: required for non-record oriented output + // header varies depending on orient. + // write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); + // TODO This should go into the write_chunked_begin function + std::string const list_braces{"[]"}; + string_scalar d_list_braces{list_braces}; + if (!options.is_enabled_lines()) { + if (out_sink->is_device_write_preferred(1)) { + out_sink->device_write(d_list_braces.data(), 1, stream); + } else { + out_sink->host_write(list_braces.data(), 1); + } + } + + if (table.num_rows() > 0) { + auto n_rows_per_chunk = options.get_rows_per_chunk(); + + // This outputs the JSON in row chunks to save memory. + // Maybe we can use the total_rows*count calculation and a memory threshold + // instead of an arbitrary chunk count. + // The entire JSON chunk must fit in CPU memory before writing it out. 
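+ // For example, a requested rows_per_chunk of 100 is rounded up to 104 below so that each chunk covers a multiple of 8 rows.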
+ // + if (n_rows_per_chunk % 8) // must be divisible by 8 + n_rows_per_chunk += 8 - (n_rows_per_chunk % 8); + + CUDF_EXPECTS(n_rows_per_chunk >= 8, "write_json: invalid chunk_rows; must be at least 8"); + + auto num_rows = table.num_rows(); + std::vector vector_views; + + if (num_rows <= n_rows_per_chunk) { + vector_views.push_back(table); + } else { + auto const n_chunks = num_rows / n_rows_per_chunk; + std::vector splits(n_chunks); + thrust::tabulate(splits.begin(), splits.end(), [n_rows_per_chunk](auto idx) { + return (idx + 1) * n_rows_per_chunk; + }); + + // split table_view into chunks: + vector_views = cudf::detail::split(table, splits, stream); + } + + // convert each chunk to JSON: + column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()}; + + for (auto&& sub_view : vector_views) { + // Skip if the table has no rows + if (sub_view.num_rows() == 0) continue; + std::vector> str_column_vec; + + // struct converter for the table + auto str_concat_col = converter(sub_view.begin(), sub_view.end(), user_column_names); + + write_chunked(out_sink, str_concat_col->view(), line_terminator, options, stream, mr); + } + } else { + if (options.is_enabled_lines()) { + if (out_sink->is_device_write_preferred(1)) { + out_sink->device_write(d_line_terminator.data(), d_line_terminator.size(), stream); + } else { + out_sink->host_write(line_terminator.data(), line_terminator.size()); + } + } + } + // TODO write_chunked_end(out_sink, options, stream, mr); + if (!options.is_enabled_lines()) { + if (out_sink->is_device_write_preferred(1)) { + out_sink->device_write(d_list_braces.data() + 1, 1, stream); + } else { + out_sink->host_write(list_braces.data() + 1, 1); + } + } +} + +} // namespace cudf::io::json::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 575bd5f5647..053acafdd3d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -225,6 +225,7 @@ ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp io/parquet_chunked_reader_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp) +ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index a2f677986e8..613d66332bf 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -749,7 +749,7 @@ TEST_P(JsonReaderDualTest, JsonLinesObjects) EXPECT_EQ(result.tbl->num_rows(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result.metadata.schema_info[0].name, "co\\\"l1"); + EXPECT_EQ(result.metadata.schema_info[0].name, is_legacy_test(test_opt) ? "co\\\"l1" : "co\"l1"); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); EXPECT_EQ(result.metadata.schema_info[1].name, "col2"); diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp new file mode 100644 index 00000000000..d129ed306e4 --- /dev/null +++ b/cpp/tests/io/json_writer.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +struct JsonWriterTest : public cudf::test::BaseFixture { +}; + +TEST_F(JsonWriterTest, EmptyInput) +{ + cudf::test::strings_column_wrapper col1; + cudf::test::strings_column_wrapper col2; + cudf::test::fixed_width_column_wrapper col3; + cudf::test::fixed_width_column_wrapper col4; + cudf::test::fixed_width_column_wrapper col5; + cudf::table_view tbl_view{{col1, col2, col3, col4}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + std::string const expected = R"([])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + std::string const expected_lines = "\n"; + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); + + // Empty table - JSON Lines + cudf::table_view tbl_view2{}; + out_options.set_table(tbl_view2); + out_buffer.clear(); + cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); +} + +TEST_F(JsonWriterTest, ErrorCases) +{ + cudf::test::strings_column_wrapper col1{"a", "b", "c"}; + cudf::test::strings_column_wrapper col2{"d", "e", "f"}; + cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; + cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; + cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, + cudf::test::iterators::nulls_at({0, 2})}; + cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // not enough column names + EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), + cudf::logic_error); + + mt.schema_info.emplace_back("int16"); + out_options.set_metadata(mt); + EXPECT_NO_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource())); + + // chunk_rows must be at least 8 + out_options.set_rows_per_chunk(0); + EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), + cudf::logic_error); +} + +TEST_F(JsonWriterTest, PlainTable) +{ + cudf::test::strings_column_wrapper col1{"a", "b", "c"}; + cudf::test::strings_column_wrapper col2{"d", "e", "f"}; + cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; + 
cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; + cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, + cudf::test::iterators::nulls_at({0, 2})}; + cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + + std::string const expected = + R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); +} + +TEST_F(JsonWriterTest, SimpleNested) +{ + std::string const data = R"( +{"a": 1, "b": 2, "c": {"d": 3 }, "f": 5.5, "g": [1]} +{"a": 6, "b": 7, "c": {"d": 8 }, "f": 10.5, "g": null} +{"a": 1, "b": 2, "c": { "e": 4}, "f": 5.5, "g": [2, null]} +{"a": 6, "b": 7, "c": { "e": 9}, "f": 10.5, "g": [3, 4, 5]} )"; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + cudf::table_view tbl_view = result.tbl->view(); + cudf::io::table_metadata mt{result.metadata}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} +{"a":6,"b":7,"c":{"d":8},"f":10.5} +{"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} +{"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[3,4,5]} +)"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); +} + +TEST_F(JsonWriterTest, MixedNested) +{ + std::string const data = R"( +{"a": 1, "b": 2, "c": {"d": [3] }, "f": 5.5, "g": [ {"h": 1}]} +{"a": 6, "b": 7, "c": {"d": [8] }, "f": 10.5, "g": null} +{"a": 1, "b": 2, "c": { "e": 4}, "f": 5.5, "g": [{"h": 2}, null]} +{"a": 6, "b": 7, "c": { "e": 9}, "f": 10.5, "g": [{"h": 3}, {"h": 4}, {"h": 5}]} )"; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + cudf::table_view tbl_view = result.tbl->view(); + cudf::io::table_metadata mt{result.metadata}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(false) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + std::string const expected = + R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" + R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" + R"({"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[{"h":2},null]},)" + R"({"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[{"h":3},{"h":4},{"h":5}]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); +} + 
+TEST_F(JsonWriterTest, WriteReadNested) +{ + using namespace cudf::test::iterators; + using LCW = cudf::test::lists_column_wrapper; + cudf::test::fixed_width_column_wrapper a{1, 6, 1, 6}; + cudf::test::fixed_width_column_wrapper b{2, 7, 2, 7}; + cudf::test::fixed_width_column_wrapper d{{3, 8, 0, 0}, nulls_at({2, 3})}; + cudf::test::fixed_width_column_wrapper e{{0, 0, 4, 9}, nulls_at({0, 1})}; + cudf::test::structs_column_wrapper c{{d, e}}; + cudf::test::fixed_width_column_wrapper f{5.5, 10.5, 5.5, 10.5}; + LCW g{{LCW{1}, LCW{0}, LCW{{2, 0}, null_at(1)}, LCW{3, 4, 5}}, null_at(1)}; + cudf::table_view tbl_view{{a, b, c, f, g}}; + cudf::io::table_metadata mt{{{"a"}, {"b"}, {"c"}, {"f"}, {"g"}}}; + mt.schema_info[2].children = {{"d"}, {"e"}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(false) + .metadata(mt) + .lines(true) + .na_rep("null") + .build(); + + cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} +{"a":6,"b":7,"c":{"d":8},"f":10.5} +{"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} +{"a":6,"b":7,"c":{"e":9},"f":10.5,"g":[3,4,5]} +)"; + auto const output_string = std::string(out_buffer.data(), out_buffer.size()); + EXPECT_EQ(expected, output_string); + + // Read back the written JSON, and compare with the original table + // Without type information + auto in_options = cudf::io::json_reader_options::builder( + cudf::io::source_info{output_string.data(), output_string.size()}) + .lines(true) + .build(); + + auto result = cudf::io::read_json(in_options); + auto tbl_out = result.tbl->view(); + auto const int64_dtype = cudf::data_type{cudf::type_id::INT64}; + auto const double_dtype = cudf::data_type{cudf::type_id::FLOAT64}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*cudf::cast(a, int64_dtype), tbl_out.column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*cudf::cast(b, int64_dtype), tbl_out.column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, tbl_out.column(2)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*cudf::cast(f, double_dtype), tbl_out.column(3)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(g, tbl_out.column(4)); + + mt.schema_info[4].children = {{"offsets"}, {"element"}}; // list child column names + EXPECT_EQ(mt.schema_info.size(), result.metadata.schema_info.size()); + for (auto i = 0UL; i < mt.schema_info.size(); i++) { + EXPECT_EQ(mt.schema_info[i].name, result.metadata.schema_info[i].name) << "[" << i << "]"; + EXPECT_EQ(mt.schema_info[i].children.size(), result.metadata.schema_info[i].children.size()) + << "[" << i << "]"; + for (auto j = 0UL; j < mt.schema_info[i].children.size(); j++) { + EXPECT_EQ(mt.schema_info[i].children[j].name, result.metadata.schema_info[i].children[j].name) + << "[" << i << "][" << j << "]"; + } + } + + // Read with type information + std::map types; + types["a"] = cudf::io::schema_element{cudf::data_type{cudf::type_id::INT32}}; + types["b"] = cudf::io::schema_element{cudf::data_type{cudf::type_id::UINT8}}; + types["c"] = cudf::io::schema_element{cudf::data_type{cudf::type_id::STRUCT}}; + types["c"].child_types["d"] = cudf::io::schema_element{cudf::data_type{cudf::type_id::INT64}}; + types["c"].child_types["e"] = cudf::io::schema_element{cudf::data_type{cudf::type_id::INT64}}; + types["f"] = cudf::io::schema_element{cudf::data_type{cudf::type_id::FLOAT32}}; + types["g"] = 
cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST}}; + types["g"].child_types["element"] = + cudf::io::schema_element{cudf::data_type{cudf::type_id::INT64}}; + + in_options.set_dtypes(types); + result = cudf::io::read_json(in_options); + tbl_out = result.tbl->view(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(a, tbl_out.column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(b, tbl_out.column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, tbl_out.column(2)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(f, tbl_out.column(3)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(g, tbl_out.column(4)); + EXPECT_EQ(mt.schema_info.size(), result.metadata.schema_info.size()); + for (auto i = 0UL; i < mt.schema_info.size(); i++) { + EXPECT_EQ(mt.schema_info[i].name, result.metadata.schema_info[i].name) << "[" << i << "]"; + EXPECT_EQ(mt.schema_info[i].children.size(), result.metadata.schema_info[i].children.size()) + << "[" << i << "]"; + for (auto j = 0UL; j < mt.schema_info[i].children.size(); j++) { + EXPECT_EQ(mt.schema_info[i].children[j].name, result.metadata.schema_info[i].children[j].name) + << "[" << i << "][" << j << "]"; + } + } + + // Without children column names + mt.schema_info[2].children.clear(); + out_options.set_metadata(mt); + out_buffer.clear(); + cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + + in_options = cudf::io::json_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .build(); + result = cudf::io::read_json(in_options); + + mt.schema_info[2].children = {{"0"}, {"1"}}; + EXPECT_EQ(mt.schema_info.size(), result.metadata.schema_info.size()); + for (auto i = 0UL; i < mt.schema_info.size(); i++) { + EXPECT_EQ(mt.schema_info[i].name, result.metadata.schema_info[i].name) << "[" << i << "]"; + EXPECT_EQ(mt.schema_info[i].children.size(), result.metadata.schema_info[i].children.size()) + << "[" << i << "]"; + for (auto j = 0UL; j < mt.schema_info[i].children.size(); j++) { + EXPECT_EQ(mt.schema_info[i].children[j].name, result.metadata.schema_info[i].children[j].name) + << "[" << i << "][" << j << "]"; + } + } + + // without column names + out_options.set_metadata(cudf::io::table_metadata{}); + out_buffer.clear(); + cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + in_options = cudf::io::json_reader_options::builder( + cudf::io::source_info{out_buffer.data(), out_buffer.size()}) + .lines(true) + .build(); + result = cudf::io::read_json(in_options); + + mt.schema_info = {{"0"}, {"1"}, {"2"}, {"3"}, {"4"}}; + mt.schema_info[2].children = {{"0"}, {"1"}}; + mt.schema_info[4].children = {{"offsets"}, {"element"}}; // list child column names + EXPECT_EQ(mt.schema_info.size(), result.metadata.schema_info.size()); + for (auto i = 0UL; i < mt.schema_info.size(); i++) { + EXPECT_EQ(mt.schema_info[i].name, result.metadata.schema_info[i].name) << "[" << i << "]"; + EXPECT_EQ(mt.schema_info[i].children.size(), result.metadata.schema_info[i].children.size()) + << "[" << i << "]"; + for (auto j = 0UL; j < mt.schema_info[i].children.size(); j++) { + EXPECT_EQ(mt.schema_info[i].children[j].name, result.metadata.schema_info[i].children[j].name) + << "[" << i << "][" << j << "]"; + } + } +} + +TEST_F(JsonWriterTest, SpecialChars) +{ + cudf::test::fixed_width_column_wrapper a{1, 6, 1, 6}; + cudf::test::strings_column_wrapper b{"abcd", "b\b\f\n\r\t", "\"c\"", "/\\"}; + cudf::table_view tbl_view{{a, b}}; + cudf::io::table_metadata mt{{{"\"a\""}, {"\'b\'"}}}; + + std::vector out_buffer; + auto destination = 
cudf::io::sink_info(&out_buffer);
+  auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view)
+                       .include_nulls(false)
+                       .metadata(mt)
+                       .lines(true)
+                       .na_rep("null")
+                       .build();
+
+  cudf::io::write_json(out_options, rmm::mr::get_current_device_resource());
+  std::string const expected = R"({"\"a\"":1,"'b'":"abcd"}
+{"\"a\"":6,"'b'":"b\b\f\n\r\t"}
+{"\"a\"":1,"'b'":"\"c\""}
+{"\"a\"":6,"'b'":"\/\\"}
+)";
+  auto const output_string = std::string(out_buffer.data(), out_buffer.size());
+  EXPECT_EQ(expected, output_string);
+}
+
+CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 37a9f44e882..ad618cc4ed6 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -88,3 +88,55 @@ cdef extern from "cudf/io/json.hpp" \
 
     cdef cudf_io_types.table_with_metadata read_json(
         json_reader_options &options) except +
+
+    cdef cppclass json_writer_options:
+        json_writer_options() except +
+        cudf_io_types.sink_info get_sink() except +
+        cudf_table_view.table_view get_table() except +
+        string get_na_rep() except +
+        bool is_enabled_include_nulls() except +
+        bool is_enabled_lines() except +
+        bool is_enabled_experimental() except +
+        size_type get_rows_per_chunk() except +
+        string get_true_value() except +
+        string get_false_value() except +
+
+        # setters
+        void set_table(cudf_table_view.table_view tbl) except +
+        void set_metadata(cudf_io_types.table_metadata meta) except +
+        void set_na_rep(string val) except +
+        void enable_include_nulls(bool val) except +
+        void enable_lines(bool val) except +
+        void set_rows_per_chunk(size_type val) except +
+        void set_true_value(string val) except +
+        void set_false_value(string val) except +
+
+        @staticmethod
+        json_writer_options_builder builder(
+            cudf_io_types.sink_info sink,
+            cudf_table_view.table_view tbl
+        ) except +
+
+    cdef cppclass json_writer_options_builder:
+        json_writer_options_builder() except +
+        json_writer_options_builder(
+            cudf_io_types.sink_info sink,
+            cudf_table_view.table_view tbl
+        ) except +
+        json_writer_options_builder& table(
+            cudf_table_view.table_view tbl
+        ) except +
+        json_writer_options_builder& metadata(
+            cudf_io_types.table_metadata meta
+        ) except +
+        json_writer_options_builder& na_rep(string val) except +
+        json_writer_options_builder& include_nulls(bool val) except +
+        json_writer_options_builder& lines(bool val) except +
+        json_writer_options_builder& rows_per_chunk(size_type val) except +
+        json_writer_options_builder& true_value(string val) except +
+        json_writer_options_builder& false_value(string val) except +
+
+        json_writer_options build() except +
+
+    cdef void write_json(
+        json_writer_options &options) except +
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 33e70419ee2..a40ba7862b2 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -7,9 +7,11 @@ import os
 from collections import abc
 
 import cudf
+from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 from libcpp.map cimport map
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
@@ -17,13 +19,32 @@ from libcpp.vector cimport vector
 cimport cudf._lib.cpp.io.types as cudf_io_types
 from cudf._lib.cpp.io.json cimport (
     json_reader_options,
+    json_writer_options,
     read_json as libcudf_read_json,
     schema_element,
+    write_json as 
libcudf_write_json, ) +from cudf._lib.cpp.io.types cimport ( + column_name_info, + compression_type, + data_sink, + sink_info, + table_metadata, + table_with_metadata, +) +from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type -from cudf._lib.io.utils cimport make_source_info, update_struct_field_names +from cudf._lib.io.utils cimport ( + make_sink_info, + make_source_info, + update_struct_field_names, +) from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table + +from cudf.api.types import is_list_dtype, is_struct_dtype + +from cudf._lib.column cimport Column cpdef read_json(object filepaths_or_buffers, @@ -127,6 +148,63 @@ cpdef read_json(object filepaths_or_buffers, return df +@acquire_spill_lock() +def write_json( + table, + object path_or_buf=None, + object na_rep="null", + bool include_nulls=True, + bool lines=False, + bool index=False, + int rows_per_chunk=8, +): + """ + Cython function to call into libcudf API, see `write_json`. + + See Also + -------- + cudf.to_json + """ + cdef table_view input_table_view = table_view_from_table( + table, ignore_index=True + ) + + cdef unique_ptr[data_sink] data_sink_c + cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) + cdef string na_c = na_rep.encode() + cdef bool include_nulls_c = include_nulls + cdef bool lines_c = lines + cdef int rows_per_chunk_c = rows_per_chunk + cdef string true_value_c = 'true'.encode() + cdef string false_value_c = 'false'.encode() + cdef table_metadata tbl_meta + + num_index_cols_meta = 0 + cdef column_name_info child_info + for i, name in enumerate(table._column_names, num_index_cols_meta): + child_info.name = name.encode() + tbl_meta.schema_info.push_back(child_info) + _set_col_children_metadata( + table[name]._column, + tbl_meta.schema_info[i] + ) + + cdef json_writer_options options = move( + json_writer_options.builder(sink_info_c, input_table_view) + .metadata(tbl_meta) + .na_rep(na_c) + .include_nulls(include_nulls_c) + .lines(lines_c) + .rows_per_chunk(rows_per_chunk_c) + .true_value(true_value_c) + .false_value(false_value_c) + .build() + ) + + with nogil: + libcudf_write_json(options) + + cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except +: cdef schema_element s_element cdef data_type lib_type @@ -161,3 +239,23 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: dtype = cudf.dtype(dtype) return dtype_to_data_type(dtype) + +cdef _set_col_children_metadata(Column col, + column_name_info& col_meta): + cdef column_name_info child_info + if is_struct_dtype(col): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + child_info.name = name.encode() + col_meta.children.push_back(child_info) + _set_col_children_metadata( + child_col, col_meta.children[i] + ) + elif is_list_dtype(col): + _set_col_children_metadata( + col.children[0], + col_meta.children[0] + ) + else: + return diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index cdf4fce9dd6..4de9a92a068 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -179,12 +179,65 @@ def read_json( @ioutils.doc_to_json() -def to_json(cudf_val, path_or_buf=None, *args, **kwargs): +def to_json( + cudf_val, + path_or_buf=None, + engine="auto", + orient=None, + storage_options=None, + *args, + **kwargs, +): """{docstring}""" - warnings.warn( - "Using CPU via 
Pandas to write JSON dataset, this may "
-        "be GPU accelerated in the future"
-    )
-    pd_value = cudf_val.to_pandas(nullable=True)
-    return pd.io.json.to_json(path_or_buf, pd_value, *args, **kwargs)
+    if engine == "auto":
+        engine = "pandas"
+
+    if engine == "cudf":
+        if orient not in {"records", None}:
+            raise ValueError(
+                f"Only `orient='records'` is supported for the JSON writer"
+                f" with `engine='cudf'`, got {orient}"
+            )
+
+        if path_or_buf is None:
+            path_or_buf = StringIO()
+            return_as_string = True
+        else:
+            path_or_buf = ioutils.get_writer_filepath_or_buffer(
+                path_or_data=path_or_buf,
+                mode="w",
+                storage_options=storage_options,
+            )
+            return_as_string = False
+
+        if ioutils.is_fsspec_open_file(path_or_buf):
+            with path_or_buf as file_obj:
+                file_obj = ioutils.get_IOBase_writer(file_obj)
+                libjson.write_json(
+                    cudf_val, path_or_buf=file_obj, *args, **kwargs
+                )
+        else:
+            libjson.write_json(
+                cudf_val, path_or_buf=path_or_buf, *args, **kwargs
+            )
+
+        if return_as_string:
+            path_or_buf.seek(0)
+            return path_or_buf.read()
+    elif engine == "pandas":
+        warnings.warn("Using CPU via Pandas to write JSON dataset")
+        pd_value = cudf_val.to_pandas(nullable=True)
+        return pd.io.json.to_json(
+            path_or_buf,
+            pd_value,
+            orient=orient,
+            storage_options=storage_options,
+            *args,
+            **kwargs,
+        )
+    else:
+        raise ValueError(
+            f"`engine` only supports {{'auto', 'cudf', 'pandas'}}, "
+            f"got: {engine}"
+        )
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index d693ea112e7..81acb43ee7d 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -14,7 +14,12 @@
 import cudf
 from cudf.core._compat import PANDAS_GE_110
-from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import (
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    TIMEDELTA_TYPES,
+    assert_eq,
+)
 
 
 def make_numeric_dataframe(nrows, dtype):
@@ -52,6 +57,32 @@ def gdf(pdf):
     return cudf.DataFrame.from_pandas(pdf)
 
 
+@pytest.fixture(params=[0, 1, 10, 100])
+def gdf_writer_types(request):
+    # datetime64[us] and datetime64[ns] are unsupported due to a bug in the parser
+    types = (
+        NUMERIC_TYPES
+        + ["datetime64[s]", "datetime64[ms]"]
+        + TIMEDELTA_TYPES
+        + ["bool", "str"]
+    )
+    typer = {"col_" + val: val for val in types}
+    ncols = len(types)
+    nrows = request.param
+
+    # Create a cudf DataFrame of sequential integers, one column per dtype
+    test_pdf = cudf.DataFrame(
+        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
+        columns=pd.Index([f"col_{typ}" for typ in types]),
+    )
+
+    # Cast the integer data in each column to that column's
+    # target dtype
+    test_pdf = test_pdf.astype(typer)
+
+    return test_pdf
+
+
 index_params = [True, False]
 compression_params = ["gzip", "bz2", "zip", "xz", None]
 orient_params = ["columns", "records", "table", "split"]
@@ -156,6 +187,60 @@ def test_json_writer(tmpdir, pdf, gdf):
     assert_eq(pdf_string, gdf_string)
 
 
+def test_cudf_json_writer(pdf):
+    # Drop datetime columns, which the pandas and cudf writers format differently
+    for col_name in pdf.columns:
+        if "datetime" in col_name:
+            pdf.drop(col_name, axis=1, inplace=True)
+    gdf = cudf.DataFrame.from_pandas(pdf)
+    pdf_string = pdf.to_json(orient="records", lines=True)
+    gdf_string = gdf.to_json(orient="records", lines=True, engine="cudf")
+
+    assert_eq(pdf_string, gdf_string)
+
+
+def test_cudf_json_writer_read(gdf_writer_types):
+    dtypes = {
+        col_name: col_name[len("col_") :]
+        for col_name in gdf_writer_types.columns
+    }
+    gdf_string = 
gdf_writer_types.to_json(
+        orient="records", lines=True, engine="cudf"
+    )
+    gdf2 = cudf.read_json(
+        StringIO(gdf_string),
+        lines=True,
+        engine="cudf",
+        dtype=dict(dtypes),
+    )
+    pdf2 = pd.read_json(StringIO(gdf_string), lines=True, dtype=dict(dtypes))
+
+    # Bug in pandas https://github.com/pandas-dev/pandas/issues/28558
+    if pdf2.empty:
+        pdf2.reset_index(drop=True, inplace=True)
+        pdf2.columns = pdf2.columns.astype("object")
+    assert_eq(pdf2, gdf2)
+
+
+@pytest.mark.parametrize("sink", ["string", "file"])
+def test_cudf_json_writer_sinks(sink, tmp_path_factory):
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    target = None
+    if sink == "string":
+        target = StringIO()
+    elif sink == "file":
+        target = tmp_path_factory.mktemp("json") / "test_df.json"
+    df.to_json(target, engine="cudf")
+    if sink == "string":
+        assert (
+            target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]'
+        )
+    elif sink == "file":
+        assert os.path.exists(target)
+        with open(target, "r") as f:
+            assert f.read() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]'
+
+
 @pytest.fixture(
     params=["string", "filepath", "pathobj", "bytes_io", "string_io", "url"]
 )
@@ -436,6 +521,22 @@ def test_json_corner_case_with_escape_and_double_quote_char_with_strings():
         assert expected[col_name][i] == df[col_name][i]
 
 
+def test_json_to_json_special_characters():
+    df = cudf.DataFrame(
+        {
+            "'a'": ['ab"cd', "\\\b", "\r\\", "'"],
+            "b": ["a\tb\t", "\\", '\\"', "\t"],
+            "c": ["aeiou", "try", "json", "cudf"],
+        }
+    )
+
+    actual = StringIO()
+    df.to_json(actual, engine="cudf", lines=True, orient="records")
+    expected = StringIO()
+    df.to_pandas().to_json(expected, lines=True, orient="records")
+    assert expected.getvalue() == actual.getvalue()
+
+
 @pytest.mark.parametrize(
     "gdf,pdf",
    [
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 7f5880bd6c1..56e2e539e01 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -764,6 +764,9 @@
 ----------
 path_or_buf : string or file handle, optional
     File path or object. If not specified, the result is returned as a string.
+engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
+    Engine to use. If 'auto' is passed, the `pandas` engine
+    will be selected.
 orient : string
     Indication of expected JSON string format.
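
A minimal end-to-end sketch of the writer path this change adds, as exercised by the tests above. It assumes a cuDF build that includes this change; the `df`/`roundtrip` names are illustrative only. With `engine="cudf"` only `orient="records"` output is supported, and `engine="auto"` still selects the pandas fallback.

    from io import StringIO

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # Serialize on the GPU via libcudf's write_json instead of the pandas fallback;
    # with no path/buffer argument the JSON is returned as a string.
    json_lines = df.to_json(orient="records", lines=True, engine="cudf")

    # Round-trip through the GPU reader; dtypes may widen (e.g. to int64) unless an
    # explicit dtype mapping is passed, as test_cudf_json_writer_read does.
    roundtrip = cudf.read_json(StringIO(json_lines), lines=True, engine="cudf")
    print(json_lines)
    print(roundtrip)

Writing straight to a file path or file-like object also works, as covered by test_cudf_json_writer_sinks, e.g. `df.to_json("out.json", engine="cudf")`.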