Skip to content

Commit

Permalink
Add JSON Writer (#12474)
Browse files Browse the repository at this point in the history
Adds a JSON writer with support for nested types.
It supports numeric, datetime, duration, and string types, as well as nested types such as struct and list.
Only `orient='records'` is currently supported, with `lines=True/False`.
Usage: `df.to_json(engine='cudf')`

closes #11165

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - David Wendt (https://github.com/davidwendt)
  - Michael Wang (https://github.com/isVoid)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #12474
  • Loading branch information
karthikeyann authored Jan 28, 2023
1 parent 7695850 commit 55ef601
Show file tree
Hide file tree
Showing 16 changed files with 1,833 additions and 41 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ add_library(
src/io/json/reader_impl.cu
src/io/json/experimental/byte_range_info.cu
src/io/json/experimental/read_json.cpp
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
src/io/orc/orc.cpp
Expand Down
33 changes: 28 additions & 5 deletions cpp/include/cudf/io/detail/data_casting.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,27 @@ __device__ __forceinline__ char get_escape_char(char escaped_char)
}
}

/**
 * @brief Maps a character to the two-character escape sequence that represents it.
 *
 * Characters that have a short escape form (`"`, `\`, `/`, backspace, form feed,
 * newline, carriage return and tab) yield a backslash in the first slot and the
 * escape letter in the second slot. Any other character is passed through unchanged
 * in the second slot, with `'\0'` in the first slot to signal that no escape applies.
 *
 * @param escaped_char The character to escape.
 * @return `{'\\', escape letter}` when the character has a short escape form,
 * otherwise `{'\0', escaped_char}`.
 */
__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
{
  // Second character of the escape sequence; defaults to the input itself, which is
  // already correct for the characters that are their own escape letter.
  char escape_letter = escaped_char;
  switch (escaped_char) {
    case '"':
    case '\\':
    case '/': break;
    case '\b': escape_letter = 'b'; break;
    case '\f': escape_letter = 'f'; break;
    case '\n': escape_letter = 'n'; break;
    case '\r': escape_letter = 'r'; break;
    case '\t': escape_letter = 't'; break;
    // case 'u': return UNICODE_SEQ;
    // No short escape form: report the character as-is with a '\0' marker.
    default: return {'\0', escaped_char};
  }
  return {'\\', escape_letter};
}
/**
* @brief Parses the hex value from the four hex digits of a unicode code point escape sequence
* \uXXXX.
Expand Down Expand Up @@ -162,8 +183,10 @@ process_string(in_iterator_t in_begin,
int32_t bytes = 0;
const auto num_in_chars = thrust::distance(in_begin, in_end);
// String values are indicated by keeping the quote character
bool const is_string_value = num_in_chars >= 2LL && (*in_begin == options.quotechar) &&
(*thrust::prev(in_end) == options.quotechar);
bool const is_string_value =
num_in_chars >= 2LL &&
(options.quotechar == '\0' ||
(*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));

// Copy literal/numeric value
if (not is_string_value) {
Expand Down Expand Up @@ -282,7 +305,7 @@ struct string_parse {

__device__ void operator()(size_type idx)
{
if (not bit_is_set(null_mask, idx)) {
if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}
Expand All @@ -294,7 +317,7 @@ struct string_parse {
auto const is_null_literal =
(!d_chars) &&
serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
if (is_null_literal) {
if (is_null_literal && null_mask != nullptr) {
clear_bit(null_mask, idx);
if (!d_chars) d_offsets[idx] = 0;
return;
Expand All @@ -303,7 +326,7 @@ struct string_parse {
char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
clear_bit(null_mask, idx);
if (null_mask != nullptr) clear_bit(null_mask, idx);
if (!d_chars) d_offsets[idx] = 0;
} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
Expand Down
26 changes: 17 additions & 9 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,10 +21,7 @@

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace io {
namespace detail {
namespace json {
namespace cudf::io::json::detail {

/**
* @brief Reads and returns the entire data set.
Expand All @@ -42,7 +39,18 @@ table_with_metadata read_json(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace json
} // namespace detail
} // namespace io
} // namespace cudf
/**
 * @brief Write an entire dataset to JSON format.
 *
 * This is a declaration only; the function returns nothing (`void`).
 *
 * @param sink Output sink
 * @param table The set of columns
 * @param options Settings for controlling behavior
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource to use for device memory allocation; when omitted,
 * the resource returned by `rmm::mr::get_current_device_resource()` is used
 */
void write_json(data_sink* sink,
table_view const& table,
json_writer_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace cudf::io::json::detail
Loading

0 comments on commit 55ef601

Please sign in to comment.