Skip to content

Commit

Permalink
Merge pull request #12636 from rapidsai/branch-23.02
Browse files Browse the repository at this point in the history
Forward-merge branch-23.02 to branch-23.04
  • Loading branch information
GPUtester authored Jan 28, 2023
2 parents ae89e0d + 55ef601 commit 50ca2ac
Show file tree
Hide file tree
Showing 16 changed files with 1,833 additions and 41 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ add_library(
src/io/json/reader_impl.cu
src/io/json/experimental/byte_range_info.cu
src/io/json/experimental/read_json.cpp
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
src/io/orc/orc.cpp
Expand Down
33 changes: 28 additions & 5 deletions cpp/include/cudf/io/detail/data_casting.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,27 @@ __device__ __forceinline__ char get_escape_char(char escaped_char)
}
}

/**
 * @brief Maps a character to the two-character form used when writing it escaped.
 *
 * The characters '"', '\\', '/', backspace, form feed, newline, carriage return and tab
 * produce a backslash followed by their escape letter (or by the character itself for
 * '"', '\\' and '/'). Every other character is returned unchanged in the second slot
 * with '\0' in the first slot.
 *
 * @param escaped_char The character to escape.
 * @return Pair whose first element is '\\' when an escape exists and '\0' otherwise, and
 * whose second element is the escape letter or the original character.
 */
__device__ __forceinline__ thrust::pair<char, char> get_escaped_char(char escaped_char)
{
  // Second character of the escape sequence; '"', '\\' and '/' escape to themselves.
  char escape_code = escaped_char;
  switch (escaped_char) {
    case '"':
    case '\\':
    case '/': break;
    case '\b': escape_code = 'b'; break;
    case '\f': escape_code = 'f'; break;
    case '\n': escape_code = 'n'; break;
    case '\r': escape_code = 'r'; break;
    case '\t': escape_code = 't'; break;
    // No dedicated escape (\uXXXX sequences are not generated here): pass through as-is.
    default: return {'\0', escaped_char};
  }
  return {'\\', escape_code};
}
/**
* @brief Parses the hex value from the four hex digits of a unicode code point escape sequence
* \uXXXX.
Expand Down Expand Up @@ -162,8 +183,10 @@ process_string(in_iterator_t in_begin,
int32_t bytes = 0;
const auto num_in_chars = thrust::distance(in_begin, in_end);
// String values are indicated by keeping the quote character
bool const is_string_value = num_in_chars >= 2LL && (*in_begin == options.quotechar) &&
(*thrust::prev(in_end) == options.quotechar);
bool const is_string_value =
num_in_chars >= 2LL &&
(options.quotechar == '\0' ||
(*in_begin == options.quotechar) && (*thrust::prev(in_end) == options.quotechar));

// Copy literal/numeric value
if (not is_string_value) {
Expand Down Expand Up @@ -282,7 +305,7 @@ struct string_parse {

__device__ void operator()(size_type idx)
{
if (not bit_is_set(null_mask, idx)) {
if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
if (!d_chars) d_offsets[idx] = 0;
return;
}
Expand All @@ -294,7 +317,7 @@ struct string_parse {
auto const is_null_literal =
(!d_chars) &&
serialized_trie_contains(options.trie_na, {in_begin, static_cast<std::size_t>(num_in_chars)});
if (is_null_literal) {
if (is_null_literal && null_mask != nullptr) {
clear_bit(null_mask, idx);
if (!d_chars) d_offsets[idx] = 0;
return;
Expand All @@ -303,7 +326,7 @@ struct string_parse {
char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr;
auto str_process_info = process_string(in_begin, in_end, d_buffer, options);
if (str_process_info.result != data_casting_result::PARSING_SUCCESS) {
clear_bit(null_mask, idx);
if (null_mask != nullptr) clear_bit(null_mask, idx);
if (!d_chars) d_offsets[idx] = 0;
} else {
if (!d_chars) d_offsets[idx] = str_process_info.bytes;
Expand Down
26 changes: 17 additions & 9 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,10 +21,7 @@

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace io {
namespace detail {
namespace json {
namespace cudf::io::json::detail {

/**
* @brief Reads and returns the entire data set.
Expand All @@ -42,7 +39,18 @@ table_with_metadata read_json(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace json
} // namespace detail
} // namespace io
} // namespace cudf
/**
* @brief Write an entire dataset to JSON format.
*
* The function returns nothing; the JSON output is written to `sink`.
*
* @param sink Output sink the JSON output is written to
* @param table The set of columns
* @param options Settings for controlling behavior
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource to use for device memory allocation. Defaults to the
* resource returned by `rmm::mr::get_current_device_resource()`.
*/
void write_json(data_sink* sink,
table_view const& table,
json_writer_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace cudf::io::json::detail
Loading

0 comments on commit 50ca2ac

Please sign in to comment.