Skip to content

Commit

Permalink
Simplify read_json by removing unnecessary reader/impl classes (#9088)
Browse files Browse the repository at this point in the history
Depends on #9040

Removes the JSON `reader` and `impl` classes and replaces their member variables with local variables. This reduces cognitive overhead and facilitates further refactoring.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)
  - MithunR (https://github.com/mythrocks)
  - Elias Stehle (https://github.com/elstehle)

URL: #9088
  • Loading branch information
cwharris authored Nov 11, 2021
1 parent 3ca7c96 commit 77dc477
Show file tree
Hide file tree
Showing 10 changed files with 272 additions and 577 deletions.
67 changes: 13 additions & 54 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,67 +25,26 @@

#include <rmm/cuda_stream_view.hpp>

// Forward declarations
namespace arrow {
namespace io {
class RandomAccessFile;
}
} // namespace arrow

namespace cudf {
namespace io {
namespace detail {
namespace json {

/**
* @brief Class to read JSON dataset data into columns.
* @brief Reads and returns the entire data set.
*
* @param[in] sources Input `datasource` objects to read the dataset from
* @param[in] options Settings for controlling reading behavior
* @param[in] stream CUDA stream used for device memory operations and kernel launches
* @param[in] mr Device memory resource to use for device memory allocation
*
* @return cudf::table object that contains the array of cudf::column.
*/
class reader {
private:
// Pimpl: `impl` is only forward-declared in this header, so all reader state
// sits behind `_impl` and the implementation details stay out of the header.
class impl;
std::unique_ptr<impl> _impl;

public:
/**
* @brief Constructor from an array of file paths
*
* @param filepaths Paths to the files containing the input dataset
* @param options Settings for controlling reading behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
explicit reader(std::vector<std::string> const& filepaths,
json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Constructor from an array of datasources
*
* @param sources Input `datasource` objects to read the dataset from
* @param options Settings for controlling reading behavior
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Destructor, explicitly declared so that it is not implicitly defined
* inline in this header.
*
* `impl` is an incomplete type here, and destroying `std::unique_ptr<impl>`
* requires the complete type, so the destructor has to be defined where
* `impl` is fully defined.
*/
~reader();

/**
* @brief Reads and returns the entire data set.
*
* @param[in] options Settings for controlling reading behavior
* @param[in] stream CUDA stream for this read; defaults to `rmm::cuda_stream_default`
* @return cudf::table object that contains the array of cudf::column.
*/
table_with_metadata read(json_reader_options const& options,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
};
// `sources` is taken by non-const lvalue reference: the caller keeps ownership
// of the vector and must pass a named (non-temporary) object.
// NOTE(review): whether the datasources are modified or consumed by the call is
// not visible from this declaration; confirm against the implementation.
table_with_metadata read_json(
std::vector<std::unique_ptr<cudf::io::datasource>>& sources,
json_reader_options const& options,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace json
} // namespace detail
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/hash/concurrent_unordered_map.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -538,8 +538,11 @@ class concurrent_unordered_map {
}
}

init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
if (m_capacity > 0) {
init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
}

CUDA_TRY(cudaGetLastError());
}
};
7 changes: 1 addition & 6 deletions cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,6 @@ compression_type infer_compression_type(compression_type compression, source_inf

table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr)
{
namespace json = cudf::io::detail::json;

CUDF_FUNC_RANGE();

options.set_compression(infer_compression_type(options.get_compression(), options.get_source()));
Expand All @@ -193,10 +191,7 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor
options.get_byte_range_offset(),
options.get_byte_range_size_with_padding());

auto reader =
std::make_unique<json::reader>(std::move(datasources), options, rmm::cuda_stream_default, mr);

return reader->read(options);
return detail::json::read_json(datasources, options, rmm::cuda_stream_default, mr);
}

table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr)
Expand Down
23 changes: 0 additions & 23 deletions cpp/src/io/json/json_common.h

This file was deleted.

31 changes: 20 additions & 11 deletions cpp/src/io/json/json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@
* limitations under the License.
*/

#include "json_common.h"
#include "json_gpu.h"

#include <io/csv/datetime.cuh>
#include <io/utilities/column_type_histogram.hpp>
#include <io/utilities/parsing_utils.cuh>

#include <cudf/detail/utilities/hash_functions.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/lists/list_view.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf/utilities/traits.hpp>
Expand Down Expand Up @@ -334,19 +335,19 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
const char* end,
parse_options_view const& opts,
cudf::size_type field_idx,
col_map_type* col_map)
col_map_type col_map)
{
auto const desc_pre_trim =
col_map == nullptr
col_map.capacity() == 0
// No key - column and begin are trivial
? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)}
: [&]() {
auto const key_range = get_next_key(begin, end, opts.quotechar);
auto const key_hash = MurmurHash3_32<cudf::string_view>{}(
cudf::string_view(key_range.first, key_range.second - key_range.first));
auto const hash_col = col_map->find(key_hash);
auto const hash_col = col_map.find(key_hash);
// Fall back to field index if not found (parsing error)
auto const column = (hash_col != col_map->end()) ? (*hash_col).second : field_idx;
auto const column = (hash_col != col_map.end()) ? (*hash_col).second : field_idx;

// Skip the colon between the key and the value
auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1;
Expand Down Expand Up @@ -401,7 +402,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,
device_span<char const> const data,
device_span<uint64_t const> const row_offsets,
device_span<data_type const> const column_types,
col_map_type* col_map,
col_map_type col_map,
device_span<void* const> const output_columns,
device_span<bitmask_type* const> const valid_fields,
device_span<cudf::size_type> const num_valid_fields)
Expand All @@ -421,6 +422,8 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts,

current = desc.value_end + 1;

using string_index_pair = thrust::pair<const char*, size_type>;

// Empty fields are not legal values
if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) {
// Type dispatcher does not handle strings
Expand Down Expand Up @@ -472,14 +475,14 @@ __global__ void detect_data_types_kernel(
parse_options_view const opts,
device_span<char const> const data,
device_span<uint64_t const> const row_offsets,
col_map_type* col_map,
col_map_type col_map,
int num_columns,
device_span<cudf::io::column_type_histogram> const column_infos)
{
auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x);
if (rec_id >= row_offsets.size()) return;

auto const are_rows_objects = col_map != nullptr;
auto const are_rows_objects = col_map.capacity() != 0;
auto const row_data_range = get_row_data_range(data, row_offsets, rec_id);

size_type input_field_index = 0;
Expand Down Expand Up @@ -678,8 +681,14 @@ void convert_json_to_columns(parse_options_view const& opts,

const int grid_size = (row_offsets.size() + block_size - 1) / block_size;

convert_data_to_columns_kernel<<<grid_size, block_size, 0, stream.value()>>>(
opts, data, row_offsets, column_types, col_map, output_columns, valid_fields, num_valid_fields);
convert_data_to_columns_kernel<<<grid_size, block_size, 0, stream.value()>>>(opts,
data,
row_offsets,
column_types,
*col_map,
output_columns,
valid_fields,
num_valid_fields);

CUDA_TRY(cudaGetLastError());
}
Expand Down Expand Up @@ -724,7 +733,7 @@ std::vector<cudf::io::column_type_histogram> detect_data_types(
const int grid_size = (row_offsets.size() + block_size - 1) / block_size;

detect_data_types_kernel<<<grid_size, block_size, 0, stream.value()>>>(
options, data, row_offsets, col_map, num_columns, d_column_infos);
options, data, row_offsets, *col_map, num_columns, d_column_infos);

return cudf::detail::make_std_vector_sync(d_column_infos, stream);
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/json/json_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

#pragma once

#include <io/utilities/column_type_histogram.hpp>
#include <io/utilities/parsing_utils.cuh>
#include "json_common.h"

#include <hash/concurrent_unordered_map.cuh>

Expand Down
Loading

0 comments on commit 77dc477

Please sign in to comment.