Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand CSV and JSON reader APIs to accept dtypes as a vector or map of data_type objects #8856

Merged
merged 30 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
974c3a3
Merge branch 'branch-21.08' of https://github.com/rapidsai/cudf into …
vuule Jul 21, 2021
e6bd37f
add API that takes a map of data_types
vuule Jul 22, 2021
fbf05cc
using
vuule Jul 22, 2021
523e3ad
dtypes + selected columns fix
vuule Jul 22, 2021
1d5cf00
convert tests to new API
vuule Jul 22, 2021
64a2a07
infer_date -> parse_date for consistency; add parse_hex to libcudf
vuule Jul 22, 2021
d9f951e
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
vuule Jul 23, 2021
ee585c1
use new hex API in tests
vuule Jul 23, 2021
0e24ae8
re-enable json tests that were accidentally disabled
vuule Jul 23, 2021
40ab322
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
vuule Jul 23, 2021
584f180
small refactor to prepare for JSON API expansion
vuule Jul 24, 2021
6b37786
disable the xfail tests
vuule Jul 24, 2021
caf1dbc
Merge branch 'bug--disabled-json-tests' of https://github.com/vuule/c…
vuule Jul 24, 2021
5120d7b
extend JSON API (no tests)
vuule Jul 24, 2021
6e4a888
switch tests to new API
vuule Jul 24, 2021
ef03125
add new APIs to cython defs
vuule Jul 26, 2021
8ae3e1a
add to last missing place
vuule Jul 26, 2021
53c4d15
style fix; missed rename
vuule Jul 26, 2021
1b54375
docs fixes
vuule Jul 26, 2021
6638a94
deprecate APIs
vuule Jul 26, 2021
c5a9e68
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
vuule Aug 2, 2021
c654e10
doc fix
vuule Aug 2, 2021
044b698
Apply suggestions from code review
vuule Aug 2, 2021
eda0ec4
Merge branch 'fea-csv-dtypes-api' of https://github.com/vuule/cudf in…
vuule Aug 2, 2021
af69798
Apply suggestions from code review
vuule Aug 3, 2021
8fa2694
add missing empty line
vuule Aug 3, 2021
4873261
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
vuule Aug 3, 2021
f3d94e9
move visitor_overload to utilities
vuule Aug 3, 2021
829928a
update yaml
vuule Aug 3, 2021
3ebe478
fix copyright year
vuule Aug 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 91 additions & 16 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,19 @@ class csv_reader_options {
// Whether a quote inside a value is double-quoted
bool _doublequote = true;
// Names of columns to read as datetime
std::vector<std::string> _infer_date_names;
std::vector<std::string> _parse_dates_names;
// Indexes of columns to read as datetime
std::vector<int> _infer_date_indexes;
std::vector<int> _parse_dates_indexes;
// Names of columns to parse as hexadecimal
std::vector<std::string> _parse_hex_names;
// Indexes of columns to parse as hexadecimal
std::vector<int> _parse_hex_indexes;

// Conversion settings

// Per-column types; disables type inference on those columns
std::variant<std::vector<std::string>, std::vector<data_type>> _dtypes;
std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
_dtypes;
// Additional values to recognize as boolean true values
std::vector<std::string> _true_values{"True", "TRUE", "true"};
// Additional values to recognize as boolean false values
Expand Down Expand Up @@ -280,17 +285,30 @@ class csv_reader_options {
/**
* @brief Returns names of columns to read as datetime.
*/
std::vector<std::string> const& get_infer_date_names() const { return _infer_date_names; }
std::vector<std::string> const& get_parse_dates_names() const { return _parse_dates_names; }

/**
* @brief Returns indexes of columns to read as datetime.
*/
std::vector<int> const& get_infer_date_indexes() const { return _infer_date_indexes; }
std::vector<int> const& get_parse_dates_indexes() const { return _parse_dates_indexes; }

/**
* @brief Returns names of columns to parse as hexadecimal.
vuule marked this conversation as resolved.
Show resolved Hide resolved
*/
std::vector<std::string> const& get_parse_hex_names() const { return _parse_hex_names; }

/**
* @brief Returns indexes of columns to parse as hexadecimal.
vuule marked this conversation as resolved.
Show resolved Hide resolved
*/
std::vector<int> const& get_parse_hex_indexes() const { return _parse_hex_indexes; }

/**
* @brief Returns per-column types.
*/
std::variant<std::vector<std::string>, std::vector<data_type>> const& get_dtypes() const
std::variant<std::vector<std::string>,
std::vector<data_type>,
std::map<std::string, data_type>> const&
get_dtypes() const
{
return _dtypes;
}
Expand Down Expand Up @@ -547,21 +565,42 @@ class csv_reader_options {
*
* @param col_names Vector of column names to read as datetime.
*/
void set_infer_date_names(std::vector<std::string> col_names)
void set_parse_dates(std::vector<std::string> col_names)
{
_infer_date_names = std::move(col_names);
_parse_dates_names = std::move(col_names);
}

/**
* @brief Sets indexes of columns to read as datetime.
*
* @param col_ind Vector of column indices to read as datetime.
*/
void set_infer_date_indexes(std::vector<int> col_ind)
void set_parse_dates(std::vector<int> col_ind) { _parse_dates_indexes = std::move(col_ind); }

/**
* @brief Sets names of columns to parse as hexadecimal
*
* @param col_names Vector of column names to parse as hexadecimal
*/
void set_parse_hex(std::vector<std::string> col_names)
{
_infer_date_indexes = std::move(col_ind);
_parse_hex_names = std::move(col_names);
}

/**
* @brief Sets indexes of columns to parse as hexadecimal
*
* @param col_ind Vector of column indices to parse as hexadecimal
*/
void set_parse_hex(std::vector<int> col_ind) { _parse_hex_indexes = std::move(col_ind); }

/**
* @brief Sets per-column types
*
* @param types Column name -> data type map specifying the columns' target data types
*/
void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }

/**
* @brief Sets per-column types
*
Expand Down Expand Up @@ -958,24 +997,60 @@ class csv_reader_options_builder {
/**
* @brief Sets names of columns to read as datetime.
*
* @param col_names Vector of column names to infer as datetime.
* @param col_names Vector of column names to read as datetime.
* @return this for chaining.
*/
csv_reader_options_builder& infer_date_names(std::vector<std::string> col_names)
csv_reader_options_builder& parse_dates(std::vector<std::string> col_names)
{
options._infer_date_names = std::move(col_names);
options._parse_dates_names = std::move(col_names);
return *this;
}

/**
* @brief Sets indexes of columns to read as datetime.
*
* @param col_names Vector of column indices to infer as datetime.
* @param col_ind Vector of column indices to read as datetime.
vuule marked this conversation as resolved.
Show resolved Hide resolved
* @return this for chaining.
*/
csv_reader_options_builder& infer_date_indexes(std::vector<int> col_ind)
csv_reader_options_builder& parse_dates(std::vector<int> col_ind)
{
options._infer_date_indexes = std::move(col_ind);
options._parse_dates_indexes = std::move(col_ind);
return *this;
}

/**
* @brief Sets names of columns to parse as hexadecimal.
*
* @param col_names Vector of column names to parse as hexadecimal
* @return this for chaining.
*/
csv_reader_options_builder& parse_hex(std::vector<std::string> col_names)
{
options._parse_hex_names = std::move(col_names);
return *this;
}

/**
* @brief Sets indexes of columns to parse as hexadecimal.
*
* @param col_ind Vector of column indices to parse as hexadecimal
vuule marked this conversation as resolved.
Show resolved Hide resolved
* @return this for chaining.
*/
csv_reader_options_builder& parse_hex(std::vector<int> col_ind)
{
options._parse_hex_indexes = std::move(col_ind);
return *this;
}

/**
* @brief Sets per-column types.
*
* @param types Column name -> data type map specifying the columns' target data types
* @return this for chaining.
*/
csv_reader_options_builder& dtypes(std::map<std::string, data_type> types)
{
options._dtypes = std::move(types);
return *this;
}

Expand Down
59 changes: 53 additions & 6 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@

#include <rmm/mr/device/per_device_resource.hpp>

#include <map>
#include <string>
#include <variant>
#include <vector>

namespace cudf {
Expand Down Expand Up @@ -66,7 +68,8 @@ class json_reader_options {
source_info _source;

// Data types of the column; empty to infer dtypes
std::vector<std::string> _dtypes;
std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
_dtypes;
// Specify the compression format of the source or infer from file extension
compression_type _compression = compression_type::AUTO;

Expand Down Expand Up @@ -114,7 +117,13 @@ class json_reader_options {
/**
* @brief Returns data types of the columns.
*/
std::vector<std::string> const& get_dtypes() const { return _dtypes; }
std::variant<std::vector<std::string>,
std::vector<data_type>,
std::map<std::string, data_type>> const&
get_dtypes() const
{
return _dtypes;
}

/**
* @brief Returns compression format of the source.
Expand Down Expand Up @@ -146,14 +155,28 @@ class json_reader_options {
*
* @param types Vector of dtypes in string format.
vuule marked this conversation as resolved.
Show resolved Hide resolved
*/
void dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }
void set_dtypes(std::vector<std::string> types) { _dtypes = std::move(types); }

/**
* @brief Set data types for columns to be read.
*
* @param types Vector of dtypes.
vuule marked this conversation as resolved.
Show resolved Hide resolved
*/

vuule marked this conversation as resolved.
Show resolved Hide resolved
void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }
/**
vuule marked this conversation as resolved.
Show resolved Hide resolved
* @brief Set data types for columns to be read.
*
* @param types Column name -> dtype map.
*/
void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }

/**
* @brief Set the compression type.
*
* @param comp_type The compression type used.
*/
void compression(compression_type comp_type) { _compression = comp_type; }
void set_compression(compression_type comp_type) { _compression = comp_type; }

/**
* @brief Set number of bytes to skip from source start.
Expand Down Expand Up @@ -205,15 +228,39 @@ class json_reader_options_builder {
/**
* @brief Set data types for columns to be read.
*
* @param types Vector dtypes in string format.
* @return this for chaining.
* @param types Vector of dtypes in string format
* @return this for chaining
*/
json_reader_options_builder& dtypes(std::vector<std::string> types)
{
options._dtypes = std::move(types);
return *this;
}

/**
* @brief Set data types for columns to be read.
*
* @param types Vector of dtypes
* @return this for chaining
*/
json_reader_options_builder& dtypes(std::vector<data_type> types)
{
options._dtypes = std::move(types);
return *this;
}

/**
* @brief Set data types for columns to be read.
*
* @param types Column name -> dtype map.
* @return this for chaining
*/
json_reader_options_builder& dtypes(std::map<std::string, data_type> types)
{
options._dtypes = std::move(types);
return *this;
}

/**
* @brief Set the compression type.
*
Expand Down
70 changes: 61 additions & 9 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,41 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
return {rmm::device_uvector<char>{0, stream}, selected_rows_offsets{stream}};
}

std::vector<data_type> reader::impl::select_data_types(
std::map<std::string, data_type> const& col_type_map)
{
std::vector<data_type> selected_dtypes;

for (int col = 0; col < num_actual_cols_; col++) {
if (column_flags_[col] & column_parse::enabled) {
auto const col_type_it = col_type_map.find(col_names_[col]);
CUDF_EXPECTS(col_type_it != col_type_map.end(),
"Must specify data types for all active columns");
selected_dtypes.emplace_back(col_type_it->second);
}
}
return selected_dtypes;
}

std::vector<data_type> reader::impl::select_data_types(std::vector<data_type> const& dtypes)
{
std::vector<data_type> selected_dtypes;

if (dtypes.size() == 1) {
// If it's a single dtype, assign that dtype to all active columns
selected_dtypes.resize(num_active_cols_, dtypes.front());
} else {
// If it's a list, assign dtypes to active columns in the given order
CUDF_EXPECTS(static_cast<int>(dtypes.size()) >= num_actual_cols_,
"Must specify data types for all columns");

for (int col = 0; col < num_actual_cols_; col++) {
if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); }
}
}
return selected_dtypes;
}

table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
{
auto const data_row_offsets = select_data_and_row_offsets(stream);
Expand Down Expand Up @@ -355,20 +390,34 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
}
}

// User can specify which columns should be inferred as datetime
if (!opts_.get_infer_date_indexes().empty() || !opts_.get_infer_date_names().empty()) {
for (const auto index : opts_.get_infer_date_indexes()) {
// User can specify which columns should be read as datetime
if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) {
for (const auto index : opts_.get_parse_dates_indexes()) {
column_flags_[index] |= column_parse::as_datetime;
}

for (const auto& name : opts_.get_infer_date_names()) {
for (const auto& name : opts_.get_parse_dates_names()) {
auto it = std::find(col_names_.begin(), col_names_.end(), name);
if (it != col_names_.end()) {
column_flags_[it - col_names_.begin()] |= column_parse::as_datetime;
}
}
}

// User can specify which columns should be parsed as hexadecimal
if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) {
for (const auto index : opts_.get_parse_hex_indexes()) {
column_flags_[index] |= column_parse::as_hexadecimal;
}

for (const auto& name : opts_.get_parse_hex_names()) {
auto it = std::find(col_names_.begin(), col_names_.end(), name);
if (it != col_names_.end()) {
column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal;
}
}
}

// Return empty table rather than exception if nothing to load
if (num_active_cols_ == 0) { return {std::make_unique<table>(), {}}; }

Expand All @@ -382,11 +431,14 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
if (has_to_infer_column_types) {
column_types = infer_column_types(data, row_offsets, stream);
} else {
column_types =
std::visit(VisitorOverload{
[&](const std::vector<data_type>& data_types) { return data_types; },
[&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
opts_.get_dtypes());
column_types = std::visit(
VisitorOverload{
[&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
[&](const std::map<std::string, data_type>& data_types) {
return select_data_types(data_types);
},
[&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
opts_.get_dtypes());
}

out_columns.reserve(column_types.size());
Expand Down
Loading