Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/branch-21.10' into unflatten-nes…
Browse files Browse the repository at this point in the history
…ted-columns
  • Loading branch information
mythrocks committed Aug 20, 2021
2 parents 7582913 + 5869264 commit 94bb184
Show file tree
Hide file tree
Showing 20 changed files with 1,097 additions and 522 deletions.
38 changes: 2 additions & 36 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,7 @@ class csv_reader_options {
// Conversion settings

// Per-column types; disables type inference on those columns
std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
_dtypes;
std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
// Additional values to recognize as boolean true values
std::vector<std::string> _true_values{"True", "TRUE", "true"};
// Additional values to recognize as boolean false values
Expand Down Expand Up @@ -305,10 +304,7 @@ class csv_reader_options {
/**
* @brief Returns per-column types.
*/
std::variant<std::vector<std::string>,
std::vector<data_type>,
std::map<std::string, data_type>> const&
get_dtypes() const
std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
{
return _dtypes;
}
Expand Down Expand Up @@ -608,20 +604,6 @@ class csv_reader_options {
*/
void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }

/**
 * @brief Sets per-column types, specified by the type's respective string representation.
 *
 * The strings are moved into the `_dtypes` variant as-is.
 *
 * @deprecated Use the `data_type`-based interfaces named in the attribute message instead.
 *
 * @param types Vector of dtypes (as strings) in which the columns need to be read.
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] void
set_dtypes(std::vector<std::string> types)
{
  _dtypes = std::move(types);
}

/**
* @brief Sets additional values to recognize as boolean true values.
*
Expand Down Expand Up @@ -1067,22 +1049,6 @@ class csv_reader_options_builder {
return *this;
}

/**
 * @brief Sets per-column types, specified by the type's respective string representation.
 *
 * The strings are moved into the `_dtypes` variant of the options being built.
 *
 * @deprecated Use the `data_type`-based overloads named in the attribute message instead.
 *
 * @param types Vector of dtypes (as strings) in which the columns need to be read.
 * @return this for chaining.
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] csv_reader_options_builder&
dtypes(std::vector<std::string> types)
{
  options._dtypes = std::move(types);
  return *this;
}

/**
* @brief Sets additional values to recognize as boolean true values.
*
Expand Down
38 changes: 2 additions & 36 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ class json_reader_options {
source_info _source;

// Data types of the column; empty to infer dtypes
std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
_dtypes;
std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
// Specify the compression format of the source or infer from file extension
compression_type _compression = compression_type::AUTO;

Expand Down Expand Up @@ -117,10 +116,7 @@ class json_reader_options {
/**
* @brief Returns data types of the columns.
*/
std::variant<std::vector<std::string>,
std::vector<data_type>,
std::map<std::string, data_type>> const&
get_dtypes() const
std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
{
return _dtypes;
}
Expand Down Expand Up @@ -150,20 +146,6 @@ class json_reader_options {
*/
bool is_enabled_dayfirst() const { return _dayfirst; }

/**
 * @brief Set data types for columns to be read.
 *
 * The strings are moved into the `_dtypes` variant as-is.
 *
 * @deprecated Use the `data_type`-based interfaces named in the attribute message instead.
 *
 * @param types Vector of dtypes in string format.
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] void
set_dtypes(std::vector<std::string> types)
{
  _dtypes = std::move(types);
}

/**
* @brief Set data types for columns to be read.
*
Expand Down Expand Up @@ -232,22 +214,6 @@ class json_reader_options_builder {
*/
explicit json_reader_options_builder(source_info const& src) : options(src) {}

/**
 * @brief Set data types for columns to be read.
 *
 * The strings are moved into the `_dtypes` variant of the options being built.
 *
 * @deprecated Use the `data_type`-based overloads named in the attribute message instead.
 *
 * @param types Vector of dtypes in string format
 * @return this for chaining
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] json_reader_options_builder&
dtypes(std::vector<std::string> types)
{
  options._dtypes = std::move(types);
  return *this;
}

/**
* @brief Set data types for columns to be read.
*
Expand Down
86 changes: 2 additions & 84 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/utilities/visitor_overload.hpp>
#include <cudf/io/types.hpp>
#include <cudf/strings/replace.hpp>
#include <cudf/table/table.hpp>
Expand Down Expand Up @@ -420,14 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
if (has_to_infer_column_types) {
column_types = infer_column_types(data, row_offsets, stream);
} else {
column_types = std::visit(
cudf::detail::visitor_overload{
[&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
[&](const std::map<std::string, data_type>& data_types) {
return select_data_types(data_types);
},
[&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
opts_.get_dtypes());
column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
opts_.get_dtypes());
}

out_columns.reserve(column_types.size());
Expand Down Expand Up @@ -707,81 +700,6 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
return dtypes;
}

std::vector<data_type> reader::impl::parse_column_types(
const std::vector<std::string>& types_as_strings)
{
std::vector<data_type> dtypes;

const bool is_dict = std::all_of(types_as_strings.begin(),
types_as_strings.end(),
[](const auto& s) { return s.find(':') != std::string::npos; });

if (!is_dict) {
if (types_as_strings.size() == 1) {
// If it's a single dtype, assign that dtype to all active columns
data_type dtype_;
column_parse::flags col_flags_;
std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
dtypes.resize(num_active_cols_, dtype_);
for (int col = 0; col < num_actual_cols_; col++) {
column_flags_[col] |= col_flags_;
}
CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
} else {
// If it's a list, assign dtypes to active columns in the given order
CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_cols_,
"Must specify data types for all columns");

auto dtype_ = std::back_inserter(dtypes);

for (int col = 0; col < num_actual_cols_; col++) {
if (column_flags_[col] & column_parse::enabled) {
column_parse::flags col_flags_;
std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
column_flags_[col] |= col_flags_;
CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
}
}
}
} else {
// Translate vector of `name : dtype` strings to map
// NOTE: Incoming pairs can be out-of-order from column names in dataset
std::unordered_map<std::string, std::string> col_type_map;
for (const auto& pair : types_as_strings) {
const auto pos = pair.find_last_of(':');
const auto name = pair.substr(0, pos);
const auto dtype = pair.substr(pos + 1, pair.size());
col_type_map[name] = dtype;
}

auto dtype_ = std::back_inserter(dtypes);

for (int col = 0; col < num_actual_cols_; col++) {
if (column_flags_[col] & column_parse::enabled) {
CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
"Must specify data types for all active columns");
column_parse::flags col_flags_;
std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
column_flags_[col] |= col_flags_;
CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
}
}
}

if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
for (auto& type : dtypes) {
if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
}
}

for (size_t i = 0; i < dtypes.size(); i++) {
// Replace EMPTY dtype with STRING
if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
}

return dtypes;
}

std::vector<column_buffer> reader::impl::decode_data(device_span<char const> data,
device_span<uint64_t const> row_offsets,
host_span<data_type const> column_types,
Expand Down
9 changes: 0 additions & 9 deletions cpp/src/io/csv/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,6 @@ class reader::impl {
*/
std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);

/**
* @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
*
* @param types_as_strings The vector of strings from which to parse the columns' target data
* types
* @return List of columns' data types
*/
std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);

/**
* @brief Converts the row-column data and outputs to column buffers.
*
Expand Down
79 changes: 20 additions & 59 deletions cpp/src/io/json/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -466,71 +466,32 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
}
}

std::vector<data_type> reader::impl::parse_data_types(
std::vector<std::string> const& types_as_strings)
{
CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
"Need to specify the type of each column.\n");
std::vector<data_type> dtypes;
// Assume that the dtype is in dictionary format only if all elements contain a colon
const bool is_dict = std::all_of(
std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
});

auto split_on_colon = [](std::string_view s) {
auto const i = s.find(":");
return std::pair{s.substr(0, i), s.substr(i + 1)};
};

if (is_dict) {
std::map<std::string, data_type> col_type_map;
std::transform(
std::cbegin(types_as_strings),
std::cend(types_as_strings),
std::inserter(col_type_map, col_type_map.end()),
[&](auto const& ts) {
auto const [col_name, type_str] = split_on_colon(ts);
return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
});

// Using the map here allows O(n log n) complexity
std::transform(std::cbegin(metadata_.column_names),
std::cend(metadata_.column_names),
std::back_inserter(dtypes),
[&](auto const& column_name) { return col_type_map[column_name]; });
} else {
std::transform(std::cbegin(types_as_strings),
std::cend(types_as_strings),
std::back_inserter(dtypes),
[](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
}
return dtypes;
}

void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
rmm::cuda_stream_view stream)
{
bool has_to_infer_column_types =
std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
if (!has_to_infer_column_types) {
dtypes_ = std::visit(
cudf::detail::visitor_overload{
[&](const std::vector<data_type>& dtypes) { return dtypes; },
[&](const std::map<std::string, data_type>& dtypes) {
std::vector<data_type> sorted_dtypes;
std::transform(std::cbegin(metadata_.column_names),
std::cend(metadata_.column_names),
std::back_inserter(sorted_dtypes),
[&](auto const& column_name) {
auto const it = dtypes.find(column_name);
CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
return it->second;
});
return sorted_dtypes;
},
[&](std::vector<std::string> const& dtypes) { return parse_data_types(dtypes); }},
options_.get_dtypes());
dtypes_ = std::visit(cudf::detail::visitor_overload{
[&](const std::vector<data_type>& dtypes) {
CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(),
"Must specify types for all columns");
return dtypes;
},
[&](const std::map<std::string, data_type>& dtypes) {
std::vector<data_type> sorted_dtypes;
std::transform(std::cbegin(metadata_.column_names),
std::cend(metadata_.column_names),
std::back_inserter(sorted_dtypes),
[&](auto const& column_name) {
auto const it = dtypes.find(column_name);
CUDF_EXPECTS(it != dtypes.end(),
"Must specify types for all columns");
return it->second;
});
return sorted_dtypes;
}},
options_.get_dtypes());
} else {
CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
auto const num_columns = metadata_.column_names.size();
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/io/json/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,6 @@ class reader::impl {
*/
void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);

/**
 * @brief Parses the columns' data types from dtypes that are provided as strings.
 *
 * @param types_as_strings Per-column dtypes in string format; either plain dtype names or
 * `name:dtype` pairs
 * @return List of columns' data types
 */
std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);

/**
* @brief Set the data type array data member
*
Expand Down
6 changes: 5 additions & 1 deletion cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1858,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
cudf_io::csv_reader_options in_opts =
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
.names(names)
.dtypes(std::vector<std::string>{"int32", "str", "int32", "int32", "int32"});
.dtypes({dtype<int32_t>(),
dtype<cudf::string_view>(),
dtype<int32_t>(),
dtype<int32_t>(),
dtype<int32_t>()});
auto result = cudf_io::read_csv(in_opts);

CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
Expand Down
23 changes: 23 additions & 0 deletions cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs)
float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity});
}

// Checks that read_json rejects dtype specifications that do not cover every column.
TEST_F(JsonReaderTest, BadDtypeParams)
{
  // One JSON-lines record holding four values
  std::string buffer = "[1,2,3,4]";
  cudf_io::source_info const src{buffer.c_str(), buffer.size()};

  // Vector form: a single dtype is given although the record has four columns
  cudf_io::json_reader_options options_vec =
    cudf_io::json_reader_options::builder(src).lines(true).dtypes({dtype<int8_t>()});

  // should throw because there are four columns and only one dtype
  EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error);

  // Map form: the fourth key does not match any column name
  std::map<std::string, cudf::data_type> const mismatched_types{{"0", dtype<int8_t>()},
                                                                {"1", dtype<int8_t>()},
                                                                {"2", dtype<int8_t>()},
                                                                {"wrong_name", dtype<int8_t>()}};
  cudf_io::json_reader_options options_map =
    cudf_io::json_reader_options::builder(src).lines(true).dtypes(mismatched_types);

  // should throw because one of the columns is not in the dtype map
  EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error);
}

CUDF_TEST_PROGRAM_MAIN()
Loading

0 comments on commit 94bb184

Please sign in to comment.