Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove the option to pass data types as strings to read_csv and read_json #9079

Merged
merged 2 commits into from
Aug 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 2 additions & 36 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,7 @@ class csv_reader_options {
// Conversion settings

// Per-column types; disables type inference on those columns
std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
_dtypes;
std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
// Additional values to recognize as boolean true values
std::vector<std::string> _true_values{"True", "TRUE", "true"};
// Additional values to recognize as boolean false values
Expand Down Expand Up @@ -305,10 +304,7 @@ class csv_reader_options {
/**
* @brief Returns per-column types.
*/
std::variant<std::vector<std::string>,
std::vector<data_type>,
std::map<std::string, data_type>> const&
get_dtypes() const
std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
{
return _dtypes;
}
Expand Down Expand Up @@ -608,20 +604,6 @@ class csv_reader_options {
*/
void set_dtypes(std::vector<data_type> types) { _dtypes = std::move(types); }

/**
 * @brief Sets per-column types, specified by the type's respective string representation.
 *
 * @deprecated String-based dtypes are deprecated; pass `data_type` values instead, as named in
 * the attribute message below.
 *
 * @param types Vector of dtypes in which the column needs to be read.
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] void
set_dtypes(std::vector<std::string> types)
{
  // `types` is received by value, so it can be moved into the variant without another copy.
  _dtypes = std::move(types);
}

/**
* @brief Sets additional values to recognize as boolean true values.
*
Expand Down Expand Up @@ -1067,22 +1049,6 @@ class csv_reader_options_builder {
return *this;
}

/**
 * @brief Sets per-column types, specified by the type's respective string representation.
 *
 * @deprecated String-based dtypes are deprecated; pass `data_type` values instead, as named in
 * the attribute message below.
 *
 * @param types Vector of dtypes in which the column needs to be read.
 * @return this for chaining.
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] csv_reader_options_builder&
dtypes(std::vector<std::string> types)
{
  // `types` is received by value, so it can be moved into the variant without another copy.
  options._dtypes = std::move(types);
  return *this;
}

/**
* @brief Sets additional values to recognize as boolean true values.
*
Expand Down
38 changes: 2 additions & 36 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ class json_reader_options {
source_info _source;

// Data types of the column; empty to infer dtypes
std::variant<std::vector<std::string>, std::vector<data_type>, std::map<std::string, data_type>>
_dtypes;
std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
// Specify the compression format of the source or infer from file extension
compression_type _compression = compression_type::AUTO;

Expand Down Expand Up @@ -117,10 +116,7 @@ class json_reader_options {
/**
* @brief Returns data types of the columns.
*/
std::variant<std::vector<std::string>,
std::vector<data_type>,
std::map<std::string, data_type>> const&
get_dtypes() const
std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
{
return _dtypes;
}
Expand Down Expand Up @@ -150,20 +146,6 @@ class json_reader_options {
*/
bool is_enabled_dayfirst() const { return _dayfirst; }

/**
 * @brief Set data types for columns to be read.
 *
 * @deprecated String-based dtypes are deprecated; pass `data_type` values instead, as named in
 * the attribute message below.
 *
 * @param types Vector of dtypes in string format.
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] void
set_dtypes(std::vector<std::string> types)
{
  // `types` is received by value, so it can be moved into the variant without another copy.
  _dtypes = std::move(types);
}

/**
* @brief Set data types for columns to be read.
*
Expand Down Expand Up @@ -232,22 +214,6 @@ class json_reader_options_builder {
*/
explicit json_reader_options_builder(source_info const& src) : options(src) {}

/**
 * @brief Set data types for columns to be read.
 *
 * @deprecated String-based dtypes are deprecated; pass `data_type` values instead, as named in
 * the attribute message below.
 *
 * @param types Vector of dtypes in string format
 * @return this for chaining
 */
[[deprecated(
  "The string-based interface will be deprecated. "
  "Use dtypes(std::vector<data_type>) or "
  "dtypes(std::map<std::string, data_type>) instead.")]] json_reader_options_builder&
dtypes(std::vector<std::string> types)
{
  // `types` is received by value, so it can be moved into the variant without another copy.
  options._dtypes = std::move(types);
  return *this;
}

/**
* @brief Set data types for columns to be read.
*
Expand Down
86 changes: 2 additions & 84 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/detail/utilities/visitor_overload.hpp>
#include <cudf/io/types.hpp>
#include <cudf/strings/replace.hpp>
#include <cudf/table/table.hpp>
Expand Down Expand Up @@ -420,14 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
if (has_to_infer_column_types) {
column_types = infer_column_types(data, row_offsets, stream);
} else {
column_types = std::visit(
cudf::detail::visitor_overload{
[&](const std::vector<data_type>& data_types) { return select_data_types(data_types); },
[&](const std::map<std::string, data_type>& data_types) {
return select_data_types(data_types);
},
[&](const std::vector<string>& dtypes) { return parse_column_types(dtypes); }},
opts_.get_dtypes());
column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); },
opts_.get_dtypes());
}

out_columns.reserve(column_types.size());
Expand Down Expand Up @@ -707,81 +700,6 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
return dtypes;
}

/**
 * @brief Converts dtypes given as strings into the data types of the columns to be read.
 *
 * Three input forms are handled:
 * - a single string: that dtype is used for every active column;
 * - a list without colons: entry `i` is the dtype of column `i`;
 * - a list in which every entry contains a colon: each entry is a `name:dtype` pair and is
 *   matched to the columns by name.
 *
 * As a side effect, the parse flags returned by `get_dtype_info` are OR-ed into `column_flags_`.
 *
 * @param types_as_strings The dtype strings to parse
 * @return Data types of the columns; in the list and `name:dtype` forms only columns flagged as
 * enabled contribute an entry
 */
std::vector<data_type> reader::impl::parse_column_types(
const std::vector<std::string>& types_as_strings)
{
std::vector<data_type> dtypes;

// The input is treated as `name:dtype` pairs only if every entry contains a colon.
// Note that `std::all_of` is true for an empty input, which therefore takes the pair branch.
const bool is_dict = std::all_of(types_as_strings.begin(),
types_as_strings.end(),
[](const auto& s) { return s.find(':') != std::string::npos; });

if (!is_dict) {
if (types_as_strings.size() == 1) {
// If it's a single dtype, assign that dtype to all active columns
data_type dtype_;
column_parse::flags col_flags_;
std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
dtypes.resize(num_active_cols_, dtype_);
// The flags are applied to every actual column, not only to the active ones.
for (int col = 0; col < num_actual_cols_; col++) {
column_flags_[col] |= col_flags_;
}
// NOTE(review): `back()` is called on an empty vector if num_active_cols_ is 0 - confirm that
// callers guarantee at least one active column.
CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
} else {
// If it's a list, assign dtypes to active columns in the given order
CUDF_EXPECTS(static_cast<int>(types_as_strings.size()) >= num_actual_cols_,
"Must specify data types for all columns");

// Assigning through this iterator (via std::tie below) appends the parsed type to `dtypes`.
auto dtype_ = std::back_inserter(dtypes);

for (int col = 0; col < num_actual_cols_; col++) {
if (column_flags_[col] & column_parse::enabled) {
column_parse::flags col_flags_;
// The list is indexed by actual column position, so entries of disabled columns are skipped.
std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]);
column_flags_[col] |= col_flags_;
CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
}
}
}
} else {
// Translate vector of `name : dtype` strings to map
// NOTE: Incoming pairs can be out-of-order from column names in dataset
std::unordered_map<std::string, std::string> col_type_map;
for (const auto& pair : types_as_strings) {
// Split on the last colon, so a column name may itself contain colons.
const auto pos = pair.find_last_of(':');
const auto name = pair.substr(0, pos);
const auto dtype = pair.substr(pos + 1, pair.size());
col_type_map[name] = dtype;
}

// Assigning through this iterator (via std::tie below) appends the parsed type to `dtypes`.
auto dtype_ = std::back_inserter(dtypes);

for (int col = 0; col < num_actual_cols_; col++) {
if (column_flags_[col] & column_parse::enabled) {
CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(),
"Must specify data types for all active columns");
column_parse::flags col_flags_;
std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]);
column_flags_[col] |= col_flags_;
CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
}
}
}

// A timestamp type set in the reader options overrides every parsed timestamp type.
if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
for (auto& type : dtypes) {
if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
}
}

// NOTE(review): every branch above already rejects EMPTY via CUDF_EXPECTS, so this loop looks
// unreachable in practice - confirm before relying on the STRING fallback.
for (size_t i = 0; i < dtypes.size(); i++) {
// Replace EMPTY dtype with STRING
if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; }
}

return dtypes;
}

std::vector<column_buffer> reader::impl::decode_data(device_span<char const> data,
device_span<uint64_t const> row_offsets,
host_span<data_type const> column_types,
Expand Down
9 changes: 0 additions & 9 deletions cpp/src/io/csv/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,6 @@ class reader::impl {
*/
std::vector<data_type> select_data_types(std::vector<data_type> const& dtypes);

/**
 * @brief Parses the columns' data types from the vector of dtypes that are provided as strings.
 *
 * The strings are interpreted as `name:dtype` pairs when every entry contains a colon; otherwise
 * a single entry is applied to all active columns and a longer list is read in column order.
 *
 * @param types_as_strings The vector of strings from which to parse the columns' target data
 * types
 * @return List of columns' data types
 */
std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);

/**
* @brief Converts the row-column data and outputs to column buffers.
*
Expand Down
79 changes: 20 additions & 59 deletions cpp/src/io/json/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -466,71 +466,32 @@ void reader::impl::set_column_names(device_span<uint64_t const> rec_starts,
}
}

std::vector<data_type> reader::impl::parse_data_types(
std::vector<std::string> const& types_as_strings)
{
CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(),
"Need to specify the type of each column.\n");
std::vector<data_type> dtypes;
// Assume that the dtype is in dictionary format only if all elements contain a colon
const bool is_dict = std::all_of(
std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) {
return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s);
});

auto split_on_colon = [](std::string_view s) {
auto const i = s.find(":");
return std::pair{s.substr(0, i), s.substr(i + 1)};
};

if (is_dict) {
std::map<std::string, data_type> col_type_map;
std::transform(
std::cbegin(types_as_strings),
std::cend(types_as_strings),
std::inserter(col_type_map, col_type_map.end()),
[&](auto const& ts) {
auto const [col_name, type_str] = split_on_colon(ts);
return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})};
});

// Using the map here allows O(n log n) complexity
std::transform(std::cbegin(metadata_.column_names),
std::cend(metadata_.column_names),
std::back_inserter(dtypes),
[&](auto const& column_name) { return col_type_map[column_name]; });
} else {
std::transform(std::cbegin(types_as_strings),
std::cend(types_as_strings),
std::back_inserter(dtypes),
[](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); });
}
return dtypes;
}

void reader::impl::set_data_types(device_span<uint64_t const> rec_starts,
rmm::cuda_stream_view stream)
{
bool has_to_infer_column_types =
std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes());
if (!has_to_infer_column_types) {
dtypes_ = std::visit(
cudf::detail::visitor_overload{
[&](const std::vector<data_type>& dtypes) { return dtypes; },
[&](const std::map<std::string, data_type>& dtypes) {
std::vector<data_type> sorted_dtypes;
std::transform(std::cbegin(metadata_.column_names),
std::cend(metadata_.column_names),
std::back_inserter(sorted_dtypes),
[&](auto const& column_name) {
auto const it = dtypes.find(column_name);
CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
return it->second;
});
return sorted_dtypes;
},
[&](std::vector<std::string> const& dtypes) { return parse_data_types(dtypes); }},
options_.get_dtypes());
dtypes_ = std::visit(cudf::detail::visitor_overload{
[&](const std::vector<data_type>& dtypes) {
CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(),
"Must specify types for all columns");
return dtypes;
},
[&](const std::map<std::string, data_type>& dtypes) {
std::vector<data_type> sorted_dtypes;
std::transform(std::cbegin(metadata_.column_names),
std::cend(metadata_.column_names),
std::back_inserter(sorted_dtypes),
[&](auto const& column_name) {
auto const it = dtypes.find(column_name);
CUDF_EXPECTS(it != dtypes.end(),
"Must specify types for all columns");
return it->second;
});
return sorted_dtypes;
}},
options_.get_dtypes());
} else {
CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n");
auto const num_columns = metadata_.column_names.size();
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/io/json/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,6 @@ class reader::impl {
*/
void set_column_names(device_span<uint64_t const> rec_starts, rmm::cuda_stream_view stream);

/**
 * @brief Parses the columns' data types from their string representations.
 *
 * @param types_as_strings One string per column: either type names, or `name:type` pairs when
 * every entry contains a colon
 * @return Data type of each column
 */
std::vector<data_type> parse_data_types(std::vector<std::string> const& types_as_strings);

/**
* @brief Set the data type array data member
*
Expand Down
6 changes: 5 additions & 1 deletion cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1858,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter)
cudf_io::csv_reader_options in_opts =
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
.names(names)
.dtypes(std::vector<std::string>{"int32", "str", "int32", "int32", "int32"});
.dtypes({dtype<int32_t>(),
dtype<cudf::string_view>(),
dtype<int32_t>(),
dtype<int32_t>(),
dtype<int32_t>()});
auto result = cudf_io::read_csv(in_opts);

CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view());
Expand Down
23 changes: 23 additions & 0 deletions cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs)
float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity});
}

// Checks that read_json rejects user-provided dtypes that do not cover every column of the input.
TEST_F(JsonReaderTest, BadDtypeParams)
{
  // A single JSON line holding four values, i.e. four columns.
  std::string const json_line = "[1,2,3,4]";
  auto const make_source      = [&json_line]() {
    return cudf_io::source_info{json_line.c_str(), json_line.size()};
  };

  // Vector form: one dtype for four columns must be rejected.
  std::vector<cudf::data_type> const too_few_dtypes{dtype<int8_t>()};
  cudf_io::json_reader_options vector_opts =
    cudf_io::json_reader_options::builder(make_source()).lines(true).dtypes(too_few_dtypes);
  EXPECT_THROW(cudf_io::read_json(vector_opts), cudf::logic_error);

  // Map form: an entry under an unknown name leaves one column without a dtype, which must be
  // rejected as well.
  std::map<std::string, cudf::data_type> const misnamed_dtypes{{"0", dtype<int8_t>()},
                                                               {"1", dtype<int8_t>()},
                                                               {"2", dtype<int8_t>()},
                                                               {"wrong_name", dtype<int8_t>()}};
  cudf_io::json_reader_options map_opts =
    cudf_io::json_reader_options::builder(make_source()).lines(true).dtypes(misnamed_dtypes);
  EXPECT_THROW(cudf_io::read_json(map_opts), cudf::logic_error);
}

CUDF_TEST_PROGRAM_MAIN()