diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index fece1cb52b0..455ffce7ed8 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -115,8 +115,7 @@ class csv_reader_options { // Conversion settings // Per-column types; disables type inference on those columns - std::variant, std::vector, std::map> - _dtypes; + std::variant, std::map> _dtypes; // Additional values to recognize as boolean true values std::vector _true_values{"True", "TRUE", "true"}; // Additional values to recognize as boolean false values @@ -305,10 +304,7 @@ class csv_reader_options { /** * @brief Returns per-column types. */ - std::variant, - std::vector, - std::map> const& - get_dtypes() const + std::variant, std::map> const& get_dtypes() const { return _dtypes; } @@ -608,20 +604,6 @@ class csv_reader_options { */ void set_dtypes(std::vector types) { _dtypes = std::move(types); } - /** - * @brief Sets per-column types, specified by the type's respective string representation. - * - * @param types Vector of dtypes in which the column needs to be read. - */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) or " - "dtypes(std::map) instead.")]] void - set_dtypes(std::vector types) - { - _dtypes = std::move(types); - } - /** * @brief Sets additional values to recognize as boolean true values. * @@ -1067,22 +1049,6 @@ class csv_reader_options_builder { return *this; } - /** - * @brief Sets per-column types, specified by the type's respective string representation. - * - * @param types Vector of dtypes in which the column needs to be read. - * @return this for chaining. - */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) or " - "dtypes(std::map) instead.")]] csv_reader_options_builder& - dtypes(std::vector types) - { - options._dtypes = std::move(types); - return *this; - } - /** * @brief Sets additional values to recognize as boolean true values. * diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 60f990c87d8..31201e30ac6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -68,8 +68,7 @@ class json_reader_options { source_info _source; // Data types of the column; empty to infer dtypes - std::variant, std::vector, std::map> - _dtypes; + std::variant, std::map> _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -117,10 +116,7 @@ class json_reader_options { /** * @brief Returns data types of the columns. */ - std::variant, - std::vector, - std::map> const& - get_dtypes() const + std::variant, std::map> const& get_dtypes() const { return _dtypes; } @@ -150,20 +146,6 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } - /** - * @brief Set data types for columns to be read. - * - * @param types Vector of dtypes in string format. - */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) or " - "dtypes(std::map) instead.")]] void - set_dtypes(std::vector types) - { - _dtypes = std::move(types); - } - /** * @brief Set data types for columns to be read. * @@ -232,22 +214,6 @@ class json_reader_options_builder { */ explicit json_reader_options_builder(source_info const& src) : options(src) {} - /** - * @brief Set data types for columns to be read. - * - * @param types Vector of dtypes in string format - * @return this for chaining - */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) or " - "dtypes(std::map) instead.")]] json_reader_options_builder& - dtypes(std::vector types) - { - options._dtypes = std::move(types); - return *this; - } - /** * @brief Set data types for columns to be read. * diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 549b0474fe1..7f85589a8aa 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -27,7 +27,6 @@ #include #include -#include #include #include #include @@ -420,14 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) if (has_to_infer_column_types) { column_types = infer_column_types(data, row_offsets, stream); } else { - column_types = std::visit( - cudf::detail::visitor_overload{ - [&](const std::vector& data_types) { return select_data_types(data_types); }, - [&](const std::map& data_types) { - return select_data_types(data_types); - }, - [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, - opts_.get_dtypes()); + column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, + opts_.get_dtypes()); } out_columns.reserve(column_types.size()); @@ -707,81 +700,6 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::parse_column_types( - const std::vector& types_as_strings) -{ - std::vector dtypes; - - const bool is_dict = std::all_of(types_as_strings.begin(), - types_as_strings.end(), - [](const auto& s) { return s.find(':') != std::string::npos; }); - - if (!is_dict) { - if (types_as_strings.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - data_type dtype_; - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); - dtypes.resize(num_active_cols_, dtype_); - for (int col = 0; col < num_actual_cols_; col++) { - column_flags_[col] |= col_flags_; - } - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_cols_, - "Must specify data types for all columns"); - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); - column_flags_[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - } else { - // Translate vector of `name : dtype` strings to map - // NOTE: Incoming pairs can be out-of-order from column names in dataset - std::unordered_map col_type_map; - for (const auto& pair : types_as_strings) { - const auto pos = pair.find_last_of(':'); - const auto name = pair.substr(0, pos); - const auto dtype = pair.substr(pos + 1, pair.size()); - col_type_map[name] = dtype; - } - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(), - "Must specify data types for all active columns"); - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]); - column_flags_[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { - for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } - } - } - - for (size_t i = 0; i < dtypes.size(); i++) { - // Replace EMPTY dtype with STRING - if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } - } - - return dtypes; -} - std::vector reader::impl::decode_data(device_span data, device_span row_offsets, host_span column_types, diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 36c2bf4f9e7..4416457be16 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -197,15 +197,6 @@ class reader::impl { */ std::vector select_data_types(std::vector const& dtypes); - /** - * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. - * - * @param types_as_strings The vector of strings from which to parse the columns' target data - * types - * @return List of columns' data types - */ - std::vector parse_column_types(std::vector const& types_as_strings); - /** * @brief Converts the row-column data and outputs to column bufferrs. * diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 85608a0984a..f1080342312 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -466,71 +466,32 @@ void reader::impl::set_column_names(device_span rec_starts, } } -std::vector reader::impl::parse_data_types( - std::vector const& types_as_strings) -{ - CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), - "Need to specify the type of each column.\n"); - std::vector dtypes; - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = std::all_of( - std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); - }); - - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; - - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(dtypes), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::back_inserter(dtypes), - [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } - return dtypes; -} - void reader::impl::set_data_types(device_span rec_starts, rmm::cuda_stream_view stream) { bool has_to_infer_column_types = std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); if (!has_to_infer_column_types) { - dtypes_ = std::visit( - cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { return dtypes; }, - [&](const std::map& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }, - [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, - options_.get_dtypes()); + dtypes_ = std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), + "Must specify types for all columns"); + return dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), + "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }}, + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 5cf51369cdf..bbda7e9ba74 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -158,8 +158,6 @@ class reader::impl { */ void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); - std::vector parse_data_types(std::vector const& types_as_strings); - /** * @brief Set the data type array data member * diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 53e0ab14fd3..5b6270a8be1 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1858,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str", "int32", "int32", "int32"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index a263fa0fce0..e83592a028a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -888,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity}); } +TEST_F(JsonReaderTest, BadDtypeParams) +{ + std::string buffer = "[1,2,3,4]"; + + cudf_io::json_reader_options options_vec = + cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) + .lines(true) + .dtypes({dtype()}); + + // should throw because there are four columns and only one dtype + EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error); + + cudf_io::json_reader_options options_map = + cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) + .lines(true) + .dtypes(std::map{{"0", dtype()}, + {"1", dtype()}, + {"2", dtype()}, + {"wrong_name", dtype()}}); + // should throw because one of the columns is not in the dtype map + EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN()