From e6bd37f04f79341f811a5a7a12d7c212071fd7d1 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 22 Jul 2021 01:07:57 -0700 Subject: [PATCH 01/23] add API that takes a map of data_types --- cpp/include/cudf/io/csv.hpp | 27 +++++++++++++++++++++++++-- cpp/src/io/csv/reader_impl.cu | 21 +++++++++++++++++++++ cpp/src/io/csv/reader_impl.hpp | 8 ++++++++ cpp/tests/io/csv_test.cpp | 33 ++++++++++++++++++++++++++++++++- 4 files changed, 86 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 1dff99735ec..e70353dff9f 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -111,7 +111,8 @@ class csv_reader_options { // Conversion settings // Per-column types; disables type inference on those columns - std::variant, std::vector> _dtypes; + std::variant, std::vector, std::map> + _dtypes; // Additional values to recognize as boolean true values std::vector _true_values{"True", "TRUE", "true"}; // Additional values to recognize as boolean false values @@ -290,7 +291,10 @@ class csv_reader_options { /** * @brief Returns per-column types. */ - std::variant, std::vector> const& get_dtypes() const + std::variant, + std::vector, + std::map> const& + get_dtypes() const { return _dtypes; } @@ -562,6 +566,13 @@ class csv_reader_options { _infer_date_indexes = std::move(col_ind); } + /** + * @brief Sets per-column types + * + * @param types Column name -> data type map specifying the columns' target data types + */ + void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** * @brief Sets per-column types * @@ -979,6 +990,18 @@ class csv_reader_options_builder { return *this; } + /** + * @brief Sets per-column types. + * + * @param types Column name -> data type map specifying the columns' target data types + * @return this for chaining. + */ + csv_reader_options_builder& dtypes(std::map types) + { + options._dtypes = std::move(types); + return *this; + } + /** * @brief Sets per-column types. 
* diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 70ce0fce1cc..2436e930046 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -280,6 +280,24 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } +std::vector reader::impl::sort_data_types( + std::map const& col_type_map) +{ + std::vector dtypes; + dtypes.reserve(col_type_map.size()); + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { + auto const col_type_it = col_type_map.find(col_names_[col]); + CUDF_EXPECTS(col_type_it != col_type_map.end(), + "Must specify data types for all active columns"); + CUDF_EXPECTS(col_type_it->second.id() != cudf::type_id::EMPTY, "Unsupported data type"); + dtypes.emplace_back(col_type_it->second); + } + } + return dtypes; +} + table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) { auto const data_row_offsets = select_data_and_row_offsets(stream); @@ -385,6 +403,9 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) column_types = std::visit(VisitorOverload{ [&](const std::vector& data_types) { return data_types; }, + [&](const std::map& data_types) { + return sort_data_types(data_types); + }, [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, opts_.get_dtypes()); } diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 29c6b48bc8a..ef1b2a52f71 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -181,6 +181,14 @@ class reader::impl { device_span row_offsets, rmm::cuda_stream_view stream); + /** + * @brief Sorts the columns' data types from the map of dtypes. 
+ * + * @param col_type_map Column name -> data type map specifying the columns' target data types + * @return Sorted ist of columns' data types + */ + std::vector sort_data_types(std::map const& col_type_map); + /** * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. * diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 4e1ad57080a..0a29832270f 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -80,7 +80,6 @@ struct CsvReaderTest : public cudf::test::BaseFixture { // Typed test fixture for timestamp type tests template struct CsvReaderNumericTypeTest : public CsvReaderTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } }; // Declare typed test cases @@ -2141,4 +2140,36 @@ TEST_F(CsvReaderTest, DefaultWriteChunkSize) } } +TEST_F(CsvReaderTest, DtypesMap) +{ + std::string csv_in{"12,9\n34,8\n56,7"}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes(std::map{{"B", cudf::data_type{cudf::type_id::INT16}}, + {"A", cudf::data_type{cudf::type_id::INT32}}}) + .header(-1); + auto result = cudf_io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + assert(result_table->num_columns() == 2); + assert(result_table.column(0).type() == cudf::data_type{cudf::type_id::INT32}); + assert(result_table.column(1).type() == cudf::data_type{cudf::type_id::INT16}); + expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); + expect_column_data_equal(std::vector{9, 8, 7}, result_table.column(1)); +} + +TEST_F(CsvReaderTest, DtypesMapInvalid) +{ + std::string csv_in{""}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes(std::map{{"C", cudf::data_type{cudf::type_id::INT16}}}); + + EXPECT_THROW(cudf_io::read_csv(in_opts), 
cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() From fbf05ccadff6e0ea44f5d0c505ec9d40f2f623b8 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 22 Jul 2021 01:30:50 -0700 Subject: [PATCH 02/23] using --- cpp/tests/io/csv_test.cpp | 103 ++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 0a29832270f..1f783754cfb 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -50,6 +50,10 @@ namespace cudf_io = cudf::io; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + template using column_wrapper = typename std::conditional::value, @@ -93,7 +97,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { { cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end()); auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id(), scale}); + cudf::strings_column_view(strings), data_type{type_to_id(), scale}); std::string buffer = std::accumulate(reference_strings.begin(), reference_strings.end(), @@ -104,7 +108,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({cudf::data_type{cudf::type_to_id(), scale}}) + .dtypes({data_type{type_to_id(), scale}}) .header(-1); const auto result = cudf_io::read_csv(in_opts); @@ -390,7 +394,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) using DecimalType = TypeParam; auto input_column = cudf::strings::to_fixed_point( cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + data_type{type_to_id(), numeric::scale_type{-2}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -436,7 +440,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) using DecimalType = 
TypeParam; auto input_column = cudf::strings::to_fixed_point( cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + data_type{type_to_id(), numeric::scale_type{3}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -583,10 +587,10 @@ TEST_F(CsvReaderTest, Booleans) // Booleans are the same (integer) data type, but valued at 0 or 1 const auto view = result.tbl->view(); EXPECT_EQ(4, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::INT32, view.column(1).type().id()); - ASSERT_EQ(cudf::type_id::INT16, view.column(2).type().id()); - ASSERT_EQ(cudf::type_id::BOOL8, view.column(3).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(1).type().id()); + ASSERT_EQ(type_id::INT16, view.column(2).type().id()); + ASSERT_EQ(type_id::BOOL8, view.column(3).type().id()); expect_column_data_equal(std::vector{1, 0, 0, 0, 1}, view.column(0)); expect_column_data_equal(std::vector{0, 1, 1, 0, 1}, view.column(2)); @@ -613,7 +617,7 @@ TEST_F(CsvReaderTest, Dates) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -645,12 +649,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds) .dtypes(std::vector{"date"}) .dayfirst(true) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); 
using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_s{983750400s}, @@ -682,12 +686,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds) .dtypes(std::vector{"date"}) .dayfirst(true) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -719,12 +723,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds) .dtypes(std::vector{"date"}) .dayfirst(true) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_us{983750400000000us}, @@ -756,12 +760,12 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds) .dtypes(std::vector{"date"}) .dayfirst(true) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace 
cuda::std::chrono_literals; expect_column_data_equal( @@ -796,12 +800,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds) .names({"A"}) .dtypes(std::vector{"datetime64[s]"}) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -825,12 +829,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds) .names({"A"}) .dtypes(std::vector{"datetime64[ms]"}) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -854,12 +858,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds) .names({"A"}) .dtypes(std::vector{"datetime64[us]"}) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -883,12 +887,12 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds) .names({"A"}) .dtypes(std::vector{"datetime64[ns]"}) .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -913,7 +917,7 @@ TEST_F(CsvReaderTest, FloatingPoint) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto ref_vals = std::vector{5.6, 56.79, 12000000000, 0.7, 3.000, 12.34, 0.31, -73.98007199999998}; @@ -945,8 +949,8 @@ TEST_F(CsvReaderTest, Strings) const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, @@ -975,8 +979,8 @@ TEST_F(CsvReaderTest, StringsQuotes) const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc,\ndef, ghi", "jkl, `mno`, pqr", "stu 
`vwx` yz"}, view.column(1)); @@ -1005,8 +1009,8 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, @@ -1032,7 +1036,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{5, 6}, view.column(0)); } @@ -1056,7 +1060,7 @@ TEST_F(CsvReaderTest, ByteRange) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{4000, 5000, 6000}, view.column(0)); } @@ -1074,7 +1078,7 @@ TEST_F(CsvReaderTest, ByteRangeStrings) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::STRING, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(0).type().id()); expect_column_data_equal(std::vector{"c"}, view.column(0)); } @@ -1097,7 +1101,7 @@ TEST_F(CsvReaderTest, BlanksAndComments) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{1, 3, 4, 5, 8, 9}, view.column(0)); } @@ -1170,7 +1174,7 @@ TEST_F(CsvReaderTest, ArrowFileSource) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); + 
ASSERT_EQ(type_id::INT8, view.column(0).type().id()); expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } @@ -1192,7 +1196,7 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto col_data = cudf::test::to_host(view.column(0)); // col_data.first contains the column data @@ -1211,7 +1215,7 @@ TEST_F(CsvReaderTest, StringInference) const auto result = cudf_io::read_csv(in_opts); EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).type().id(), type_id::STRING); } TEST_F(CsvReaderTest, TypeInferenceThousands) @@ -1225,9 +1229,9 @@ TEST_F(CsvReaderTest, TypeInferenceThousands) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto tsnd_sep_col = std::vector{1400L, 123456L}; auto int_col = std::vector{123L, 123456L}; @@ -1253,9 +1257,9 @@ TEST_F(CsvReaderTest, TypeInferenceWithDecimal) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::STRING); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::STRING); + 
EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto int_col = std::vector{1400L, 123456L}; auto str_col = std::vector{"1.23", "123.456"}; @@ -2147,15 +2151,14 @@ TEST_F(CsvReaderTest, DtypesMap) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) .names({"A", "B"}) - .dtypes(std::map{{"B", cudf::data_type{cudf::type_id::INT16}}, - {"A", cudf::data_type{cudf::type_id::INT32}}}) + .dtypes({{"B", data_type{type_to_id()}}, {"A", data_type{type_to_id()}}}) .header(-1); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); assert(result_table->num_columns() == 2); - assert(result_table.column(0).type() == cudf::data_type{cudf::type_id::INT32}); - assert(result_table.column(1).type() == cudf::data_type{cudf::type_id::INT16}); + assert(result_table.column(0).type() == data_type{type_id::INT32}); + assert(result_table.column(1).type() == data_type{type_id::INT16}); expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); expect_column_data_equal(std::vector{9, 8, 7}, result_table.column(1)); } @@ -2167,7 +2170,7 @@ TEST_F(CsvReaderTest, DtypesMapInvalid) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) .names({"A", "B"}) - .dtypes(std::map{{"C", cudf::data_type{cudf::type_id::INT16}}}); + .dtypes({{"A", data_type{type_to_id()}}}); EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error); } From 523e3ad7fbefaf5651aab900285203e7cfd3cfa5 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 22 Jul 2021 13:34:11 -0700 Subject: [PATCH 03/23] dtypes + selected columns fix --- cpp/src/io/csv/reader_impl.cu | 45 +++++++++++++------ cpp/src/io/csv/reader_impl.hpp | 12 ++++- cpp/tests/io/csv_test.cpp | 81 ++++++++++++++++------------------ 3 files changed, 79 insertions(+), 59 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu 
index 2436e930046..02196d17fc1 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -280,22 +280,39 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } -std::vector reader::impl::sort_data_types( +std::vector reader::impl::select_data_types( std::map const& col_type_map) { - std::vector dtypes; - dtypes.reserve(col_type_map.size()); + std::vector selected_dtypes; for (int col = 0; col < num_actual_cols_; col++) { if (column_flags_[col] & column_parse::enabled) { auto const col_type_it = col_type_map.find(col_names_[col]); CUDF_EXPECTS(col_type_it != col_type_map.end(), "Must specify data types for all active columns"); - CUDF_EXPECTS(col_type_it->second.id() != cudf::type_id::EMPTY, "Unsupported data type"); - dtypes.emplace_back(col_type_it->second); + selected_dtypes.emplace_back(col_type_it->second); } } - return dtypes; + return selected_dtypes; +} + +std::vector reader::impl::select_data_types(std::vector const& dtypes) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_cols_, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, + "Must specify data types for all columns"); + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } + } + } + return selected_dtypes; } table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) @@ -400,14 +417,14 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) if (has_to_infer_column_types) { column_types = infer_column_types(data, row_offsets, stream); } else { - column_types = - std::visit(VisitorOverload{ - [&](const std::vector& data_types) { return 
data_types; }, - [&](const std::map& data_types) { - return sort_data_types(data_types); - }, - [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, - opts_.get_dtypes()); + column_types = std::visit( + VisitorOverload{ + [&](const std::vector& data_types) { return select_data_types(data_types); }, + [&](const std::map& data_types) { + return select_data_types(data_types); + }, + [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, + opts_.get_dtypes()); } out_columns.reserve(column_types.size()); diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index ef1b2a52f71..222bb2c5cb3 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -185,9 +185,17 @@ class reader::impl { * @brief Sorts the columns' data types from the map of dtypes. * * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted ist of columns' data types + * @return Sorted ist of selected columns' data types */ - std::vector sort_data_types(std::map const& col_type_map); + std::vector select_data_types(std::map const& col_type_map); + + /** + * @brief Sorts the columns' data types from the map of dtypes. + * + * @param col_type_map Vector of deta types specifying the columns' target data types + * @return Sorted ist of selected columns' data types + */ + std::vector select_data_types(std::vector const& dtypes); /** * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. 
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 1f783754cfb..27ba1336774 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -54,6 +54,12 @@ using cudf::data_type; using cudf::type_id; using cudf::type_to_id; +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional::value, @@ -96,8 +102,8 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { void run_tests(const std::vector& reference_strings, numeric::scale_type scale) { cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end()); - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), data_type{type_to_id(), scale}); + auto input_column = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), scale}); std::string buffer = std::accumulate(reference_strings.begin(), reference_strings.end(), @@ -392,9 +398,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - data_type{type_to_id(), numeric::scale_type{-2}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{-2}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -438,9 +444,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - data_type{type_to_id(), numeric::scale_type{3}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{3}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ 
-482,11 +488,10 @@ TEST_F(CsvReaderTest, MultiColumn) { std::ostringstream line; for (int i = 0; i < num_rows; ++i) { - line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int16_values[i] - << "," << int32_values[i] << "," << int32_values[i] << "," << int64_values[i] << "," - << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," << uint16_values[i] - << "," << uint32_values[i] << "," << uint64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "\n"; + line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int32_values[i] + << "," << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," + << uint16_values[i] << "," << uint32_values[i] << "," << uint64_values[i] << "," + << float32_values[i] << "," << float64_values[i] << "\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -495,39 +500,29 @@ TEST_F(CsvReaderTest, MultiColumn) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); expect_column_data_equal(int8_values, view.column(0)); expect_column_data_equal(int16_values, view.column(1)); - expect_column_data_equal(int16_values, view.column(2)); - expect_column_data_equal(int32_values, view.column(3)); - expect_column_data_equal(int32_values, view.column(4)); - expect_column_data_equal(int64_values, view.column(5)); - expect_column_data_equal(int64_values, view.column(6)); - expect_column_data_equal(uint8_values, view.column(7)); - 
expect_column_data_equal(uint16_values, view.column(8)); - expect_column_data_equal(uint32_values, view.column(9)); - expect_column_data_equal(uint64_values, view.column(10)); - expect_column_data_equal(float32_values, view.column(11)); - expect_column_data_equal(float32_values, view.column(12)); - expect_column_data_equal(float64_values, view.column(13)); - expect_column_data_equal(float64_values, view.column(14)); + expect_column_data_equal(int32_values, view.column(2)); + expect_column_data_equal(int64_values, view.column(3)); + expect_column_data_equal(uint8_values, view.column(4)); + expect_column_data_equal(uint16_values, view.column(5)); + expect_column_data_equal(uint32_values, view.column(6)); + expect_column_data_equal(uint64_values, view.column(7)); + expect_column_data_equal(float32_values, view.column(8)); + expect_column_data_equal(float64_values, view.column(9)); } TEST_F(CsvReaderTest, RepeatColumn) @@ -552,7 +547,7 @@ TEST_F(CsvReaderTest, RepeatColumn) // repeats column in indexes and names, misses 1 column. 
cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"int16", "int64", "uint64", "float"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .names({"A", "B", "C", "D"}) .use_cols_indexes({1, 0, 0}) .use_cols_names({"D", "B", "B"}) @@ -578,7 +573,7 @@ TEST_F(CsvReaderTest, Booleans) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A", "B", "C", "D"}) - .dtypes(std::vector{"int32", "int32", "short", "bool"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .true_values({"yes", "Yes", "YES", "foo", "FOO"}) .false_values({"no", "No", "NO", "Bar", "bar"}) .header(-1); From 1d5cf0090bef3f98aa776665703bec3b14770b9c Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 22 Jul 2021 14:37:29 -0700 Subject: [PATCH 04/23] convert tests to new API --- cpp/tests/io/csv_test.cpp | 128 ++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 73 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 27ba1336774..3af1e5d5ddb 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -605,7 +605,7 @@ TEST_F(CsvReaderTest, Dates) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -641,10 +641,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -678,10 +677,9 @@ 
TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -715,10 +713,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -752,10 +749,9 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -793,9 +789,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[s]"}) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_SECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -822,9 +817,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = 
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ms]"}) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_MILLISECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -851,9 +845,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[us]"}) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_MICROSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -880,9 +873,8 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ns]"}) - .header(-1) - .timestamp_type(data_type{type_id::TIMESTAMP_NANOSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -905,7 +897,7 @@ TEST_F(CsvReaderTest, FloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .lineterminator(';') .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -938,7 +930,7 @@ TEST_F(CsvReaderTest, Strings) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -968,7 +960,7 @@ TEST_F(CsvReaderTest, StringsQuotes) 
cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quotechar('`'); auto result = cudf_io::read_csv(in_opts); @@ -997,7 +989,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE) .doublequote(false); auto result = cudf_io::read_csv(in_opts); @@ -1023,7 +1015,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(1) .skiprows(2) .nrows(2); @@ -1047,7 +1039,7 @@ TEST_F(CsvReaderTest, ByteRange) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(11) .byte_range_size(15); @@ -1066,7 +1058,7 @@ TEST_F(CsvReaderTest, ByteRangeStrings) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{input.c_str(), input.size()}) .names({"A"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(4); auto result = cudf_io::read_csv(in_opts); @@ -1089,7 +1081,7 @@ TEST_F(CsvReaderTest, BlanksAndComments) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .comment('#'); auto result = cudf_io::read_csv(in_opts); @@ -1164,7 +1156,7 @@ TEST_F(CsvReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::csv_reader_options in_opts = 
cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes(std::vector{"int8"}); + .dtypes({dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); @@ -1185,7 +1177,7 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .header(-1); const auto result = cudf_io::read_csv(in_opts); @@ -1294,7 +1286,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_filter(false) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1308,7 +1300,7 @@ TEST_F(CsvReaderTest, nullHandling) { cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1325,7 +1317,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1343,7 +1335,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .keep_default_na(false) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1553,18 +1545,13 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) std::vector input_columns{int8_column, int16_column, - int16_column, - int32_column, int32_column, int64_column, - int64_column, uint8_column, uint16_column, 
uint32_column, uint64_column, float32_column, - float32_column, - float64_column, float64_column}; cudf::table_view input_table{input_columns}; @@ -1575,26 +1562,21 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); - std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8}; const auto input_sliced_view = input_table.select(non_float64s); const auto result_sliced_view = result_table.select(non_float64s); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_sliced_view, result_sliced_view); @@ -1604,9 +1586,6 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) auto float64_col_idx = non_float64s.size(); check_float_column( input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); - ++float64_col_idx; - check_float_column( - input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); } TEST_F(CsvReaderTest, DatesWithWriter) @@ -1631,7 +1610,7 @@ TEST_F(CsvReaderTest, DatesWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1762,7 +1741,7 @@ TEST_F(CsvReaderTest, FloatingPointWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - 
.dtypes(std::vector{"float64"}) + .dtypes({dtype()}) .header(-1); // in_opts.lineterminator = ';'; auto result = cudf_io::read_csv(in_opts); @@ -1788,7 +1767,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1813,7 +1792,7 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1837,7 +1816,7 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}); + .dtypes(std::vector{dtype(), dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); @@ -1892,7 +1871,7 @@ TEST_F(CsvReaderTest, UserImplementedSource) TestSource source{csv_data.str()}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&source}) - .dtypes(std::vector{"int8", "int16", "int32"}) + .dtypes({dtype(), dtype(), dtype()}) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1937,8 +1916,11 @@ TEST_F(CsvReaderTest, DurationsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{ - "timedelta[D]", "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]"}); + .dtypes({data_type{type_id::DURATION_DAYS}, + data_type{type_id::DURATION_SECONDS}, + 
data_type{type_id::DURATION_MILLISECONDS}, + data_type{type_id::DURATION_MICROSECONDS}, + data_type{type_id::DURATION_NANOSECONDS}}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); @@ -2146,7 +2128,7 @@ TEST_F(CsvReaderTest, DtypesMap) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) .names({"A", "B"}) - .dtypes({{"B", data_type{type_to_id()}}, {"A", data_type{type_to_id()}}}) + .dtypes({{"B", dtype()}, {"A", dtype()}}) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -2165,7 +2147,7 @@ TEST_F(CsvReaderTest, DtypesMapInvalid) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) .names({"A", "B"}) - .dtypes({{"A", data_type{type_to_id()}}}); + .dtypes({{"A", dtype()}}); EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error); } From 64a2a07b6418902d237c658247ceca009d92daf6 Mon Sep 17 00:00:00 2001 From: vuule Date: Thu, 22 Jul 2021 16:19:07 -0700 Subject: [PATCH 05/23] infer_date -> parse_date for consistency; add parse_hex to libcudf --- cpp/include/cudf/io/csv.hpp | 80 +++++++++++++++++++++++----- cpp/src/io/csv/reader_impl.cu | 22 ++++++-- python/cudf/cudf/_lib/cpp/io/csv.pxd | 12 +++-- python/cudf/cudf/_lib/csv.pyx | 12 ++--- 4 files changed, 98 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index e70353dff9f..8ebb3a72a85 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -104,9 +104,13 @@ class csv_reader_options { // Whether a quote inside a value is double-quoted bool _doublequote = true; // Names of columns to read as datetime - std::vector _infer_date_names; + std::vector _parse_dates_names; // Indexes of columns to read as datetime - std::vector _infer_date_indexes; + std::vector _parse_dates_indexes; + // Names of columns to parse as hexadecimal + std::vector 
_parse_hex_names; + // Indexes of columns to parse as hexadecimal + std::vector _parse_hex_indexes; // Conversion settings @@ -281,12 +285,22 @@ class csv_reader_options { /** * @brief Returns names of columns to read as datetime. */ - std::vector const& get_infer_date_names() const { return _infer_date_names; } + std::vector const& get_parse_dates_names() const { return _parse_dates_names; } /** * @brief Returns indexes of columns to read as datetime. */ - std::vector const& get_infer_date_indexes() const { return _infer_date_indexes; } + std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } + + /** + * @brief Returns names of columns to parse as hexadecimal. + */ + std::vector const& get_parse_hex_names() const { return _parse_hex_names; } + + /** + * @brief Returns indexes of columns to parse as hexadecimal. + */ + std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. @@ -551,9 +565,9 @@ class csv_reader_options { * * @param col_names Vector of column names to infer as datetime. */ - void set_infer_date_names(std::vector col_names) + void set_parse_dates(std::vector col_names) { - _infer_date_names = std::move(col_names); + _parse_dates_names = std::move(col_names); } /** @@ -561,11 +575,25 @@ class csv_reader_options { * * @param col_names Vector of column indices to infer as datetime.
*/ - void set_infer_date_indexes(std::vector col_ind) + void set_parse_dates(std::vector col_ind) { _parse_dates_indexes = std::move(col_ind); } + + /** + * @brief Sets names of columns to parse as hexadecimal + * + * @param col_names Vector of column names to parse as hexadecimal + */ + void set_parse_hex(std::vector col_names) { - _infer_date_indexes = std::move(col_ind); + _parse_hex_names = std::move(col_names); } + /** + * @brief Sets indexes of columns to parse as hexadecimal + * + * @param col_ind Vector of column indices to parse as hexadecimal + */ + void set_parse_hex(std::vector col_ind) { _parse_hex_indexes = std::move(col_ind); } + /** * @brief Sets per-column types * @@ -969,24 +997,48 @@ class csv_reader_options_builder { /** * @brief Sets names of columns to read as datetime. * - * @param col_names Vector of column names to infer as datetime. + * @param col_names Vector of column names to read as datetime. * @return this for chaining. */ - csv_reader_options_builder& infer_date_names(std::vector col_names) + csv_reader_options_builder& parse_dates(std::vector col_names) { - options._infer_date_names = std::move(col_names); + options._parse_dates_names = std::move(col_names); return *this; } /** * @brief Sets indexes of columns to read as datetime. * - * @param col_names Vector of column indices to infer as datetime. + * @param col_ind Vector of column indices to read as datetime. + * @return this for chaining. + */ + csv_reader_options_builder& parse_dates(std::vector col_ind) + { + options._parse_dates_indexes = std::move(col_ind); + return *this; + } + + /** + * @brief Sets names of columns to parse as hexadecimal. + * + * @param col_names Vector of column names to parse as hexadecimal + * @return this for chaining. + */ + csv_reader_options_builder& parse_hex(std::vector col_names) + { + options._parse_hex_names = std::move(col_names); + return *this; + } + + /** + * @brief Sets indexes of columns to parse as hexadecimal.
+ * + * @param col_ind Vector of column indices to parse as hexadecimal * @return this for chaining. */ - csv_reader_options_builder& infer_date_indexes(std::vector col_ind) + csv_reader_options_builder& parse_hex(std::vector col_ind) { - options._infer_date_indexes = std::move(col_ind); + options._parse_hex_indexes = std::move(col_ind); return *this; } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 02196d17fc1..656dde935df 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -390,13 +390,13 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } - // User can specify which columns should be inferred as datetime - if (!opts_.get_infer_date_indexes().empty() || !opts_.get_infer_date_names().empty()) { - for (const auto index : opts_.get_infer_date_indexes()) { + // User can specify which columns should be read as datetime + if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { + for (const auto index : opts_.get_parse_dates_indexes()) { column_flags_[index] |= column_parse::as_datetime; } - for (const auto& name : opts_.get_infer_date_names()) { + for (const auto& name : opts_.get_parse_dates_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; @@ -404,6 +404,20 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } + // User can specify which columns should be parsed as hexadecimal + if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { + for (const auto index : opts_.get_parse_hex_indexes()) { + column_flags_[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : opts_.get_parse_hex_names()) { + auto it = std::find(col_names_.begin(), col_names_.end(), name); + if (it != col_names_.end()) { + column_flags_[it - col_names_.begin()] |=
column_parse::as_hexadecimal; + } + } + } + // Return empty table rather than exception if nothing to load if (num_active_cols_ == 0) { return {std::make_unique(), {}}; } diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index c5e235b5697..2d6bdf28f7f 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -49,8 +49,10 @@ cdef extern from "cudf/io/csv.hpp" \ cudf_io_types.quote_style get_quoting() except+ char get_quotechar() except+ bool is_enabled_doublequote() except+ - vector[string] get_infer_date_names() except+ - vector[int] get_infer_date_indexes() except+ + vector[string] get_parse_dates_names() except+ + vector[int] get_parse_dates_indexes() except+ + vector[string] get_parse_hex_names() except+ + vector[int] get_parse_hex_indexes() except+ # Conversion settings vector[string] get_dtype() except+ @@ -92,8 +94,10 @@ cdef extern from "cudf/io/csv.hpp" \ void set_quoting(cudf_io_types.quote_style style) except+ void set_quotechar(char val) except+ void set_doublequote(bool val) except+ - void set_infer_date_names(vector[string]) except+ - void set_infer_date_indexes(vector[int]) except+ + void set_parse_dates(vector[string]) except+ + void set_parse_dates(vector[int]) except+ + void set_parse_hex(vector[string]) except+ + void set_parse_hex(vector[int]) except+ # Conversion settings void set_dtypes(vector[string] types) except+ diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 773e81a0a7b..55033c0a0ba 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -116,8 +116,8 @@ cdef csv_reader_options make_csv_reader_options( cdef vector[string] c_use_cols_names cdef size_type c_nrows = nrows if nrows is not None else -1 cdef quote_style c_quoting - cdef vector[string] c_infer_date_names - cdef vector[int] c_infer_date_indexes + cdef vector[string] c_parse_dates_names + cdef vector[int] c_parse_dates_indexes cdef vector[string] 
c_dtypes cdef vector[string] c_true_values cdef vector[string] c_false_values @@ -220,14 +220,14 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: non-lists are unsupported") for col in parse_dates: if isinstance(col, str): - c_infer_date_names.push_back(str(col).encode()) + c_parse_dates_names.push_back(str(col).encode()) elif isinstance(col, int): - c_infer_date_indexes.push_back(col) + c_parse_dates_indexes.push_back(col) else: raise NotImplementedError( "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_infer_date_names(c_infer_date_names) - csv_reader_options_c.set_infer_date_indexes(c_infer_date_indexes) + csv_reader_options_c.set_parse_dates(c_parse_dates_names) + csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) if dtype is not None: if isinstance(dtype, abc.Mapping): From ee585c1dc295a87749163b95b531121ab46c2af8 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 23 Jul 2021 11:39:02 -0700 Subject: [PATCH 06/23] use new hex API in tests --- cpp/tests/io/csv_test.cpp | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 3af1e5d5ddb..35a5c531b3a 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1467,16 +1467,35 @@ TEST_F(CsvReaderTest, HexTest) std::ofstream outfile(filepath, std::ofstream::out); outfile << "0x0\n-0x1000\n0xfedcba\n0xABCDEF\n0xaBcDeF\n9512c20b\n"; } + // specify hex columns by name + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex({"A"}); + auto result = cudf_io::read_csv(in_opts); - cudf_io::csv_reader_options in_opts = - cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .names({"A"}) - .dtypes(std::vector{"hex"}) - .header(-1); - auto result = cudf_io::read_csv(in_opts); + expect_column_data_equal( + std::vector{0, 
-4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } - expect_column_data_equal(std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, - result.tbl->view().column(0)); + // specify hex columns by index + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex(std::vector{0}); + auto result = cudf_io::read_csv(in_opts); + + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } } TYPED_TEST(CsvReaderNumericTypeTest, SingleColumnWithWriter) From 0e24ae8fbd2d3d5f44a39a0dcc1dc8a209cf71f7 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 23 Jul 2021 11:44:16 -0700 Subject: [PATCH 07/23] re-enable json tests that were accidentally disabled --- cpp/tests/io/json_test.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 426a39ce9d3..282e484327e 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -145,7 +145,6 @@ void check_float_column(cudf::column_view const& col, struct JsonReaderTest : public cudf::test::BaseFixture { }; -/* TEST_F(JsonReaderTest, BasicJsonLines) { std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; @@ -615,10 +614,10 @@ TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), cudf::test::strings_column_wrapper({"aaa", "bbb"})); } -*/ -/* + // currently, the json reader is strict about having non-empty input. -TEST_F(JsonReaderTest, EmptyFile) { +TEST_F(JsonReaderTest, EmptyFile) +{ auto filepath = temp_env->get_temp_dir() + "EmptyFile.csv"; { std::ofstream outfile{filepath, std::ofstream::out}; @@ -634,7 +633,8 @@ TEST_F(JsonReaderTest, EmptyFile) { } // currently, the json reader is strict about having non-empty input. 
-TEST_F(JsonReaderTest, NoDataFile) { +TEST_F(JsonReaderTest, NoDataFile) +{ auto filepath = temp_env->get_temp_dir() + "NoDataFile.csv"; { std::ofstream outfile{filepath, std::ofstream::out}; @@ -648,8 +648,7 @@ TEST_F(JsonReaderTest, NoDataFile) { const auto view = result.tbl->view(); EXPECT_EQ(0, view.num_columns()); } -*/ -/* + TEST_F(JsonReaderTest, ArrowFileSource) { const std::string fname = temp_env->get_temp_dir() + "ArrowFileSource.csv"; @@ -698,7 +697,8 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) const auto col_data = cudf::test::to_host(result.tbl->view().column(0)); // col_data.first contains the column data - for (const auto& elem : col_data.first) ASSERT_TRUE(std::isnan(elem)); + for (const auto& elem : col_data.first) + ASSERT_TRUE(std::isnan(elem)); // col_data.second contains the bitmasks ASSERT_EQ(0u, col_data.second[0]); } @@ -861,7 +861,7 @@ TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_less_int64_min_append, view.column(8)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_mixed_range_append, view.column(9)); } -*/ + TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) { const std::string file1 = temp_env->get_temp_dir() + "JsonLinesFileTest1.json"; From 584f1805479007a1d2ed4065dbe55d5de1cc8568 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 23 Jul 2021 22:15:24 -0700 Subject: [PATCH 08/23] small refactor to prepare for JSON API expansion --- cpp/src/io/json/reader_impl.cu | 27 ++++++++++++++++----------- cpp/src/io/json/reader_impl.hpp | 3 +++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index b4395d6c965..2712e53a43e 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -464,17 +464,14 @@ void reader::impl::set_column_names(device_span rec_starts, } } -void reader::impl::set_data_types(device_span rec_starts, - rmm::cuda_stream_view stream) -{ - auto const dtype = options_.get_dtypes(); - if 
(!dtype.empty()) { - CUDF_EXPECTS(dtype.size() == metadata_.column_names.size(), +void reader::impl::parse_data_types(std::vector const& types_as_strings){ + + CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), "Need to specify the type of each column.\n"); // Assume that the dtype is in dictionary format only if all elements contain a colon const bool is_dict = - std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) { + std::all_of(std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); }); @@ -486,8 +483,8 @@ void reader::impl::set_data_types(device_span rec_starts, if (is_dict) { std::map col_type_map; std::transform( - std::cbegin(dtype), - std::cend(dtype), + std::cbegin(types_as_strings), + std::cend(types_as_strings), std::inserter(col_type_map, col_type_map.end()), [&](auto const& ts) { auto const [col_name, type_str] = split_on_colon(ts); @@ -500,11 +497,19 @@ void reader::impl::set_data_types(device_span rec_starts, std::back_inserter(dtypes_), [&](auto const& column_name) { return col_type_map[column_name]; }); } else { - std::transform(std::cbegin(dtype), - std::cend(dtype), + std::transform(std::cbegin(types_as_strings), + std::cend(types_as_strings), std::back_inserter(dtypes_), [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); } +} + +void reader::impl::set_data_types(device_span rec_starts, + rmm::cuda_stream_view stream) +{ + auto const& dtype = options_.get_dtypes(); + if (!dtype.empty()) { + parse_data_types(dtype); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index bbda7e9ba74..0cf014f379b 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -158,6 +158,9 @@ class 
reader::impl { */ void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); + +void parse_data_types(std::vector const& types_as_strings); + /** * @brief Set the data type array data member * From 6b3778674811e3e8e8f7b0fc62839c34aca2c8e8 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 23 Jul 2021 22:21:58 -0700 Subject: [PATCH 09/23] disable the xfail tests --- cpp/tests/io/json_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 282e484327e..b96cc9a041e 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -615,6 +615,7 @@ TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder) cudf::test::strings_column_wrapper({"aaa", "bbb"})); } +/* // currently, the json reader is strict about having non-empty input. TEST_F(JsonReaderTest, EmptyFile) { @@ -648,6 +649,7 @@ TEST_F(JsonReaderTest, NoDataFile) const auto view = result.tbl->view(); EXPECT_EQ(0, view.num_columns()); } +*/ TEST_F(JsonReaderTest, ArrowFileSource) { From 5120d7b0e77f8ae833d9de6333655ef5b465373c Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 23 Jul 2021 23:43:42 -0700 Subject: [PATCH 10/23] extend JSON API (no tests) --- cpp/include/cudf/io/json.hpp | 59 +++++++++++++++-- cpp/src/io/csv/reader_impl.hpp | 2 +- cpp/src/io/json/reader_impl.cu | 108 ++++++++++++++++++++------------ cpp/src/io/json/reader_impl.hpp | 3 +- cpp/tests/io/json_test.cpp | 5 +- 5 files changed, 126 insertions(+), 51 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2f4d0936d8b..f5d80f6f6c6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -23,7 +23,9 @@ #include +#include #include +#include #include namespace cudf { @@ -66,7 +68,8 @@ class json_reader_options { source_info _source; // Data types of the column; empty to infer dtypes - std::vector _dtypes; + std::variant, std::vector, std::map> + _dtypes; // Specify the compression format of the source or 
infer from file extension compression_type _compression = compression_type::AUTO; @@ -114,7 +117,13 @@ class json_reader_options { /** * @brief Returns data types of the columns. */ - std::vector const& get_dtypes() const { return _dtypes; } + std::variant, + std::vector, + std::map> const& + get_dtypes() const + { + return _dtypes; + } /** * @brief Returns compression format of the source. @@ -146,14 +155,28 @@ class json_reader_options { * * @param types Vector dtypes in string format. */ - void dtypes(std::vector types) { _dtypes = std::move(types); } + void set_dtypes(std::vector types) { _dtypes = std::move(types); } + + /** + * @brief Set data types for columns to be read. + * + * @param types Vector of dtypes. + */ + + void set_dtypes(std::vector types) { _dtypes = std::move(types); } + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> dtype map. + */ + void set_dtypes(std::map types) { _dtypes = std::move(types); } /** * @brief Set the compression type. * * @param comp_type The compression type used. */ - void compression(compression_type comp_type) { _compression = comp_type; } + void set_compression(compression_type comp_type) { _compression = comp_type; } /** * @brief Set number of bytes to skip from source start. @@ -205,8 +228,8 @@ class json_reader_options_builder { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. - * @return this for chaining. + * @param types Vector of dtypes in string format + * @return this for chaining */ json_reader_options_builder& dtypes(std::vector types) { @@ -214,6 +237,30 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read.
+ * + * @param types Vector of dtypes + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::vector types) + { + options._dtypes = std::move(types); + return *this; + } + + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> dtype map. + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::map types) + { + options._dtypes = std::move(types); + return *this; + } + /** * @brief Set the compression type. * diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 222bb2c5cb3..6bd7b66874b 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -189,7 +189,7 @@ class reader::impl { */ std::vector select_data_types(std::map const& col_type_map); - /** + /** * @brief Sorts the columns' data types from the map of dtypes. * * @param col_type_map Vector of deta types specifying the columns' target data types diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 2712e53a43e..53318cf43ac 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -50,6 +50,15 @@ namespace json { using namespace cudf::io; namespace { +/** + * @brief Helper class to support inline-overloading for all of a variant's alternative types + */ +template +struct VisitorOverload : Ts... { + using Ts::operator()...; +}; +template +VisitorOverload(Ts...) 
-> VisitorOverload; /** * @brief Estimates the maximum expected length or a row, based on the number @@ -236,7 +245,9 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) { size_t map_range_size = 0; if (range_size != 0) { - map_range_size = range_size + calculate_max_row_size(options_.get_dtypes().size()); + auto const dtype_option_size = + std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); + map_range_size = range_size + calculate_max_row_size(dtype_option_size); } // Support delayed opening of the file if using memory mapping datasource @@ -464,52 +475,71 @@ void reader::impl::set_column_names(device_span rec_starts, } } -void reader::impl::parse_data_types(std::vector const& types_as_strings){ - +std::vector reader::impl::parse_data_types( + std::vector const& types_as_strings) +{ CUDF_EXPECTS(types_as_strings.size() == metadata_.column_names.size(), - "Need to specify the type of each column.\n"); - - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = - std::all_of(std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); + "Need to specify the type of each column.\n"); + std::vector dtypes; + // Assume that the dtype is in dictionary format only if all elements contain a colon + const bool is_dict = std::all_of( + std::cbegin(types_as_strings), std::cend(types_as_strings), [](const std::string& s) { + return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); + }); + + auto split_on_colon = [](std::string_view s) { + auto const i = s.find(":"); + return std::pair{s.substr(0, i), s.substr(i + 1)}; + }; + + if (is_dict) { + std::map col_type_map; + std::transform( + std::cbegin(types_as_strings), + std::cend(types_as_strings), + std::inserter(col_type_map, col_type_map.end()), + [&](auto const& ts) { + auto const [col_name, type_str] = 
split_on_colon(ts); + return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; }); - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; - - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(dtypes_), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(types_as_strings), - std::cend(types_as_strings), - std::back_inserter(dtypes_), - [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } + // Using the map here allows O(n log n) complexity + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(dtypes), + [&](auto const& column_name) { return col_type_map[column_name]; }); + } else { + std::transform(std::cbegin(types_as_strings), + std::cend(types_as_strings), + std::back_inserter(dtypes), + [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); + } + return dtypes; } void reader::impl::set_data_types(device_span rec_starts, rmm::cuda_stream_view stream) { - auto const& dtype = options_.get_dtypes(); - if (!dtype.empty()) { - parse_data_types(dtype); + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + if (!has_to_infer_column_types) { + dtypes_ = std::visit( + VisitorOverload{ + [&](const std::vector& dtypes) { return dtypes; }, + [&](const std::map& dtypes) { + 
std::vector sorted_dtypes; + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }, + [&](std::vector const& dtypes) { return parse_data_types(dtypes); }}, + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 0cf014f379b..5cf51369cdf 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -158,8 +158,7 @@ class reader::impl { */ void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); - -void parse_data_types(std::vector const& types_as_strings); + std::vector parse_data_types(std::vector const& types_as_strings); /** * @brief Set the data type array data member diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index b96cc9a041e..f8862d9e89c 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -151,7 +151,7 @@ TEST_F(JsonReaderTest, BasicJsonLines) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"int", "float64"}) + .dtypes(std::vector{"int", "float64"}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -670,8 +670,7 @@ TEST_F(JsonReaderTest, ArrowFileSource) ; cudf_io::table_with_metadata result = cudf_io::read_json(in_options); - EXPECT_EQ(result.tbl->num_columns(), - static_cast(in_options.get_dtypes().size())); + EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); auto validity = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); From 6e4a888df3846bef6693e0b2e2f35c5a9de75c36 Mon Sep 17 00:00:00 2001 From: vuule Date: Sat, 24 Jul 2021 00:04:45 -0700 Subject: [PATCH 11/23] switch tests to new API --- cpp/tests/io/json_test.cpp | 75 +++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index f8862d9e89c..80d950cc7d8 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -42,6 +42,16 @@ using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional::value, @@ -151,7 +161,7 @@ TEST_F(JsonReaderTest, BasicJsonLines) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes(std::vector{"int", "float64"}) + .dtypes(std::vector{dtype(), dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -182,7 +192,7 @@ TEST_F(JsonReaderTest, FloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -206,7 +216,7 @@ TEST_F(JsonReaderTest, JsonLinesStrings) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"2:str", "0:int", "1:float64"}) + .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -245,9 +255,8 @@ TEST_F(JsonReaderTest, MultiColumn) std::ostringstream line; for (int i = 0; i < 
num_rows; ++i) { line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," - << int16_values[i] << "," << int32_values[i] << "," << int32_values[i] << "," - << int64_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "]\n"; + << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," + << float64_values[i] << "]\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -255,17 +264,12 @@ TEST_F(JsonReaderTest, MultiColumn) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "float", - "float32", - "double", - "float64"}) + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -275,34 +279,21 @@ TEST_F(JsonReaderTest, MultiColumn) EXPECT_EQ(view.column(0).type().id(), cudf::type_id::INT8); EXPECT_EQ(view.column(1).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(4).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(5).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(6).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(7).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(8).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(9).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(view.column(10).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT32); + EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT64); + EXPECT_EQ(view.column(4).type().id(), cudf::type_id::FLOAT32); + EXPECT_EQ(view.column(5).type().id(), 
cudf::type_id::FLOAT64); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(0), int8_wrapper{int8_values.begin(), int8_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(1), int16_wrapper{int16_values.begin(), int16_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(2), - int16_wrapper{int16_values.begin(), int16_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), - int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(4), int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(5), - int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(6), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - check_float_column(view.column(7), float32_values, validity); - check_float_column(view.column(8), float32_values, validity); - check_float_column(view.column(9), float64_values, validity); - check_float_column(view.column(10), float64_values, validity); + check_float_column(view.column(4), float32_values, validity); + check_float_column(view.column(5), float64_values, validity); } TEST_F(JsonReaderTest, Booleans) @@ -315,7 +306,7 @@ TEST_F(JsonReaderTest, Booleans) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"bool"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -342,7 +333,7 @@ TEST_F(JsonReaderTest, Dates) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -379,7 +370,7 @@ TEST_F(JsonReaderTest, Durations) 
cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"timedelta64[ns]"}) + .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -665,7 +656,7 @@ TEST_F(JsonReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes({"int8"}) + .dtypes({dtype()}) .lines(true); ; cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -689,7 +680,7 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); From ef031253376ef74df7a7e29cd79736acfd63700c Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 26 Jul 2021 12:08:38 -0700 Subject: [PATCH 12/23] add new APIs to cython defs --- python/cudf/cudf/_lib/cpp/io/csv.pxd | 5 +++++ python/cudf/cudf/_lib/cpp/io/json.pxd | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index 2d6bdf28f7f..faedc9ec052 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -5,6 +5,7 @@ from libcpp cimport bool from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from libcpp.map cimport map cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view @@ -166,6 +167,10 @@ cdef extern from "cudf/io/csv.hpp" \ # Conversion settings csv_reader_options_builder& dtypes(vector[string] types) except+ + csv_reader_options_builder& dtypes(vector[data_type] types) except+ + 
csv_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ csv_reader_options_builder& true_values(vector[string] vals) except+ csv_reader_options_builder& false_values(vector[string] vals) except+ csv_reader_options_builder& na_values(vector[string] vals) except+ diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 6f20195e87f..d49ea1eeddf 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -5,6 +5,7 @@ from libcpp cimport bool from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector +from libcpp.map cimport map cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view @@ -47,6 +48,12 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dtypes( vector[string] types ) except+ + json_reader_options_builder& dtypes( + vector[data_type] types + ) except+ + json_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ json_reader_options_builder& compression( cudf_io_types.compression_type compression ) except+ From 8ae3e1a3a7bc25812b6b5801b6e6a0699bc00fa6 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 26 Jul 2021 13:26:28 -0700 Subject: [PATCH 13/23] add to last missing place --- python/cudf/cudf/_lib/cpp/io/csv.pxd | 2 ++ python/cudf/cudf/_lib/cpp/io/json.pxd | 2 ++ 2 files changed, 4 insertions(+) diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index faedc9ec052..85074cc4369 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -102,6 +102,8 @@ cdef extern from "cudf/io/csv.hpp" \ # Conversion settings void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_true_values(vector[string] vals) except+ void 
set_false_values(vector[string] vals) except+ void set_na_values(vector[string] vals) except+ diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index d49ea1eeddf..158994c81a5 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -27,6 +27,8 @@ cdef extern from "cudf/io/json.hpp" \ # setter void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_compression( cudf_io_types.compression_type compression ) except+ From 53c4d15bc93d815a831298cd094c869c4aa5b138 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 26 Jul 2021 13:45:46 -0700 Subject: [PATCH 14/23] style fix; missed rename --- python/cudf/cudf/_lib/cpp/io/csv.pxd | 6 +++--- python/cudf/cudf/_lib/cpp/io/json.pxd | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index 85074cc4369..725757121d9 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -2,10 +2,10 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from libcpp.map cimport map cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view @@ -164,8 +164,8 @@ cdef extern from "cudf/io/csv.hpp" \ ) except+ csv_reader_options_builder& quotechar(char val) except+ csv_reader_options_builder& doublequote(bool val) except+ - csv_reader_options_builder& infer_date_names(vector[string]) except+ - csv_reader_options_builder& infer_date_indexes(vector[int]) except+ + csv_reader_options_builder& parse_dates(vector[string]) except+ + csv_reader_options_builder& parse_dates(vector[int]) except+ # Conversion settings csv_reader_options_builder& 
dtypes(vector[string] types) except+ diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 158994c81a5..4a3792f5023 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -2,10 +2,10 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from libcpp.map cimport map cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view From 1b543758d0e815a89d40e0a6e3b6ddfce05ec22c Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 26 Jul 2021 14:16:45 -0700 Subject: [PATCH 15/23] docs fixes --- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/csv/reader_impl.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 656dde935df..32326b9603b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -404,7 +404,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } - // User can specify which columns should be inferred as datetime + // User can specify which columns should be parsed as datetime if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { for (const auto index : opts_.get_parse_hex_indexes()) { column_flags_[index] |= column_parse::as_hexadecimal; diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 6bd7b66874b..e0ee367ad3c 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -182,7 +182,7 @@ class reader::impl { rmm::cuda_stream_view stream); /** - * @brief Sorts the columns' data types from the map of dtypes. + * @brief Selects the columns' data types from the map of dtypes. 
* * @param col_type_map Column name -> data type map specifying the columns' target data types * @return Sorted ist of selected columns' data types @@ -190,9 +190,9 @@ class reader::impl { std::vector select_data_types(std::map const& col_type_map); /** - * @brief Sorts the columns' data types from the map of dtypes. + * @brief Selects the columns' data types from the list of dtypes. * - * @param col_type_map Vector of deta types specifying the columns' target data types + * @param col_type_map Vector of data types specifying the columns' target data types * @return Sorted ist of selected columns' data types */ std::vector select_data_types(std::vector const& dtypes); From 6638a94ab0e66dad9ebd351830556e2012f5216e Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 26 Jul 2021 14:27:22 -0700 Subject: [PATCH 16/23] deprecate APIs --- cpp/include/cudf/io/csv.hpp | 6 ++++-- cpp/include/cudf/io/json.hpp | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 8ebb3a72a85..8b8027b2c64 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -615,7 +615,8 @@ class csv_reader_options { */ [[deprecated( "The string-based interface will be deprecated." - "Use dtypes(std::vector) instead.")]] void + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] void set_dtypes(std::vector types) { _dtypes = std::move(types); @@ -1074,7 +1075,8 @@ class csv_reader_options_builder { */ [[deprecated( "The string-based interface will be deprecated." 
- "Use dtypes(std::vector) instead.")]] csv_reader_options_builder& + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] csv_reader_options_builder& dtypes(std::vector types) { options._dtypes = std::move(types); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index f5d80f6f6c6..d456a5b8682 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -155,7 +155,14 @@ class json_reader_options { * * @param types Vector dtypes in string format. */ - void set_dtypes(std::vector types) { _dtypes = std::move(types); } + [[deprecated( + "The string-based interface will be deprecated." + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] void + set_dtypes(std::vector types) + { + _dtypes = std::move(types); + } /** * @brief Set data types for columns to be read. @@ -231,7 +238,11 @@ class json_reader_options_builder { * @param types Vector of dtypes in string format * @return this for chaining */ - json_reader_options_builder& dtypes(std::vector types) + [[deprecated( + "The string-based interface will be deprecated." + "Use dtypes(std::vector) or " + "dtypes(std::map) instead.")]] json_reader_options_builder& + dtypes(std::vector types) { options._dtypes = std::move(types); return *this; From c654e106af186a87750925cbce5405bfa063f5fc Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 2 Aug 2021 11:53:21 -0700 Subject: [PATCH 17/23] doc fix --- cpp/include/cudf/io/csv.hpp | 8 ++++---- cpp/include/cudf/io/json.hpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 8b8027b2c64..d4a21b2e98c 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -293,12 +293,12 @@ class csv_reader_options { std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } /** - * @brief Returns names of columns to read as datetime. + * @brief Returns names of columns to read as hexadecimal. 
*/ std::vector const& get_parse_hex_names() const { return _parse_hex_names; } /** - * @brief Returns indexes of columns to read as datetime. + * @brief Returns indexes of columns to read as hexadecimal. */ std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } @@ -1010,7 +1010,7 @@ class csv_reader_options_builder { /** * @brief Sets indexes of columns to read as datetime. * - * @param col_names Vector of column indices to read as datetime. + * @param col_ind Vector of column indices to read as datetime * @return this for chaining. */ csv_reader_options_builder& parse_dates(std::vector col_ind) @@ -1034,7 +1034,7 @@ class csv_reader_options_builder { /** * @brief Sets indexes of columns to parse as hexadecimal. * - * @param col_names Vector of column indices to parse as hexadecimal + * @param col_ind Vector of column indices to parse as hexadecimal * @return this for chaining. */ csv_reader_options_builder& parse_hex(std::vector col_ind) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d456a5b8682..7286d641f60 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -167,7 +167,7 @@ class json_reader_options { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. 
+ * @param types Vector of dtypes */ void set_dtypes(std::vector types) { _dtypes = std::move(types); } From 044b6985a3ae61c627688af0ccd199daded62def Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 2 Aug 2021 11:55:21 -0700 Subject: [PATCH 18/23] Apply suggestions from code review Co-authored-by: Ram (Ramakrishna Prabhu) <42624703+rgsl888prabhu@users.noreply.github.com> --- cpp/src/io/csv/reader_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 6bd7b66874b..a89017a74bf 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -185,7 +185,7 @@ class reader::impl { * @brief Sorts the columns' data types from the map of dtypes. * * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted ist of selected columns' data types + * @return Sorted list of selected columns' data types */ std::vector select_data_types(std::map const& col_type_map); @@ -193,7 +193,7 @@ class reader::impl { * @brief Sorts the columns' data types from the map of dtypes. 
* * @param col_type_map Vector of deta types specifying the columns' target data types - * @return Sorted ist of selected columns' data types + * @return Sorted list of selected columns' data types */ std::vector select_data_types(std::vector const& dtypes); From af697987b35ce11f6247a13f41b73a4af9c12287 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 3 Aug 2021 11:53:48 -0700 Subject: [PATCH 19/23] Apply suggestions from code review Co-authored-by: Elias Stehle --- cpp/include/cudf/io/json.hpp | 3 +-- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/csv/reader_impl.hpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7286d641f60..a0ea6b6ed17 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -153,7 +153,7 @@ class json_reader_options { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. + * @param types Vector of dtypes in string format. */ [[deprecated( "The string-based interface will be deprecated." @@ -169,7 +169,6 @@ class json_reader_options { * * @param types Vector of dtypes */ - void set_dtypes(std::vector types) { _dtypes = std::move(types); } /** * @brief Set data types for columns to be read. 
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 32326b9603b..611ad1c81d3 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -404,7 +404,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } - // User can specify which columns should be parsed as datetime + // User can specify which columns should be parsed as hexadecimal if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { for (const auto index : opts_.get_parse_hex_indexes()) { column_flags_[index] |= column_parse::as_hexadecimal; diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 0d2367b1646..36c2bf4f9e7 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -192,7 +192,7 @@ class reader::impl { /** * @brief Selects the columns' data types from the list of dtypes. * - * @param col_type_map Vector of data types specifying the columns' target data types + * @param dtypes Vector of data types specifying the columns' target data types * @return Sorted list of selected columns' data types */ std::vector select_data_types(std::vector const& dtypes); From 8fa26947ec0c8a75ceb2b71186eb0038e871d85a Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 3 Aug 2021 11:54:51 -0700 Subject: [PATCH 20/23] add missing empty line --- cpp/include/cudf/io/json.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a0ea6b6ed17..8954f7dcab1 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -170,6 +170,7 @@ class json_reader_options { * @param types Vector of dtypes */ void set_dtypes(std::vector types) { _dtypes = std::move(types); } + /** * @brief Set data types for columns to be read. 
* From f3d94e91375c9c34da6dbca9081f59c11e08e754 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 3 Aug 2021 16:19:02 -0700 Subject: [PATCH 21/23] move visitor_overload to utilities --- .../detail/utilities/visitor_overload.hpp | 30 +++++++++++++++++++ cpp/src/io/csv/reader_impl.cu | 15 ++-------- cpp/src/io/json/reader_impl.cu | 13 ++------ 3 files changed, 34 insertions(+), 24 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/visitor_overload.hpp diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp new file mode 100644 index 00000000000..c77947c2015 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::detail { + +/** + * @brief Helper class to support inline-overloading for all of a variant's alternative types + */ +template +struct visitor_overload : Ts... { + using Ts::operator()...; +}; +template +visitor_overload(Ts...) 
-> visitor_overload; + +} diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 611ad1c81d3..549b0474fe1 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -49,18 +50,6 @@ using cudf::device_span; using cudf::host_span; using cudf::detail::make_device_uvector_async; -namespace { -/** - * @brief Helper class to support inline-overloading for all of a variant's alternative types - */ -template -struct VisitorOverload : Ts... { - using Ts::operator()...; -}; -template -VisitorOverload(Ts...) -> VisitorOverload; -} // namespace - namespace cudf { namespace io { namespace detail { @@ -432,7 +421,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) column_types = infer_column_types(data, row_offsets, stream); } else { column_types = std::visit( - VisitorOverload{ + cudf::detail::visitor_overload{ [&](const std::vector& data_types) { return select_data_types(data_types); }, [&](const std::map& data_types) { return select_data_types(data_types); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 53318cf43ac..a8f117c22bf 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -50,16 +51,6 @@ namespace json { using namespace cudf::io; namespace { -/** - * @brief Helper class to support inline-overloading for all of a variant's alternative types - */ -template -struct VisitorOverload : Ts... { - using Ts::operator()...; -}; -template -VisitorOverload(Ts...) 
-> VisitorOverload; - /** * @brief Estimates the maximum expected length or a row, based on the number * of columns @@ -524,7 +515,7 @@ void reader::impl::set_data_types(device_span rec_starts, std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); if (!has_to_infer_column_types) { dtypes_ = std::visit( - VisitorOverload{ + cudf::detail::visitor_overload{ [&](const std::vector& dtypes) { return dtypes; }, [&](const std::map& dtypes) { std::vector sorted_dtypes; From 829928ab4531ff9842f032693a131dd975146628 Mon Sep 17 00:00:00 2001 From: vuule Date: Tue, 3 Aug 2021 16:56:50 -0700 Subject: [PATCH 22/23] update yaml --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/detail/utilities/visitor_overload.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 75bfe6c34bc..35d444d026c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -102,6 +102,7 @@ test: - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp + - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp - test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp - test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp index c77947c2015..fb9998df060 100644 --- a/cpp/include/cudf/detail/utilities/visitor_overload.hpp +++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp @@ -27,4 +27,4 @@ struct visitor_overload : Ts... { template visitor_overload(Ts...) 
-> visitor_overload; -} +} // namespace cudf::detail From 3ebe478695f6d4322aa27809eaa5adb8a61c60a2 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 4 Aug 2021 12:03:01 -0700 Subject: [PATCH 23/23] fix copyright year --- cpp/include/cudf/detail/utilities/visitor_overload.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/visitor_overload.hpp b/cpp/include/cudf/detail/utilities/visitor_overload.hpp index fb9998df060..a55ca323c50 100644 --- a/cpp/include/cudf/detail/utilities/visitor_overload.hpp +++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.