From 34c056c07c67398f554329cd78cbb7c0be6fbe30 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 21 Oct 2022 14:50:37 -0700 Subject: [PATCH 1/6] C++ changes --- cpp/include/cudf/io/csv.hpp | 18 +++++++++--------- cpp/include/cudf/io/detail/csv.hpp | 4 ++-- cpp/src/io/csv/writer_impl.cu | 15 +++++++-------- cpp/src/io/functions.cpp | 2 +- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index f753028a148..13e83253dff 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1339,7 +1339,7 @@ class csv_writer_options { // string to use for values == 0 in INT8 types (default 'false') std::string _false_value = std::string{"false"}; // Optional associated metadata - table_metadata const* _metadata = nullptr; + std::vector _names; /** * @brief Constructor from sink and table. @@ -1391,7 +1391,7 @@ class csv_writer_options { * * @return Optional associated metadata */ - [[nodiscard]] table_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] std::vector const& get_names() const { return _names; } /** * @brief Returns string to used for null entries. @@ -1444,11 +1444,11 @@ class csv_writer_options { // Setter /** - * @brief Sets optional associated metadata. + * @brief Sets optional associated column names. * - @param metadata Associated metadata + @param names Associated column names */ - void set_metadata(table_metadata* metadata) { _metadata = metadata; } + void set_names(std::vector names) { _names = std::move(names); } /** * @brief Sets string to used for null entries. @@ -1526,14 +1526,14 @@ class csv_writer_options_builder { } /** - * @brief Sets optional associated metadata. + * @brief Sets optional column names. * - * @param metadata Associated metadata + * @param names Column names * @return this for chaining */ - csv_writer_options_builder& metadata(table_metadata* metadata) + csv_writer_options_builder& names(std::vector names) { - options._metadata = metadata; + options._names = names; return *this; } diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 920b815ce12..96b47876da8 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -46,14 +46,14 @@ table_with_metadata read_csv(std::unique_ptr&& source, * * @param sink Output sink * @param table The set of columns - * @param metadata The metadata associated with the table + * @param column_names Column names for the output CSV * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to use for device memory allocation */ void write_csv(data_sink* sink, table_view const& table, - const table_metadata* metadata, + std::vector const& column_names, csv_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 2fae7b4c75a..02d4cfd9db9 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -279,21 +279,21 @@ struct column_to_strings_fn { // void write_chunked_begin(data_sink* out_sink, table_view const& table, - table_metadata const* metadata, + std::vector const& user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (options.is_enabled_include_header()) { - // need to generate column names if metadata is not provided + // need to generate column names if names are not provided std::vector generated_col_names; - if (metadata == nullptr) { + if (user_column_names.empty()) { generated_col_names.resize(table.num_columns()); thrust::tabulate(generated_col_names.begin(), generated_col_names.end(), [](auto idx) { return std::to_string(idx); }); } - auto const& column_names = (metadata == nullptr) ? generated_col_names : metadata->column_names; + auto const& column_names = user_column_names.empty() ? generated_col_names : user_column_names; CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table columns."); @@ -346,7 +346,6 @@ void write_chunked_begin(data_sink* out_sink, void write_chunked(data_sink* out_sink, strings_column_view const& str_column_view, - table_metadata const* metadata, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -399,7 +398,7 @@ void write_chunked(data_sink* out_sink, void write_csv(data_sink* out_sink, table_view const& table, - table_metadata const* metadata, + std::vector const& user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -407,7 +406,7 @@ void write_csv(data_sink* out_sink, // write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(out_sink, table, metadata, options, stream, mr); + write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view @@ -476,7 +475,7 @@ void write_csv(data_sink* out_sink, return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); }(); - write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), options, stream, mr); } } } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 968d3827bfe..b50f8bbfaec 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -231,7 +231,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc return csv::write_csv( // sinks[0].get(), options.get_table(), - options.get_metadata(), + options.get_names(), options, cudf::get_default_stream(), mr); From 5bb1796e8d316207028a0d0b92bec538565becc0 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 21 Oct 2022 14:50:55 -0700 Subject: [PATCH 2/6] C++ test changes --- cpp/tests/io/csv_test.cpp | 64 +++++++++++++++------------------------ 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index f532836ef95..8100c8e3d7f 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -277,30 +277,12 @@ void expect_column_data_equal(std::vector const& lhs, cudf::column_view const void write_csv_helper(std::string const& filename, cudf::table_view const& table, - bool include_header, std::vector const& names = {}) { - // csv_writer_options only keeps a pointer to metadata (non-owning) - cudf::io::table_metadata metadata{}; - - if (not names.empty()) { - metadata.column_names = names; - } else { - // generate some dummy column names - int i = 0; - auto const num_columns = table.num_columns(); - metadata.column_names.reserve(num_columns); - std::generate_n(std::back_inserter(metadata.column_names), num_columns, [&i]() { - return std::string("col") + std::to_string(i++); - }); - } - cudf::io::csv_writer_options writer_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info(filename), table) - .include_header(include_header) - .rows_per_chunk( - 1) // Note: this gets adjusted to multiple of 8 (per legacy code logic and requirements) - .metadata(&metadata); + .include_header(not names.empty()) + .names(names); cudf::io::write_csv(writer_options); } @@ -1509,7 +1491,7 @@ TYPED_TEST(CsvReaderNumericTypeTest, SingleColumnWithWriter) auto filepath = temp_env->get_temp_filepath("SingleColumnWithWriter.csv"); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}).header(-1); @@ -1577,7 +1559,7 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) auto filepath = temp_env->get_temp_dir() + "MultiColumnWithWriter.csv"; - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1625,7 +1607,7 @@ TEST_F(CsvReaderTest, DatesWithWriter) cudf::table_view input_table(std::vector{input_column}); // TODO need to add a dayfirst flag? - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1650,7 +1632,7 @@ TEST_F(CsvReaderTest, DatesStringWithWriter) cudf::table_view input_table(std::vector{input_column}); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1673,7 +1655,7 @@ TEST_F(CsvReaderTest, DatesStringWithWriter) cudf::table_view input_table(std::vector{input_column}); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1696,7 +1678,7 @@ TEST_F(CsvReaderTest, DatesStringWithWriter) cudf::table_view input_table(std::vector{input_column}); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1720,7 +1702,7 @@ TEST_F(CsvReaderTest, DatesStringWithWriter) cudf::table_view input_table(std::vector{input_column}); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1743,7 +1725,7 @@ TEST_F(CsvReaderTest, DatesStringWithWriter) cudf::table_view input_table(std::vector{input_column}); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1766,7 +1748,7 @@ TEST_F(CsvReaderTest, FloatingPointWithWriter) cudf::table_view input_table(std::vector{input_column}); // TODO add lineterminator=";" - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1792,11 +1774,10 @@ TEST_F(CsvReaderTest, StringsWithWriter) cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? - write_csv_helper(filepath, input_table, true, names); + write_csv_helper(filepath, input_table, names); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) - .names(names) .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf::io::quote_style::NONE); auto result = cudf::io::read_csv(in_opts); @@ -1804,6 +1785,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) const auto result_table = result.tbl->view(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0)); check_string_column(input_table.column(1), result_table.column(1)); + ASSERT_EQ(names, result.metadata.column_names); } TEST_F(CsvReaderTest, StringsWithWriterSimple) @@ -1817,11 +1799,10 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? - write_csv_helper(filepath, input_table, true, names); + write_csv_helper(filepath, input_table, names); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) - .names(names) .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf::io::quote_style::NONE); auto result = cudf::io::read_csv(in_opts); @@ -1829,6 +1810,7 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) const auto result_table = result.tbl->view(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_table.column(0), result_table.column(0)); check_string_column(input_table.column(1), result_table.column(1)); + ASSERT_EQ(names, result.metadata.column_names); } TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) @@ -1841,15 +1823,15 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) auto string_column = column_wrapper{"abc def ghi", "jkl,mno,pq", "stu vwx y"}; cudf::table_view input_table(std::vector{int_column, string_column}); - write_csv_helper(filepath, input_table, true, names); + write_csv_helper(filepath, input_table, names); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) - .names(names) .dtypes(std::vector{dtype(), dtype()}); auto result = cudf::io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); + ASSERT_EQ(names, result.metadata.column_names); } TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) @@ -1864,7 +1846,7 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) cudf::table_view input_table( std::vector{int_column, string_column, int_column, int_column, int_column}); - write_csv_helper(filepath, input_table, true, names); + write_csv_helper(filepath, input_table, names); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1877,6 +1859,7 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) auto result = cudf::io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); + ASSERT_EQ(names, result.metadata.column_names); } TEST_F(CsvReaderTest, EmptyFileWithWriter) @@ -1884,7 +1867,7 @@ TEST_F(CsvReaderTest, EmptyFileWithWriter) auto filepath = temp_env->get_temp_dir() + "EmptyFileWithWriter.csv"; cudf::table_view empty_table; - write_csv_helper(filepath, empty_table, false); + write_csv_helper(filepath, empty_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}); auto result = cudf::io::read_csv(in_opts); @@ -1968,7 +1951,7 @@ TEST_F(CsvReaderTest, DurationsWithWriter) durations_D, durations_s, durations_ms, durations_us, durations_ns}); std::vector names{"D", "s", "ms", "us", "ns"}; - write_csv_helper(filepath, input_table, true, names); + write_csv_helper(filepath, input_table, names); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) @@ -1982,6 +1965,7 @@ TEST_F(CsvReaderTest, DurationsWithWriter) const auto result_table = result.tbl->view(); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result_table); + ASSERT_EQ(names, result.metadata.column_names); } TEST_F(CsvReaderTest, ParseInRangeIntegers) @@ -2044,7 +2028,7 @@ TEST_F(CsvReaderTest, ParseInRangeIntegers) auto filepath = temp_env->get_temp_filepath("ParseInRangeIntegers.csv"); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}).header(-1); @@ -2123,7 +2107,7 @@ TEST_F(CsvReaderTest, ParseOutOfRangeIntegers) auto filepath = temp_env->get_temp_filepath("ParseOutOfRangeIntegers.csv"); - write_csv_helper(filepath, input_table, false); + write_csv_helper(filepath, input_table); cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}).header(-1); From 2be22e0e4bd8874c8ec66843d090f64c704d7ab8 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 21 Oct 2022 14:51:11 -0700 Subject: [PATCH 3/6] Python changes --- python/cudf/cudf/_lib/cpp/io/csv.pxd | 8 ++++---- python/cudf/cudf/_lib/csv.pyx | 15 +++++++-------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index e8064557592..e7c0fec2e3d 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -199,6 +199,7 @@ cdef extern from "cudf/io/csv.hpp" \ char get_inter_column_delimiter() except + string get_true_value() except + string get_false_value() except + + vector[string] get_names() except + # setter void set_metadata(cudf_io_types.table_metadata* val) except + @@ -207,8 +208,9 @@ cdef extern from "cudf/io/csv.hpp" \ void set_rows_per_chunk(size_type val) except + void set_line_terminator(string term) except + void set_inter_column_delimiter(char delim) except + - void set__true_value(string val) except + + void set_true_value(string val) except + void set_false_value(string val) except + + void set_names(vector[string] val) except + @staticmethod csv_writer_options_builder builder( @@ -223,9 +225,7 @@ cdef extern from "cudf/io/csv.hpp" \ cudf_table_view.table_view table ) except + - csv_writer_options_builder& metadata( - cudf_io_types.table_metadata* val - ) except + + csv_writer_options_builder& names(vector[string] val) except + csv_writer_options_builder& na_rep(string val) except + csv_writer_options_builder& include_header(bool val) except + csv_writer_options_builder& rows_per_chunk(size_type val) except + diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index f1a75baa951..3750780d587 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -40,7 +40,6 @@ from cudf._lib.cpp.io.types cimport ( quote_style, sink_info, source_info, - table_metadata, table_with_metadata, ) from cudf._lib.cpp.table.table_view cimport table_view @@ -475,7 +474,7 @@ cpdef write_csv( cdef string line_term_c = line_terminator.encode() cdef string na_c = na_rep.encode() cdef int rows_per_chunk_c = rows_per_chunk - cdef table_metadata metadata_ = table_metadata() + cdef vector[string] col_names cdef string true_value_c = 'True'.encode() cdef string false_value_c = 'False'.encode() cdef unique_ptr[data_sink] data_sink_c @@ -487,26 +486,26 @@ cpdef write_csv( all_names = table._index.names + all_names if len(all_names) > 0: - metadata_.column_names.reserve(len(all_names)) + col_names.reserve(len(all_names)) if len(all_names) == 1: if all_names[0] in (None, ''): - metadata_.column_names.push_back('""'.encode()) + col_names.push_back('""'.encode()) else: - metadata_.column_names.push_back( + col_names.push_back( str(all_names[0]).encode() ) else: for idx, col_name in enumerate(all_names): if col_name is None: - metadata_.column_names.push_back(''.encode()) + col_names.push_back(''.encode()) else: - metadata_.column_names.push_back( + col_names.push_back( str(col_name).encode() ) cdef csv_writer_options options = move( csv_writer_options.builder(sink_info_c, input_table_view) - .metadata(&metadata_) + .names(col_names) .na_rep(na_c) .include_header(include_header_c) .rows_per_chunk(rows_per_chunk_c) From b28379791006bfe142e291fef5fc534a55371038 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 24 Oct 2022 15:58:13 -0700 Subject: [PATCH 4/6] fix docs --- cpp/include/cudf/io/csv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 13e83253dff..3b34eb5e135 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1387,9 +1387,9 @@ class csv_writer_options { [[nodiscard]] table_view const& get_table() const { return _table; } /** - * @brief Returns optional associated metadata. + * @brief Returns names of the columns. * - * @return Optional associated metadata + * @return Names of the columns in the output file */ [[nodiscard]] std::vector const& get_names() const { return _names; } From ddd660652b85b687e5cbd009efac1bf43c26fc17 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 26 Oct 2022 14:13:44 -0700 Subject: [PATCH 5/6] doc update --- cpp/include/cudf/io/csv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 3b34eb5e135..1fc4114b94c 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1338,7 +1338,7 @@ class csv_writer_options { std::string _true_value = std::string{"true"}; // string to use for values == 0 in INT8 types (default 'false') std::string _false_value = std::string{"false"}; - // Optional associated metadata + // Names of all columns; if empty, writer will generate column names std::vector _names; /** From 8cf2b5a8c9fce5c148efdaa3f1dd159b28deec62 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 31 Oct 2022 14:17:13 -0700 Subject: [PATCH 6/6] use span --- cpp/include/cudf/io/detail/csv.hpp | 2 +- cpp/src/io/csv/writer_impl.cu | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 44aa21d0fda..90d730338fc 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -53,7 +53,7 @@ table_with_metadata read_csv(std::unique_ptr&& source, */ void write_csv(data_sink* sink, table_view const& table, - std::vector const& column_names, + host_span column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 02d4cfd9db9..7230b455d4a 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -279,7 +279,7 @@ struct column_to_strings_fn { // void write_chunked_begin(data_sink* out_sink, table_view const& table, - std::vector const& user_column_names, + host_span user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -398,7 +398,7 @@ void write_chunked(data_sink* out_sink, void write_csv(data_sink* out_sink, table_view const& table, - std::vector const& user_column_names, + host_span user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)