diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index aac44bed50e..c190340f6c1 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,45 +40,23 @@ table_with_metadata read_csv(std::unique_ptr&& source, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -class writer { - public: - class impl; - - private: - std::unique_ptr _impl; - - public: - /** - * @brief Constructor for output to a file. - * - * @param sinkp The data sink to write the data to - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - writer(std::unique_ptr sinkp, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); // cannot provide definition here (because - // _impl is incomplete hence unique_ptr has - // not enough sizeof() info) - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); +/** + * @brief Write an entire dataset to CSV format. + * + * @param sink Output sink + * @param table The set of columns + * @param metadata The metadata associated with the table + * @param options Settings for controlling behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void write_csv(data_sink* sink, + table_view const& table, + const table_metadata* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Writes the entire dataset. - * - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; } // namespace csv } // namespace detail } // namespace io diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp new file mode 100644 index 00000000000..d42ddf3817c --- /dev/null +++ b/cpp/src/io/csv/durations.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { +namespace csv { + +std::unique_ptr pandas_format_durations( + column_view const& durations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace csv +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index e8c673751db..b9b6fc6cf94 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -19,17 +19,25 @@ * @brief cuDF-IO CSV writer class implementation */ -#include "writer_impl.hpp" +#include "durations.hpp" + +#include "csv_common.h" +#include "csv_gpu.h" #include #include #include +#include +#include #include #include #include #include #include #include +#include +#include +#include #include #include @@ -40,13 +48,19 @@ #include #include +#include #include +#include +#include namespace cudf { namespace io { namespace detail { namespace csv { +using namespace cudf::io::csv; +using namespace cudf::io; + namespace { /** @@ -260,32 +274,16 @@ struct column_to_strings_fn { }; } // unnamed namespace -// Forward to implementation -writer::writer(std::unique_ptr sink, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mr)) -{ -} - -// Destructor within this translation unit -writer::~writer() = default; - -writer::impl::impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr) - : out_sink_(std::move(sink)), mr_(mr), options_(options) -{ -} - // write the header: column names: // -void writer::impl::write_chunked_begin(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked_begin(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (options_.is_enabled_include_header()) { + if (options.is_enabled_include_header()) { // need to generate column names if metadata is not provided std::vector generated_col_names; if (metadata == nullptr) { @@ -298,8 +296,8 @@ void writer::impl::write_chunked_begin(table_view const& table, CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table columns."); - auto const delimiter = options_.get_inter_column_delimiter(); - auto const terminator = options_.get_line_terminator(); + auto const delimiter = options.get_inter_column_delimiter(); + auto const terminator = options.get_line_terminator(); // process header names: // - if the header name includes the delimiter or terminator character, @@ -341,18 +339,21 @@ void writer::impl::write_chunked_begin(table_view const& table, } header.append(terminator); - out_sink_->host_write(header.data(), header.size()); + out_sink->host_write(header.data(), header.size()); } } -void writer::impl::write_chunked(strings_column_view const& str_column_view, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked(data_sink* out_sink, + strings_column_view const& str_column_view, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // algorithm outline: // // for_each(strings_column.begin(), strings_column.end(), - // [sink = out_sink_](auto str_row) mutable { + // [sink = out_sink](auto str_row) mutable { // auto host_buffer = str_row.host_buffer(); // sink->host_write(host_buffer_.data(), host_buffer_.size()); // });//or...sink->device_write(device_buffer,...); @@ -362,7 +363,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options_.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator()}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -370,9 +371,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars_begin(); - if (out_sink_->is_device_write_preferred(total_num_bytes)) { + if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory - out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); + out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); @@ -383,30 +384,33 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, stream.value())); stream.synchronize(); - out_sink_->host_write(h_bytes.data(), total_num_bytes); + out_sink->host_write(h_bytes.data(), total_num_bytes); } // Needs newline at the end, to separate from next chunk - if (out_sink_->is_device_write_preferred(newline.size())) { - out_sink_->device_write(newline.data(), newline.size(), stream); + if (out_sink->is_device_write_preferred(newline.size())) { + out_sink->device_write(newline.data(), newline.size(), stream); } else { - out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator().size()); + out_sink->host_write(options.get_line_terminator().data(), + options.get_line_terminator().size()); } } -void writer::impl::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_csv(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(table, metadata, stream); + write_chunked_begin(out_sink, table, metadata, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view - auto n_rows_per_chunk = options_.get_rows_per_chunk(); + auto n_rows_per_chunk = options.get_rows_per_chunk(); // // This outputs the CSV in row chunks to save memory. // Maybe we can use the total_rows*count calculation and a memory threshold @@ -436,7 +440,7 @@ void writer::impl::write(table_view const& table, // convert each chunk to CSV: // - column_to_strings_fn converter{options_, stream, rmm::mr::get_current_device_resource()}; + column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()}; for (auto&& sub_view : vector_views) { // Skip if the table has no rows if (sub_view.num_rows() == 0) continue; @@ -459,32 +463,21 @@ void writer::impl::write(table_view const& table, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options_.get_inter_column_delimiter()}; + std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options_.get_na_rep(), + options.get_na_rep(), strings::separator_on_nulls::YES, stream); - cudf::string_scalar narep{options_.get_na_rep()}; + cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); }(); - write_chunked(str_concat_col->view(), metadata, stream); + write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); } } - - // finalize (no-op, for now, but offers a hook for future extensions): - // - write_chunked_end(table, metadata, stream); -} - -void writer::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) -{ - _impl->write(table, metadata, stream); } } // namespace csv diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp deleted file mode 100644 index 965c036dc75..00000000000 --- a/cpp/src/io/csv/writer_impl.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace csv { - -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV writer - */ -class writer::impl { - public: - /** - * @brief Constructor with writer options. - * - * @param sink Output sink - * @param options Settings for controlling behavior - * @param mr Device memory resource to use for device memory allocation - */ - impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Write an entire dataset to CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write the header of a CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_begin(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write dataset to CSV format without header. - * - * @param strings_column Subset of columns converted to string to be written. - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked(strings_column_view const& strings_column, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write footer of CSV format (typically, empty). - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_end(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - // purposely no-op (for now); - } - - private: - std::unique_ptr out_sink_; - rmm::mr::device_memory_resource* mr_ = nullptr; - csv_writer_options const options_; -}; - -std::unique_ptr pandas_format_durations( - column_view const& durations, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b678941db21..746148d24cc 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -219,10 +219,14 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc using namespace cudf::io::detail; auto sink = make_datasink(options.get_sink()); - auto writer = - std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); - writer->write(options.get_table(), options.get_metadata()); + return csv::write_csv( // + sink.get(), + options.get_table(), + options.get_metadata(), + options, + rmm::cuda_stream_default, + mr); } namespace detail_orc = cudf::io::detail::orc;