Adding support for writing empty dataframe (#8490)

Closes #6691.

Removed the expectation that `table.num_columns() > 0`. Instead, if `table.num_columns() == 0`, the CSV writer outputs a single line terminator, following pandas behaviour.

Authors:
- https://github.com/shaneding

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Charles Blackmon-Luca (https://github.com/charlesbluca)
- Karthikeyan (https://github.com/karthikeyann)
- Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

URL: #8490
rapidsai · Jun 11, 2021 · 0a4e8a1 · 0a4e8a1
1 parent 6a23f20
commit 0a4e8a1
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 11 deletions.
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
@@ -286,7 +286,6 @@ void writer::impl::write_chunked_begin(table_view const& table,
   if ((metadata != nullptr) && (options_.is_enabled_include_header())) {
     CUDF_EXPECTS(metadata->column_names.size() == static_cast<size_t>(table.num_columns()),
                  "Mismatch between number of column headers and table columns.");
-
     std::string delimiter_str{options_.get_inter_column_delimiter()};
 
     // avoid delimiter after last element:
@@ -295,7 +294,12 @@ void writer::impl::write_chunked_begin(table_view const& table,
     std::copy(metadata->column_names.begin(),
               metadata->column_names.end() - 1,
               std::ostream_iterator<std::string>(ss, delimiter_str.c_str()));
-    ss << metadata->column_names.back() << options_.get_line_terminator();
+
+    if (metadata->column_names.size() > 0) {
+      ss << metadata->column_names.back() << options_.get_line_terminator();
+    } else {
+      ss << options_.get_line_terminator();
+    }
 
     out_sink_->host_write(ss.str().data(), ss.str().size());
   }
@@ -355,8 +359,6 @@ void writer::impl::write(table_view const& table,
                          const table_metadata* metadata,
                          rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(table.num_columns() > 0, "Empty table.");
-
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
   //

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
@@ -1754,9 +1754,12 @@ TEST_F(CsvReaderTest, EmptyFileWithWriter)
   auto filepath = temp_env->get_temp_dir() + "EmptyFileWithWriter.csv";
 
   cudf::table_view empty_table;
+  write_csv_helper(filepath, empty_table, false);
+  cudf_io::csv_reader_options in_opts =
+    cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath});
+  auto result = cudf_io::read_csv(in_opts);
 
-  // TODO is it ok for write_csv to throw instead of just writing an empty file?
-  EXPECT_THROW(write_csv_helper(filepath, empty_table, false), cudf::logic_error);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty_table, result.tbl->view());
 }
 
 class TestSource : public cudf::io::datasource {

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
@@ -432,7 +432,6 @@ cpdef write_csv(
     --------
     cudf.io.csv.to_csv
     """
-
     cdef table_view input_table_view = \
         table.view() if index is True else table.data_view()
     cdef bool include_header_c = header

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -1583,10 +1583,8 @@ def test_csv_writer_column_and_header_options(
 
 def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe):
     df = cudf_mixed_dataframe
-
-    buffer = BytesIO()
-    with pytest.raises(RuntimeError):
-        df.to_csv(buffer, columns=[], index=False)
+    write_str = df.to_csv(columns=[], index=False)
+    assert_eq(write_str, "\n")
 
 
 def test_csv_writer_multiindex(tmpdir):
@@ -1979,3 +1977,13 @@ def test_to_csv_compression_error():
     error_message = "Writing compressed csv is not currently supported in cudf"
     with pytest.raises(NotImplementedError, match=re.escape(error_message)):
         df.to_csv("test.csv", compression=compression)
+
+
+def test_empty_df_no_index():
+    actual = cudf.DataFrame({})
+    buffer = BytesIO()
+    actual.to_csv(buffer, index=False)
+
+    result = cudf.read_csv(buffer)
+
+    assert_eq(actual, result)