diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 9da56b9bef8..f812f272c25 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -538,8 +538,7 @@ void infer_column_types(parse_options const& parse_opts,
     auto const& stats = column_stats[inf_col_idx++];
     unsigned long long int_count_total =
       stats.big_int_count + stats.negative_small_int_count + stats.positive_small_int_count;
-
-    if (stats.null_count == num_records) {
-      // Entire column is NULL; allocate the smallest amount of memory
+    if (stats.null_count == num_records or stats.total_count() == 0) {
+      // Column is entirely NULL or has no counted fields; allocate the smallest amount of memory
       column_types[col_idx] = data_type(cudf::type_id::INT8);
     } else if (stats.string_count > 0L) {
diff --git a/cpp/src/io/utilities/column_type_histogram.hpp b/cpp/src/io/utilities/column_type_histogram.hpp
index 8bd2d3a89cf..88f4e58f9b1 100644
--- a/cpp/src/io/utilities/column_type_histogram.hpp
+++ b/cpp/src/io/utilities/column_type_histogram.hpp
@@ -33,6 +33,17 @@ struct column_type_histogram {
   cudf::size_type positive_small_int_count{};
   cudf::size_type big_int_count{};
   cudf::size_type bool_count{};
+
+  /**
+   * @brief Returns the sum of all per-type counters stored in this histogram.
+   *
+   * The result is zero only when every counter, including `null_count`, is zero.
+   */
+  auto total_count() const
+  {
+    return null_count + float_count + datetime_count + string_count + negative_small_int_count +
+           positive_small_int_count + big_int_count + bool_count;
+  }
 };
 
 }  // namespace io
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 8acc6f8f6ee..eeca87446ec 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -2244,6 +2244,27 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch)
   EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1");
 }
 
+TEST_F(CsvReaderTest, EmptyColumns)
+{
+  // First column only has empty fields; the second column contains only "null" literals
+  std::string csv_in{",null\n,null"};
+
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()})
+      .names({"a", "b", "c", "d"})
+      .header(-1);
+  // More elements in `names` than in the file; additional columns are filled with nulls
+  auto result = cudf::io::read_csv(in_opts);
+
+  auto const result_table = result.tbl->view();
+  EXPECT_EQ(result_table.num_columns(), 4);
+  // All columns should contain only nulls; expect INT8 type to use as little memory as possible
+  for (auto const& column : result_table) {
+    EXPECT_EQ(column.type(), data_type{type_id::INT8});
+    EXPECT_EQ(column.null_count(), 2);
+  }
+}
+
 TEST_F(CsvReaderTest, BlankLineAfterFirstRow)
 {
   std::string csv_in{"12,9., 10\n\n"};