diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 7a135c1f2f2..0623e35741d 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -967,7 +967,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Association between each ORC column and its cudf::column _col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); std::vector nested_col; - bool is_data_empty = false; // Get a list of column data types std::vector column_types; @@ -991,7 +990,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Map each ORC column to its column _col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - // TODO: Once MAP type is supported in cuDF, update this for MAP as well if (col_type == type_id::LIST or col_type == type_id::STRUCT) nested_col.emplace_back(col); } @@ -1051,6 +1049,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, size_t num_rowgroups = 0; int stripe_idx = 0; + bool is_level_data_empty = true; std::vector, size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { // Iterate through the source files selected stripes @@ -1070,21 +1069,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream_info, level == 0); - if (total_data_size == 0) { - CUDF_EXPECTS(stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // In case ROW GROUP INDEX is not present and all columns are structs with no null - // stream, there is nothing to read at this level. - auto fn_check_dtype = [](auto dtype) { return dtype.id() == type_id::STRUCT; }; - CUDF_EXPECTS(std::all_of(column_types.begin(), column_types.end(), fn_check_dtype), - "Expected streams data within stripe"); - is_data_empty = true; - } + auto const is_stripe_data_empty = total_data_size == 0; + if (not is_stripe_data_empty) { is_level_data_empty = false; } + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); stripe_data.emplace_back(total_data_size, stream); auto dst_base = static_cast(stripe_data.back().data()); // Coalesce consecutive streams into one read - while (not is_data_empty and stream_count < stream_info.size()) { + while (not is_stripe_data_empty and stream_count < stream_info.size()) { const auto d_dst = dst_base + stream_info[stream_count].dst_pos; const auto offset = stream_info[stream_count].offset; auto len = stream_info[stream_count].length; @@ -1162,7 +1156,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_data_empty) { + if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; } @@ -1199,7 +1193,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); } // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and + not is_level_data_empty) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, *_metadata.per_file_metadata[0].decompressor, @@ -1242,7 +1237,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr); } - if (not is_data_empty) { + if (not is_level_data_empty) { decode_stream_data(chunks, num_dict_entries, skip_rows, @@ -1256,7 +1251,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Extract information to process nested child columns if (nested_col.size()) { - if (not is_data_empty) { + if (not is_level_data_empty) { scan_null_counts(chunks, null_count_prefix_sums[level], stream); } row_groups.device_to_host(stream, true); diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.AllNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.AllNulls.orc new file mode 100644 index 00000000000..1c661e1c6f0 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.AllNulls.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.EmptyListStripe.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.EmptyListStripe.orc new file mode 100644 index 00000000000..edc1094a186 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.EmptyListStripe.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.NullStructStripe.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.NullStructStripe.orc new file mode 100644 index 00000000000..fe5f57af14c Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.NullStructStripe.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyList.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyList.orc new file mode 100644 index 00000000000..53c323436d6 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyList.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyMap.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyMap.orc new file mode 100644 index 00000000000..1bb4079c492 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyMap.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneNullStruct.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneNullStruct.orc new file mode 100644 index 00000000000..a457b8285bd Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneNullStruct.orc differ diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1699c11617a..48f1a49b7f4 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1815,3 +1815,22 @@ def test_statistics_string_sum(): file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) assert_eq(file_stats[0]["str"].get("sum"), sum(len(s) for s in strings)) + + +@pytest.mark.parametrize( + "fname", + [ + "TestOrcFile.Hive.OneEmptyMap.orc", + "TestOrcFile.Hive.OneEmptyList.orc", + "TestOrcFile.Hive.OneNullStruct.orc", + "TestOrcFile.Hive.EmptyListStripe.orc", + "TestOrcFile.Hive.NullStructStripe.orc", + "TestOrcFile.Hive.AllNulls.orc", + ], +) +def test_reader_empty_stripe(datadir, fname): + path = datadir / fname + + expected = pd.read_orc(path) + got = cudf.read_orc(path) + assert_eq(expected, got)