From dcebfe7783d552e5788502308bd73a21d5ff033a Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 21 Apr 2021 20:33:09 -0500 Subject: [PATCH] Fix dask_cudf metadata-inference when first ORC path is empty (#8021) Closes #8011 Dask-cuDF currently reads a single stripe to infer metadata in `read_orc`. When the first path corresponds to an empty file, there is no stripe "0" to read. This PR includes a simple fix (and test coverage). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8021 --- python/dask_cudf/dask_cudf/io/orc.py | 7 ++++++- .../dask_cudf/dask_cudf/io/tests/test_orc.py | 21 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 5b0d19b737b..00fc197da9b 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -85,7 +85,12 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): columns = list(schema) with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs) + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs) dsk = {} diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index eae6509bc92..d8ac9e52fd8 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -6,10 +6,10 @@ from dask import dataframe as dd -import dask_cudf - import cudf +import dask_cudf + # import pyarrow.orc as orc cur_dir = os.path.dirname(__file__) @@ -74,6 +74,23 @@ def test_read_orc_filtered(tmpdir, engine, predicate, expected_len): dd.assert_eq(len(df), expected_len) +def test_read_orc_first_file_empty(tmpdir): + + # Write a 3-file dataset where the first file is empty + # See: https://github.com/rapidsai/cudf/issues/8011 + path = str(tmpdir) + os.makedirs(path, exist_ok=True) + df1 = cudf.DataFrame({"id": [1, 2], "float": [1.0, 2.0]}) + df1.iloc[:0].to_orc(os.path.join(path, "data.0")) + df1.iloc[:1].to_orc(os.path.join(path, "data.1")) + df1.iloc[1:].to_orc(os.path.join(path, "data.2")) + + # Read back the files with dask_cudf, + # and check the result. + df2 = dask_cudf.read_orc(os.path.join(path, "*")) + dd.assert_eq(df1, df2, check_index=False) + + @pytest.mark.parametrize("compute", [True, False]) @pytest.mark.parametrize("compression", [None, "snappy"]) @pytest.mark.parametrize(