From dcebfe7783d552e5788502308bd73a21d5ff033a Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 21 Apr 2021 20:33:09 -0500
Subject: [PATCH] Fix dask_cudf metadata-inference when first ORC path is empty
 (#8021)

Closes #8011

Dask-cuDF currently reads a single stripe to infer metadata in `read_orc`.  When the first path corresponds to an empty file, there is no stripe "0" to read.  This PR includes a simple fix (and test coverage).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)

URL: https://github.com/rapidsai/cudf/pull/8021
---
 python/dask_cudf/dask_cudf/io/orc.py          |  7 ++++++-
 .../dask_cudf/dask_cudf/io/tests/test_orc.py  | 21 +++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index 5b0d19b737b..00fc197da9b 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -85,7 +85,12 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
         columns = list(schema)
 
     with fs.open(paths[0], "rb") as f:
-        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)
+        meta = cudf.read_orc(
+            f,
+            stripes=[0] if nstripes_per_file[0] else None,
+            columns=columns,
+            **kwargs,
+        )
 
     name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
     dsk = {}
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index eae6509bc92..d8ac9e52fd8 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -6,10 +6,10 @@
 
 from dask import dataframe as dd
 
-import dask_cudf
-
 import cudf
 
+import dask_cudf
+
 # import pyarrow.orc as orc
 
 cur_dir = os.path.dirname(__file__)
@@ -74,6 +74,23 @@ def test_read_orc_filtered(tmpdir, engine, predicate, expected_len):
     dd.assert_eq(len(df), expected_len)
 
 
+def test_read_orc_first_file_empty(tmpdir):
+
+    # Write a 3-file dataset where the first file is empty
+    # See: https://github.com/rapidsai/cudf/issues/8011
+    path = str(tmpdir)
+    os.makedirs(path, exist_ok=True)
+    df1 = cudf.DataFrame({"id": [1, 2], "float": [1.0, 2.0]})
+    df1.iloc[:0].to_orc(os.path.join(path, "data.0"))
+    df1.iloc[:1].to_orc(os.path.join(path, "data.1"))
+    df1.iloc[1:].to_orc(os.path.join(path, "data.2"))
+
+    # Read back the files with dask_cudf,
+    # and check the result.
+    df2 = dask_cudf.read_orc(os.path.join(path, "*"))
+    dd.assert_eq(df1, df2, check_index=False)
+
+
 @pytest.mark.parametrize("compute", [True, False])
 @pytest.mark.parametrize("compression", [None, "snappy"])
 @pytest.mark.parametrize(