From 959b392bf7f2c98ee1cdb317dcadc8f89fea512e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 4 Nov 2021 09:52:14 -0700 Subject: [PATCH] add 9599 fix into standalone branch --- python/cudf/cudf/_lib/parquet.pyx | 12 +++++++++++- python/cudf/cudf/tests/test_s3.py | 4 +++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 5b73b1fef10..9c24e5becfd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -118,9 +118,17 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ + + # Convert NativeFile buffers to NativeFileDatasource, + # but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + pa_buffers = [] for i, datasource in enumerate(filepaths_or_buffers): if isinstance(datasource, NativeFile): + pa_buffers.append(datasource) filepaths_or_buffers[i] = NativeFileDatasource(datasource) + cdef cudf_io_types.source_info source = make_source_info( filepaths_or_buffers) @@ -203,7 +211,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, range_index_meta = index_col[0] if row_groups is not None: per_file_metadata = [ - pa.parquet.read_metadata(s) for s in filepaths_or_buffers + pa.parquet.read_metadata(s) for s in ( + pa_buffers or filepaths_or_buffers + ) ] filtered_idx = [] diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 2d7907a43b8..ff551ec74ca 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -272,7 +272,8 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): assert_eq(expect, got) -def test_read_parquet_filters(s3_base, s3so, pdf): +@pytest.mark.parametrize("python_file", [True, False]) +def test_read_parquet_filters(s3_base, s3so, pdf, python_file): fname = "test_parquet_reader_filters.parquet" bname = "parquet" buffer = BytesIO() @@ -284,6 +285,7 @@ def test_read_parquet_filters(s3_base, s3so, pdf): "s3://{}/{}".format(bname, fname), storage_options=s3so, filters=filters, + use_python_file_object=python_file, ) # All row-groups should be filtered out