diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 5b73b1fef10..9c24e5becfd 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -118,9 +118,17 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     cudf.io.parquet.read_parquet
     cudf.io.parquet.to_parquet
     """
+
+    # Convert NativeFile buffers to NativeFileDatasource,
+    # but save original buffers in case we need to use
+    # pyarrow for metadata processing
+    # (See: https://github.com/rapidsai/cudf/issues/9599)
+    pa_buffers = []
     for i, datasource in enumerate(filepaths_or_buffers):
         if isinstance(datasource, NativeFile):
+            pa_buffers.append(datasource)
             filepaths_or_buffers[i] = NativeFileDatasource(datasource)
+
     cdef cudf_io_types.source_info source = make_source_info(
         filepaths_or_buffers)

@@ -203,7 +211,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
             range_index_meta = index_col[0]
             if row_groups is not None:
                 per_file_metadata = [
-                    pa.parquet.read_metadata(s) for s in filepaths_or_buffers
+                    pa.parquet.read_metadata(s) for s in (
+                        pa_buffers or filepaths_or_buffers
+                    )
                 ]

                 filtered_idx = []
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index 2d7907a43b8..ff551ec74ca 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -272,7 +272,8 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns):
     assert_eq(expect, got)


-def test_read_parquet_filters(s3_base, s3so, pdf):
+@pytest.mark.parametrize("python_file", [True, False])
+def test_read_parquet_filters(s3_base, s3so, pdf, python_file):
     fname = "test_parquet_reader_filters.parquet"
     bname = "parquet"
     buffer = BytesIO()
@@ -284,6 +285,7 @@ def test_read_parquet_filters(s3_base, s3so, pdf):
             "s3://{}/{}".format(bname, fname),
             storage_options=s3so,
             filters=filters,
+            use_python_file_object=python_file,
         )

     # All row-groups should be filtered out
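
Illustrative note (not part of the patch): a minimal sketch of the situation from issue #9599 that the change above addresses. When row-group filters are passed, the reader uses pyarrow to inspect per-file metadata, and pa.parquet.read_metadata cannot consume a NativeFileDatasource; keeping the original NativeFile buffers in pa_buffers preserves a source that pyarrow can read. The DataFrame contents and column name below are made up for the example.

    import io

    import pandas as pd
    import pyarrow as pa

    import cudf

    # Build an in-memory parquet file and wrap it in a pyarrow NativeFile.
    buf = io.BytesIO()
    pd.DataFrame({"Integer": [1, 42, 7]}).to_parquet(buf)
    source = pa.BufferReader(buf.getvalue())

    # Filters trigger the per-file metadata path; with the fix, that path
    # reads from the saved pa_buffers rather than NativeFileDatasource objects.
    df = cudf.read_parquet(source, filters=[("Integer", "==", 42)])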