From 5cdb9d98ddcd35be4dfe9920986464fd521ef6c2 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Thu, 23 Mar 2023 17:21:44 -0500
Subject: [PATCH] Pre-emptive fix for upstream `dask.dataframe.read_parquet` changes (#12983)

Once https://github.com/dask/dask/pull/10007 is merged, users will be able to pass a dictionary of hive-partitioning options to `dd.read_parquet` (using the `dataset=` kwarg). This new feature provides a workaround for the fact that `pyarrow.dataset.Partitioning` objects **cannot** be serialized in Python. In order for this feature to be supported in `dask_cudf` the `CudfEngine.read_partition` method must account for the case that `partitioning` is a `dict`.

**NOTE**: It is not possible to add test coverage for this change until dask#10007 is merged. However, I don't see any good reason not to merge this PR **before** dask#10007.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/12983
---
 python/dask_cudf/dask_cudf/io/parquet.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index b03ac256b05..f19c373150d 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -121,6 +121,8 @@ def _read_paths(
                                 if row_groups
                                 else None,
                                 strings_to_categorical=strings_to_categorical,
+                                dataset_kwargs=dataset_kwargs,
+                                categorical_partitions=False,
                                 **kwargs,
                             )
                             for i, pof in enumerate(paths_or_fobs)
@@ -191,6 +193,8 @@ def read_partition(
 
         dataset_kwargs = kwargs.get("dataset", {})
         partitioning = partitioning or dataset_kwargs.get("partitioning", None)
+        if isinstance(partitioning, dict):
+            partitioning = pa_ds.partitioning(**partitioning)
 
         # Check if we are actually selecting any columns
         read_columns = columns