From 5cdb9d98ddcd35be4dfe9920986464fd521ef6c2 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 23 Mar 2023 17:21:44 -0500
Subject: [PATCH] Pre-emptive fix for upstream `dask.dataframe.read_parquet`
 changes (#12983)

Once https://github.com/dask/dask/pull/10007 is merged, users will be able to pass a dictionary of hive-partitioning options to `dd.read_parquet` (using the `dataset=` kwarg). This new feature provides a workaround for the fact that `pyarrow.dataset.Partitioning` objects **cannot** be serialized in Python. In order for this feature to be supported in `dask_cudf` the `CudfEngine.read_partition` method must account for the case that `partitioning` is a `dict`.

**NOTE**:
It is not possible to add test coverage for this change until dask#10007 is merged. However, I don't see any good reason not to merge this PR **before** dask#10007.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/12983
---
 python/dask_cudf/dask_cudf/io/parquet.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index b03ac256b05..f19c373150d 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -121,6 +121,8 @@ def _read_paths(
                                 if row_groups
                                 else None,
                                 strings_to_categorical=strings_to_categorical,
+                                dataset_kwargs=dataset_kwargs,
+                                categorical_partitions=False,
                                 **kwargs,
                             )
                             for i, pof in enumerate(paths_or_fobs)
@@ -191,6 +193,8 @@ def read_partition(
 
         dataset_kwargs = kwargs.get("dataset", {})
         partitioning = partitioning or dataset_kwargs.get("partitioning", None)
+        if isinstance(partitioning, dict):
+            partitioning = pa_ds.partitioning(**partitioning)
 
         # Check if we are actually selecting any columns
         read_columns = columns