rapidsai · rapids-bot · Nov 15, 2024 · Sep 10, 2024 · Sep 16, 2024 · Oct 9, 2024
diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md
@@ -0,0 +1,25 @@
+# GPUEngine Configuration Options
+
+The `polars.GPUEngine` object may be configured in several different ways.
+
+## Parquet Reader Options
+Reading large parquet files incurs significant memory overhead, especially when the files are compressed. This may lead to out-of-memory errors for some workflows. To mitigate this, the "chunked" parquet reader may be selected. When enabled, parquet files are read in chunks with a limit on the amount of memory that is used, at the cost of a small drop in performance.
+
+
+To configure the parquet reader, we provide a dictionary of options to the `parquet_options` keyword of the `GPUEngine` object. Valid keys and values are:
+- `chunked` indicates whether chunked parquet reading is to be used, default True.
+- [`chunk_read_limit`](https://docs.rapids.ai/api/libcudf/legacy/classcudf_1_1io_1_1chunked__parquet__reader#aad118178b7536b7966e3325ae1143a1a) controls the maximum size per chunk, default unlimited.
+- [`pass_read_limit`](https://docs.rapids.ai/api/libcudf/legacy/classcudf_1_1io_1_1chunked__parquet__reader#aad118178b7536b7966e3325ae1143a1a) controls the maximum memory used for decompression, default 16GiB.
+
+For example, to select the chunked reader with custom values for `pass_read_limit` and `chunk_read_limit`:
+```python
+engine = GPUEngine(
+    parquet_options={
+        'chunked': True,
+        'chunk_read_limit': int(1e9),
+        'pass_read_limit': int(4e9)
+    }
+)
+result = query.collect(engine=engine)
+```
+Note that passing `'chunked': False` disables chunked reading entirely, so `chunk_read_limit` and `pass_read_limit` will have no effect.
diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst
@@ -39,3 +39,9 @@ Launch on Google Colab
    :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb
 
    Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab <https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb>`__.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Engine Config Options:
+
+   engine_options
@@ -129,6 +129,7 @@ def set_device(device: int | None) -> Generator[int, None, None]:
 
 def _callback(
     ir: IR,
+    config: GPUEngine,
     with_columns: list[str] | None,
     pyarrow_predicate: str | None,
     n_rows: int | None,
@@ -145,7 +146,7 @@ def _callback(
         set_device(device),
         set_memory_resource(memory_resource),
     ):
-        return ir.evaluate(cache={}).to_polars()
+        return ir.evaluate(cache={}, config=config).to_polars()
 
 
 def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
@@ -174,7 +175,7 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
     device = config.device
     memory_resource = config.memory_resource
     raise_on_fail = config.config.get("raise_on_fail", False)
-    if unsupported := (config.config.keys() - {"raise_on_fail"}):
+    if unsupported := (config.config.keys() - {"raise_on_fail", "parquet_options"}):
         raise ValueError(
             f"Engine configuration contains unsupported settings {unsupported}"
         )
@@ -200,5 +201,11 @@ def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:
                 raise exception
         else:
             nt.set_udf(
-                partial(_callback, ir, device=device, memory_resource=memory_resource)
+                partial(
+                    _callback,
+                    ir,
+                    config,
+                    device=device,
+                    memory_resource=memory_resource,
+                )
             )
@@ -37,6 +37,8 @@
     from collections.abc import Callable, Hashable, MutableMapping, Sequence
     from typing import Literal
 
+    from polars import GPUEngine
+
     from cudf_polars.typing import Schema
 
 
@@ -180,7 +182,9 @@ def get_hashable(self) -> Hashable:
         translation phase should fail earlier.
     """
 
-    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
+    def evaluate(
+        self, *, cache: MutableMapping[int, DataFrame], config: GPUEngine
+    ) -> DataFrame:
         """
         Evaluate the node (recursively) and return a dataframe.
 
@@ -189,6 +193,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         cache
             Mapping from cached node ids to constructed DataFrames.
             Used to implement evaluation of the `Cache` node.
+        config
+            GPU engine configuration.
 
         Notes
         -----
@@ -208,8 +214,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             translation phase should fail earlier.
         """
         return self.do_evaluate(
+            config,
             *self._non_child_args,
-            *(child.evaluate(cache=cache) for child in self.children),
+            *(child.evaluate(cache=cache, config=config) for child in self.children),
         )
 
 
@@ -294,6 +301,9 @@ class Scan(IR):
     predicate: expr.NamedExpr | None
     """Mask to apply to the read dataframe."""
 
+    PARQUET_DEFAULT_CHUNK_SIZE: int = 0  # unlimited
+    PARQUET_DEFAULT_PASS_LIMIT: int = 16 * 1024**3  # 16GiB
+
     def __init__(
         self,
         schema: Schema,
@@ -413,6 +423,7 @@ def get_hashable(self) -> Hashable:
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         schema: Schema,
         typ: str,
         reader_options: dict[str, Any],
@@ -498,25 +509,59 @@ def do_evaluate(
                 colnames[0],
             )
         elif typ == "parquet":
-            filters = None
-            if predicate is not None and row_index is None:
-                # Can't apply filters during read if we have a row index.
-                filters = to_parquet_filter(predicate.value)
-            tbl_w_meta = plc.io.parquet.read_parquet(
-                plc.io.SourceInfo(paths),
-                columns=with_columns,
-                filters=filters,
-                nrows=n_rows,
-                skip_rows=skip_rows,
-            )
-            df = DataFrame.from_table(
-                tbl_w_meta.tbl,
-                # TODO: consider nested column names?
-                tbl_w_meta.column_names(include_children=False),
-            )
-            if filters is not None:
-                # Mask must have been applied.
-                return df
+            parquet_options = config.config.get("parquet_options", {})
+            if parquet_options.get("chunked", False):
+                reader = plc.io.parquet.ChunkedParquetReader(
+                    plc.io.SourceInfo(paths),
+                    columns=with_columns,
+                    nrows=n_rows,
+                    skip_rows=skip_rows,
+                    chunk_read_limit=parquet_options.get(
+                        "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE
+                    ),
+                    pass_read_limit=parquet_options.get(
+                        "pass_read_limit", cls.PARQUET_DEFAULT_PASS_LIMIT
+                    ),
+                )
+                chk = reader.read_chunk()
+                tbl = chk.tbl
+                names = chk.column_names()
+                concatenated_columns = tbl.columns()
+                while reader.has_next():
+                    tbl = reader.read_chunk().tbl
+
+                    for i in range(tbl.num_columns()):
+                        concatenated_columns[i] = plc.concatenate.concatenate(
+                            [concatenated_columns[i], tbl._columns[i]]
+                        )
+                        # Drop residual columns to save memory
+                        tbl._columns[i] = None
+
+                df = DataFrame.from_table(
+                    plc.Table(concatenated_columns),
+                    names=names,
+                )
+            else:
+                filters = None
+                if predicate is not None and row_index is None:
+                    # Can't apply filters during read if we have a row index.
+                    filters = to_parquet_filter(predicate.value)
+                tbl_w_meta = plc.io.parquet.read_parquet(
+                    plc.io.SourceInfo(paths),
+                    columns=with_columns,
+                    filters=filters,
+                    nrows=n_rows,
+                    skip_rows=skip_rows,
+                )
+                df = DataFrame.from_table(
+                    tbl_w_meta.tbl,
+                    # TODO: consider nested column names?
+                    tbl_w_meta.column_names(include_children=False),
+                )
+                if filters is not None:
+                    # Mask must have been applied.
+                    return df
+
         elif typ == "ndjson":
             json_schema: list[plc.io.json.NameAndType] = [
                 (name, typ, []) for name, typ in schema.items()
@@ -591,22 +636,26 @@ def __init__(self, schema: Schema, key: int, value: IR):
 
     @classmethod
     def do_evaluate(
-        cls, key: int, df: DataFrame
+        cls, config: GPUEngine, key: int, df: DataFrame
     ) -> DataFrame:  # pragma: no cover; basic evaluation never calls this
         """Evaluate and return a dataframe."""
         # Our value has already been computed for us, so let's just
         # return it.
         return df
 
-    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
+    def evaluate(
+        self, *, cache: MutableMapping[int, DataFrame], config: GPUEngine
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         # We must override the recursion scheme because we don't want
         # to recurse if we're in the cache.
         try:
             return cache[self.key]
         except KeyError:
             (value,) = self.children
-            return cache.setdefault(self.key, value.evaluate(cache=cache))
+            return cache.setdefault(
+                self.key, value.evaluate(cache=cache, config=config)
+            )
 
 
 class DataFrameScan(IR):
@@ -652,6 +701,7 @@ def get_hashable(self) -> Hashable:
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         schema: Schema,
         df: Any,
         projection: tuple[str, ...] | None,
@@ -699,6 +749,7 @@ def __init__(
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         exprs: tuple[expr.NamedExpr, ...],
         should_broadcast: bool,  # noqa: FBT001
         df: DataFrame,
@@ -733,7 +784,10 @@ def __init__(
 
     @classmethod
     def do_evaluate(
-        cls, exprs: tuple[expr.NamedExpr, ...], df: DataFrame
+        cls,
+        config: GPUEngine,
+        exprs: tuple[expr.NamedExpr, ...],
+        df: DataFrame,
     ) -> DataFrame:  # pragma: no cover; not exposed by polars yet
         """Evaluate and return a dataframe."""
         columns = broadcast(*(e.evaluate(df) for e in exprs))
@@ -824,6 +878,7 @@ def check_agg(agg: expr.Expr) -> int:
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         keys_in: Sequence[expr.NamedExpr],
         agg_requests: Sequence[expr.NamedExpr],
         maintain_order: bool,  # noqa: FBT001
@@ -945,6 +1000,7 @@ def __init__(
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         predicate: plc.expressions.Expression,
         zlice: tuple[int, int] | None,
         suffix: str,
@@ -1117,6 +1173,7 @@ def _reorder_maps(
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         left_on_exprs: Sequence[expr.NamedExpr],
         right_on_exprs: Sequence[expr.NamedExpr],
         options: tuple[
@@ -1240,6 +1297,7 @@ def __init__(
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         exprs: Sequence[expr.NamedExpr],
         should_broadcast: bool,  # noqa: FBT001
         df: DataFrame,
@@ -1302,6 +1360,7 @@ def __init__(
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         keep: plc.stream_compaction.DuplicateKeepOption,
         subset: frozenset[str] | None,
         zlice: tuple[int, int] | None,
@@ -1391,6 +1450,7 @@ def __init__(
     @classmethod
     def do_evaluate(
         cls,
+        config: GPUEngine,
         by: Sequence[expr.NamedExpr],
         order: Sequence[plc.types.Order],
         null_order: Sequence[plc.types.NullOrder],
@@ -1446,7 +1506,9 @@ def __init__(self, schema: Schema, offset: int, length: int, df: IR):
         self.children = (df,)
 
     @classmethod
-    def do_evaluate(cls, offset: int, length: int, df: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls, config: GPUEngine, offset: int, length: int, df: DataFrame
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         return df.slice((offset, length))
 
@@ -1466,7 +1528,9 @@ def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR):
         self.children = (df,)
 
     @classmethod
-    def do_evaluate(cls, mask_expr: expr.NamedExpr, df: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls, config: GPUEngine, mask_expr: expr.NamedExpr, df: DataFrame
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         (mask,) = broadcast(mask_expr.evaluate(df), target_length=df.num_rows)
         return df.filter(mask)
@@ -1484,7 +1548,7 @@ def __init__(self, schema: Schema, df: IR):
         self.children = (df,)
 
     @classmethod
-    def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame:
+    def do_evaluate(cls, config: GPUEngine, schema: Schema, df: DataFrame) -> DataFrame:
         """Evaluate and return a dataframe."""
         # This can reorder things.
         columns = broadcast(
@@ -1560,7 +1624,9 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR):
         self._non_child_args = (name, self.options)
 
     @classmethod
-    def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls, config: GPUEngine, name: str, options: Any, df: DataFrame
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         if name == "rechunk":
             # No-op in our data model
@@ -1639,7 +1705,9 @@ def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR)
             raise NotImplementedError("Schema mismatch")
 
     @classmethod
-    def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame:
+    def do_evaluate(
+        cls, config: GPUEngine, zlice: tuple[int, int] | None, *dfs: DataFrame
+    ) -> DataFrame:
         """Evaluate and return a dataframe."""
         # TODO: only evaluate what we need if we have a slice?
         return DataFrame.from_table(
@@ -1688,7 +1756,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
         )
 
     @classmethod
-    def do_evaluate(cls, *dfs: DataFrame) -> DataFrame:
+    def do_evaluate(cls, config: GPUEngine, *dfs: DataFrame) -> DataFrame:
         """Evaluate and return a dataframe."""
         max_rows = max(df.num_rows for df in dfs)
         # Horizontal concatenation extends shorter tables with nulls