rapidsai · rapids-bot · Dec 3, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
@@ -217,14 +217,16 @@ def validate_config_options(config: dict) -> None:
         If the configuration contains unsupported options.
     """
     if unsupported := (
-        config.keys() - {"raise_on_fail", "parquet_options", "executor"}
+        config.keys()
+        - {"raise_on_fail", "parquet_options", "parallel_options", "executor"}
     ):
         raise ValueError(
             f"Engine configuration contains unsupported settings: {unsupported}"
         )
     assert {"chunked", "chunk_read_limit", "pass_read_limit"}.issuperset(
         config.get("parquet_options", {})
     )
+    assert {"num_rows_threshold"}.issuperset(config.get("parallel_options", {}))
 
 
 def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None:

@@ -683,27 +683,36 @@ class DataFrameScan(IR):
     This typically arises from ``q.collect().lazy()``
     """
 
-    __slots__ = ("df", "projection", "predicate")
-    _non_child = ("schema", "df", "projection", "predicate")
+    __slots__ = ("df", "projection", "predicate", "config_options")
+    _non_child = ("schema", "df", "projection", "predicate", "config_options")
     df: Any
     """Polars LazyFrame object."""
     projection: tuple[str, ...] | None
     """List of columns to project out."""
     predicate: expr.NamedExpr | None
     """Mask to apply."""
+    config_options: dict[str, Any]
+    """GPU-specific configuration options"""
 
     def __init__(
         self,
         schema: Schema,
         df: Any,
         projection: Sequence[str] | None,
         predicate: expr.NamedExpr | None,
+        config_options: dict[str, Any],
     ):
         self.schema = schema
         self.df = df
         self.projection = tuple(projection) if projection is not None else None
         self.predicate = predicate
-        self._non_child_args = (schema, df, self.projection, predicate)
+        self.config_options = config_options
+        self._non_child_args = (
+            schema,
+            pl.DataFrame._from_pydf(df),
+            self.projection,
+            predicate,
+        )
         self.children = ()
 
     def get_hashable(self) -> Hashable:
@@ -714,7 +723,14 @@ def get_hashable(self) -> Hashable:
         not stable across runs, or repeat instances of the same equal dataframes.
         """
         schema_hash = tuple(self.schema.items())
-        return (type(self), schema_hash, id(self.df), self.projection, self.predicate)
+        return (
+            type(self),
+            schema_hash,
+            id(self.df),
+            self.projection,
+            self.predicate,
+            json.dumps(self.config_options),
+        )
 
     @classmethod
     def do_evaluate(
@@ -725,10 +741,9 @@ def do_evaluate(
         predicate: expr.NamedExpr | None,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
-        pdf = pl.DataFrame._from_pydf(df)
         if projection is not None:
-            pdf = pdf.select(projection)
-        df = DataFrame.from_polars(pdf)
+            df = df.select(projection)
+        df = DataFrame.from_polars(df)
         assert all(
             c.obj.type() == dtype
             for c, dtype in zip(df.columns, schema.values(), strict=True)

@@ -263,6 +263,7 @@ def _(
         translate_named_expr(translator, n=node.selection)
         if node.selection is not None
         else None,
+        translator.config.config.copy(),
     )
 
 

@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Parallel IO Logic."""
+
+from __future__ import annotations
+
+import math
+from functools import cached_property
+from typing import TYPE_CHECKING, Any
+
+import polars as pl
+
+from cudf_polars.dsl.ir import DataFrameScan
+from cudf_polars.experimental.parallel import (
+    PartitionInfo,
+    generate_ir_tasks,
+    get_key_name,
+    lower_ir_node,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.parallel import LowerIRTransformer
+
+
+##
+## DataFrameScan
+##
+
+
+class ParDataFrameScan(DataFrameScan):
+    """
+    Parallel DataFrameScan.
+
+    The frame is cut into consecutive row slices, one task per slice.
+    The slice size is bounded by ``parallel_options["num_rows_threshold"]``
+    from ``config_options`` (1,000,000 rows when unset).
+    """
+
+    @property
+    def _max_n_rows(self) -> int:
+        """
+        Row-count threshold for splitting a DataFrame.
+
+        Raises
+        ------
+        ValueError
+            If the configured ``num_rows_threshold`` is not positive.
+        """
+        parallel_options = self.config_options.get("parallel_options", {})
+        threshold = parallel_options.get("num_rows_threshold", 1_000_000)
+        if threshold <= 0:
+            # Zero would divide by zero in ``_count``; a negative value
+            # would produce a non-positive partition count.
+            raise ValueError(
+                f"num_rows_threshold must be positive, got {threshold!r}"
+            )
+        return threshold
+
+    @property
+    def _total_rows(self) -> int:
+        """Row count used for partitioning, clamped to at least one."""
+        # The clamp gives an empty frame a single (empty) partition.
+        return max(self.df.shape()[0], 1)
+
+    @cached_property
+    def _count(self) -> int:
+        """Partition count."""
+        return math.ceil(self._total_rows / self._max_n_rows)
+
+    def _tasks(
+        self, partition_info: MutableMapping[IR, PartitionInfo]
+    ) -> MutableMapping[Any, Any]:
+        """
+        Task graph.
+
+        Parameters
+        ----------
+        partition_info
+            Partitioning information keyed by IR node; the entry for this
+            node must agree with ``_count``.
+
+        Returns
+        -------
+        Mapping from ``(key_name, i)`` to a task tuple that evaluates the
+        ``i``-th row slice with ``do_evaluate``.
+        """
+        assert (
+            partition_info[self].count == self._count
+        ), "Inconsistent ParDataFrameScan partitioning."
+        total_rows = self._total_rows
+        # Even out slice sizes across the partitions.
+        stride = math.ceil(total_rows / self._count)
+        key_name = get_key_name(self)
+        return {
+            (key_name, i): (
+                self.do_evaluate,
+                self.schema,
+                pl.DataFrame._from_pydf(self.df.slice(offset, stride)),
+                self.projection,
+                self.predicate,
+            )
+            for i, offset in enumerate(range(0, total_rows, stride))
+        }
+
+
+@lower_ir_node.register(ParDataFrameScan)
+def _(
+    ir: ParDataFrameScan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Avoid reconstruction if we need to re-lower
+    return ir, {ir: PartitionInfo(count=ir._count)}  # pragma: no cover
+
+
+@generate_ir_tasks.register(ParDataFrameScan)
+def _(
+    ir: ParDataFrameScan, partition_info: MutableMapping[IR, PartitionInfo]
+) -> MutableMapping[Any, Any]:
+    # Delegate to the node's own task-graph construction.
+    return ir._tasks(partition_info)
@@ -8,11 +8,11 @@
 from functools import reduce, singledispatch
 from typing import TYPE_CHECKING, Any
 
-from cudf_polars.dsl.ir import IR
+from cudf_polars.dsl.ir import IR, DataFrameScan, Union
 from cudf_polars.dsl.traversal import traversal
 
 if TYPE_CHECKING:
-    from collections.abc import MutableMapping
+    from collections.abc import MutableMapping, Sequence
     from typing import TypeAlias
 
     from cudf_polars.containers import DataFrame
@@ -79,7 +79,9 @@ def lower_ir_node(
 def _(ir: IR, rec: LowerIRTransformer) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
     if len(ir.children) == 0:
         # Default leaf node has single partition
-        return ir, {ir: PartitionInfo(count=1)}
+        return ir, {
+            ir: PartitionInfo(count=1)
+        }  # pragma: no cover; Missed by pylibcudf executor
 
     # Lower children
     children, _partition_info = zip(*(rec(c) for c in ir.children), strict=False)
@@ -223,7 +225,14 @@ def task_graph(
         operator.or_,
         (generate_ir_tasks(node, partition_info) for node in traversal(ir)),
     )
-    return graph, (get_key_name(ir), 0)
+
+    key_name = get_key_name(ir)
+    partition_count = partition_info[ir].count
+    if partition_count > 1:
+        graph[key_name] = (_concat, [(key_name, i) for i in range(partition_count)])
+        return graph, key_name
+    else:
+        return graph, (key_name, 0)
 
 
 def evaluate_dask(ir: IR) -> DataFrame:
@@ -234,3 +243,33 @@ def evaluate_dask(ir: IR) -> DataFrame:
 
     graph, key = task_graph(ir, partition_info)
     return get(graph, key)
+
+
+def _concat(dfs: Sequence[DataFrame]) -> DataFrame:
+    """Stack a sequence of DataFrames vertically into one DataFrame."""
+    return Union.do_evaluate(None, *dfs)
+
+
+##
+## DataFrameScan
+##
+
+
+@lower_ir_node.register(DataFrameScan)
+def _(
+    ir: DataFrameScan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Deferred import, presumably to avoid a circular import with the io module.
+    from cudf_polars.experimental.io import ParDataFrameScan
+
+    # Rebuild the scan as its parallel counterpart with identical arguments.
+    scan_args = (
+        ir.schema,
+        ir.df,
+        ir.projection,
+        ir.predicate,
+        ir.config_options,
+    )
+    par_scan = ParDataFrameScan(*scan_args)
+    partitioning = {par_scan: PartitionInfo(count=par_scan._count)}
+    return par_scan, partitioning
@@ -116,7 +116,11 @@ def test_rewrite_ir_node():
     def replace_df(node, rec):
         if isinstance(node, ir.DataFrameScan):
             return ir.DataFrameScan(
-                node.schema, new_df._df, node.projection, node.predicate
+                node.schema,
+                new_df._df,
+                node.projection,
+                node.predicate,
+                node.config_options,
             )
         return reuse_if_unchanged(node, rec)
 
@@ -144,7 +148,11 @@ def test_rewrite_scan_node(tmp_path):
     def replace_scan(node, rec):
         if isinstance(node, ir.Scan):
             return ir.DataFrameScan(
-                node.schema, right._df, node.with_columns, node.predicate
+                node.schema,
+                right._df,
+                node.with_columns,
+                node.predicate,
+                node.config_options,
             )
         return reuse_if_unchanged(node, rec)
 

@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import Translator
+from cudf_polars.experimental.parallel import lower_ir_graph
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(scope="module")
+def df():
+    """LazyFrame with 30,000 rows of integer, string and float columns."""
+    n_rows = 30_000
+    return pl.LazyFrame(
+        {
+            "x": range(n_rows),
+            "y": ["cat", "dog", "fish"] * (n_rows // 3),
+            "z": [1.0, 2.0, 3.0, 4.0, 5.0] * (n_rows // 5),
+        }
+    )
+
+
+@pytest.mark.parametrize("num_rows_threshold", [1_000, 1_000_000])
+def test_parallel_dataframescan(df, num_rows_threshold):
+    """Check GPU results and that partitioning follows the row threshold."""
+    engine = pl.GPUEngine(
+        raise_on_fail=True,
+        parallel_options={"num_rows_threshold": num_rows_threshold},
+        executor="dask-experimental",
+    )
+    assert_gpu_result_equal(df, engine=engine)
+
+    # Check partitioning
+    translated = Translator(df._ldf.visit(), engine).translate_ir()
+    lowered, partition_info = lower_ir_graph(translated)
+    n_partitions = partition_info[lowered].count
+    should_split = num_rows_threshold < len(df.collect())
+    assert (n_partitions > 1) if should_split else (n_partitions == 1)