-
Notifications
You must be signed in to change notification settings - Fork 915
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add multi-partition `DataFrameScan` support to cuDF-Polars (#17441)
Follow-up to #17262. Adds support for parallel `DataFrameScan` operations. Authors: Richard (Rick) Zamora (https://github.com/rjzamora). Approvers: Lawrence Mitchell (https://github.com/wence-). URL: #17441
- Loading branch information
Showing
10 changed files
with
411 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
"""Multi-partition base classes.""" | ||
|
||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
from cudf_polars.dsl.ir import Union | ||
|
||
if TYPE_CHECKING: | ||
from collections.abc import Iterator, Sequence | ||
|
||
from cudf_polars.containers import DataFrame | ||
from cudf_polars.dsl.nodebase import Node | ||
|
||
|
||
class PartitionInfo:
    """
    Partitioning information.

    For now, the partition count is the only property that is tracked.
    """

    __slots__ = ("count",)

    def __init__(self, count: int):
        self.count = count

    def keys(self, node: Node) -> Iterator[tuple[str, int]]:
        """Yield one ``(key_name, index)`` pair per partition of ``node``."""
        # The key name is shared by every partition; only the index varies.
        key_name = get_key_name(node)
        for index in range(self.count):
            yield key_name, index
|
||
|
||
def get_key_name(node: Node) -> str:
    """Build the key name of a Node from its lowercased type name and its hash."""
    prefix = type(node).__name__.lower()
    return f"{prefix}-{hash(node)}"
|
||
|
||
def _concat(dfs: Sequence[DataFrame]) -> DataFrame:
    """Concatenate a sequence of DataFrames vertically via ``Union.do_evaluate``."""
    # ``None`` is forwarded as the leading argument of ``Union.do_evaluate``;
    # the frames follow as positional arguments.
    return Union.do_evaluate(None, *dfs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
"""Multi-partition dispatch functions.""" | ||
|
||
from __future__ import annotations | ||
|
||
from functools import singledispatch | ||
from typing import TYPE_CHECKING, Any | ||
|
||
if TYPE_CHECKING: | ||
from collections.abc import MutableMapping | ||
from typing import TypeAlias | ||
|
||
from cudf_polars.dsl.ir import IR | ||
from cudf_polars.experimental.base import PartitionInfo | ||
from cudf_polars.typing import GenericTransformer | ||
|
||
|
||
# Type of the recursive callable passed as ``rec`` to ``lower_ir_node``
# implementations. Kept as a string because the names it refers to are
# only imported under ``TYPE_CHECKING``.
LowerIRTransformer: TypeAlias = (
    "GenericTransformer[IR, tuple[IR, MutableMapping[IR, PartitionInfo]]]"
)
"""Protocol for Lowering IR nodes."""
|
||
|
||
@singledispatch
def lower_ir_node(
    ir: IR, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """
    Rewrite an IR node and extract partitioning information.

    This is the generic ``singledispatch`` fallback; it is reached only
    when no implementation has been registered for ``type(ir)``.

    Parameters
    ----------
    ir
        IR node to rewrite.
    rec
        Recursive LowerIRTransformer callable.

    Returns
    -------
    new_ir, partition_info
        The rewritten node, and a mapping from unique nodes in
        the full IR graph to associated partitioning information.

    Raises
    ------
    AssertionError
        Always, in this fallback implementation.

    Notes
    -----
    This function is used by `lower_ir_graph`.

    See Also
    --------
    lower_ir_graph
    """
    message = f"Unhandled type {type(ir)}"  # pragma: no cover
    raise AssertionError(message)  # pragma: no cover
|
||
|
||
@singledispatch
def generate_ir_tasks(
    ir: IR, partition_info: MutableMapping[IR, PartitionInfo]
) -> MutableMapping[Any, Any]:
    """
    Generate a task graph for evaluation of an IR node.

    This is the generic ``singledispatch`` fallback; it is reached only
    when no implementation has been registered for ``type(ir)``.

    Parameters
    ----------
    ir
        IR node to generate tasks for.
    partition_info
        Partitioning information, obtained from :func:`lower_ir_graph`.

    Returns
    -------
    mapping
        A (partial) dask task graph for the evaluation of an ir node.

    Raises
    ------
    AssertionError
        Always, in this fallback implementation.

    Notes
    -----
    Task generation should only produce the tasks for the current node,
    referring to child tasks by name.

    See Also
    --------
    task_graph
    """
    message = f"Unhandled type {type(ir)}"  # pragma: no cover
    raise AssertionError(message)  # pragma: no cover
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
"""Multi-partition IO Logic.""" | ||
|
||
from __future__ import annotations | ||
|
||
import math | ||
from typing import TYPE_CHECKING | ||
|
||
from cudf_polars.dsl.ir import DataFrameScan, Union | ||
from cudf_polars.experimental.base import PartitionInfo | ||
from cudf_polars.experimental.dispatch import lower_ir_node | ||
|
||
if TYPE_CHECKING: | ||
from collections.abc import MutableMapping | ||
|
||
from cudf_polars.dsl.ir import IR | ||
from cudf_polars.experimental.dispatch import LowerIRTransformer | ||
|
||
|
||
@lower_ir_node.register(DataFrameScan)
def _(
    ir: DataFrameScan, rec: LowerIRTransformer
) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
    """
    Lower a ``DataFrameScan`` node, splitting it into row slices if needed.

    The target partition size is read from
    ``ir.config_options["executor_options"]["max_rows_per_partition"]``
    (default ``1_000_000``). When more than one partition is required,
    the scan is replaced by a ``Union`` over per-slice ``DataFrameScan``
    nodes; otherwise the node is returned unchanged.

    Parameters
    ----------
    ir
        The ``DataFrameScan`` node to lower.
    rec
        Recursive LowerIRTransformer callable (unused here).

    Returns
    -------
    new_ir, partition_info
        The (possibly rewritten) node and a mapping from each produced
        node to its partitioning information. Every slice has a partition
        count of one; the ``Union`` has one partition per slice.

    Raises
    ------
    ValueError
        If ``max_rows_per_partition`` is not strictly positive.
    """
    rows_per_partition = ir.config_options.get("executor_options", {}).get(
        "max_rows_per_partition", 1_000_000
    )
    if rows_per_partition <= 0:
        # Fail with a clear message instead of a ZeroDivisionError (zero)
        # or a silent single-partition fallback (negative values).
        raise ValueError(
            "max_rows_per_partition must be positive, "
            f"got {rows_per_partition!r}"
        )

    # Treat an empty frame as having one row so the arithmetic below
    # always yields at least one partition.
    nrows = max(ir.df.shape()[0], 1)
    count = math.ceil(nrows / rows_per_partition)

    if count > 1:
        # Spread the rows evenly over the partitions.
        length = math.ceil(nrows / count)
        slices = [
            DataFrameScan(
                ir.schema,
                ir.df.slice(offset, length),
                ir.projection,
                ir.predicate,
                ir.config_options,
            )
            for offset in range(0, nrows, length)
        ]
        new_node = Union(ir.schema, None, *slices)
        partition_info: MutableMapping[IR, PartitionInfo] = {
            piece: PartitionInfo(count=1) for piece in slices
        }
        # Use the number of slices actually built: rounding ``length`` up
        # can produce fewer slices than ``count`` when
        # ``max_rows_per_partition`` is not an integer.
        partition_info[new_node] = PartitionInfo(count=len(slices))
        return new_node, partition_info

    return ir, {ir: PartitionInfo(count=1)}
Oops, something went wrong.