Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Executor for polars logical plans #15504

Merged
merged 58 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
b3d0e06
Give pylibcudf DataTypes a __hash__
wence- May 16, 2024
22f6a4f
WIP: Translate polars IR to ours
wence- May 8, 2024
8ac4347
Add some container objects
wence- May 8, 2024
4ab983e
WIP: really, fleshing out some evaluation
wence- May 8, 2024
1981a3d
Flesh out more container stuff
wence- May 9, 2024
700f075
WIP: More fleshing out evaluation
wence- May 9, 2024
9c303bc
WIP: More fleshing out
wence- May 9, 2024
688d8ef
WIP: more implementation
wence- May 10, 2024
f56525a
WIP: simplify
wence- May 10, 2024
2cb6f50
WIP: Maybe done with eval of plan nodes
wence- May 10, 2024
c3e0a92
WIP: expression evaluation
wence- May 13, 2024
ec4562c
WIP: some more
wence- May 13, 2024
f21cd57
WIP: some agg expr stuff
wence- May 14, 2024
1f5a490
Bla
wence- May 14, 2024
31a3d5e
More fixes
wence- May 15, 2024
cda34e0
WIP: More working
wence- May 16, 2024
235575d
Expr objects are no longer dataclasses
wence- May 17, 2024
e158de6
No recursive nvtx annotations
wence- May 17, 2024
b400391
Testing infrastructure
wence- May 17, 2024
7f04985
Add basic tests
wence- May 17, 2024
233c1be
All tests passing (or at least xfailing appropriately)
wence- May 17, 2024
3a3ad2d
Handle string functions and boolean functions and add some docs
wence- May 21, 2024
dd6efaa
Flesh out more boolean functions
wence- May 21, 2024
e279a2f
More fixes
wence- May 21, 2024
bdd6ee3
Simplify
wence- May 21, 2024
c06b980
More fixes
wence- May 21, 2024
3b17c71
xfail strict in cudf_polars tests
wence- May 21, 2024
19db751
Overview doc, simplify callback
wence- May 21, 2024
146327c
Docstrings for plan nodes.
wence- May 21, 2024
e81a1e1
ClosedInterval will be a string
wence- May 21, 2024
98281e8
Small fixes from code review
wence- May 22, 2024
3a1ac86
Dedent some assertions
wence- May 22, 2024
f0686a2
More fixes in review
wence- May 23, 2024
8d25f3a
Singledispatch for translation
wence- May 23, 2024
90fca6d
Spell out DSL
wence- May 23, 2024
0f82d0f
Avoid double import
wence- May 23, 2024
f5683e7
Docs fixes
wence- May 23, 2024
b774e0e
Merge remote-tracking branch 'upstream/branch-24.08' into wence/fea/c…
wence- May 24, 2024
34aac9a
Split scan tests out into separate file
wence- May 24, 2024
74e3824
Build out groupby test and fix one bug
wence- May 24, 2024
b77c573
Split out a few more tests
wence- May 24, 2024
4b7dd6e
Move expression tests to subdirectory
wence- May 24, 2024
3aefc56
Migrate agg tests
wence- May 24, 2024
22805a6
Joins and sorts already test elsewhere
wence- May 24, 2024
d8745f6
Better distinct test and fix bug
wence- May 24, 2024
eb6626e
More exhaustive binop tests
wence- May 24, 2024
246ff6a
Migrate basic gather test
wence- May 24, 2024
26c5994
Basic tests now covered elsewhere, or unimplemented functionality
wence- May 24, 2024
00628b0
Update join for new names
wence- May 28, 2024
47df8e2
Dataframe copy
wence- May 29, 2024
2157323
Fix handling of CSE in Select and HStack
wence- May 29, 2024
6d324cb
Adapt to polars-side changes
wence- May 30, 2024
786730a
A few more tests
wence- May 30, 2024
810a8b8
Merge remote-tracking branch 'upstream/branch-24.08' into wence/fea/c…
wence- May 30, 2024
2773b0b
Update for rapids-build-backend
wence- May 30, 2024
62f6455
Rename with_sorted to sorted_like
wence- May 30, 2024
a1f579f
Column.copy takes an optional new_name argument
wence- May 30, 2024
1240b62
Expand docstrings
wence- May 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- polars>=0.20.24
- polars>=0.20.30
run_dask_cudf:
common:
- output_types: [conda, requirements, pyproject]
Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ cdef class DataType:
self.c_obj == (<DataType>other).c_obj
)

def __hash__(self):
    """Return a hash built from the wrapped type's id and scale."""
    # The hash is taken over the (id, scale) pair reported by ``c_obj``.
    key = (self.c_obj.id(), self.c_obj.scale())
    return hash(key)

@staticmethod
cdef DataType from_libcudf(data_type dt):
"""Create a DataType from a libcudf data_type.
Expand Down
56 changes: 56 additions & 0 deletions python/cudf_polars/cudf_polars/callback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Callback for the polars collect function to execute on device."""

from __future__ import annotations

from functools import partial
from typing import TYPE_CHECKING

import nvtx

from cudf_polars.dsl.translate import translate_ir

if TYPE_CHECKING:
import polars as pl

from cudf_polars.dsl.ir import IR

__all__: list[str] = ["execute_with_cudf"]


def _callback(
    ir: IR,
    with_columns: list[str] | None,
    pyarrow_predicate: str | None,
    n_rows: int | None,
) -> pl.DataFrame:
    """
    Evaluate a translated plan and return the result as a polars DataFrame.

    Parameters
    ----------
    ir
        The translated plan to evaluate.
    with_columns
        Must be None.
    pyarrow_predicate
        Must be None.
    n_rows
        Must be None.

    Returns
    -------
    The result of ``ir.evaluate(cache={})`` converted with ``to_polars()``.

    Raises
    ------
    AssertionError
        If any of ``with_columns``, ``pyarrow_predicate`` or ``n_rows``
        is not None.
    """
    # Raise explicitly instead of using ``assert`` statements: asserts are
    # stripped under ``python -O``, which would let unsupported arguments
    # through unnoticed. The exception type is kept as AssertionError so
    # behaviour without -O is unchanged.
    if with_columns is not None:
        raise AssertionError("with_columns must be None")
    if pyarrow_predicate is not None:
        raise AssertionError("pyarrow_predicate must be None")
    if n_rows is not None:
        raise AssertionError("n_rows must be None")
    with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
        # Each evaluation starts from an empty cache.
        return ir.evaluate(cache={}).to_polars()


def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None:
    """
    Post-optimization callback that tries to run the plan with cudf.

    The plan reachable from ``nt`` is translated and, on success, a
    callback that evaluates the translated plan is installed on ``nt``.

    Parameters
    ----------
    nt
        NodeTraverser
    raise_on_fail
        If True, re-raise the ``NotImplementedError`` from a failed
        conversion rather than returning without setting a callback.

    Notes
    -----
    The NodeTraverser is mutated (via ``set_udf``) only if the plan can
    be handled.
    """
    try:
        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
            install = nt.set_udf
            udf = partial(_callback, translate_ir(nt))
            install(udf)
    except NotImplementedError:
        if not raise_on_fail:
            # Leave the traverser without a callback.
            return
        raise
12 changes: 12 additions & 0 deletions python/cudf_polars/cudf_polars/containers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Containers of concrete data."""

from __future__ import annotations

__all__: list[str] = ["DataFrame", "Column", "Scalar"]

from cudf_polars.containers.column import Column
from cudf_polars.containers.dataframe import DataFrame
from cudf_polars.containers.scalar import Scalar
119 changes: 119 additions & 0 deletions python/cudf_polars/cudf_polars/containers/column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""A column, with some properties."""

from __future__ import annotations

import functools
from typing import TYPE_CHECKING

import cudf._lib.pylibcudf as plc

if TYPE_CHECKING:
from typing_extensions import Self

__all__: list[str] = ["Column"]


class Column:
    """A pylibcudf column bundled with a name and sortedness metadata."""

    obj: plc.Column
    name: str
    is_sorted: plc.types.Sorted
    order: plc.types.Order
    null_order: plc.types.NullOrder

    def __init__(self, column: plc.Column, name: str):
        self.obj = column
        self.name = name
        # New columns start out flagged as not sorted; ``order`` and
        # ``null_order`` keep these initial values until ``set_sorted``
        # or ``sorted_like`` changes them.
        self.is_sorted = plc.types.Sorted.NO
        self.order = plc.types.Order.ASCENDING
        self.null_order = plc.types.NullOrder.BEFORE

    def sorted_like(self, like: Column, /) -> Self:
        """
        Adopt the sortedness metadata of another column.

        Parameters
        ----------
        like
            Column whose ``is_sorted``, ``order`` and ``null_order`` are
            copied onto self.

        Returns
        -------
        Self, with the metadata updated in place.

        See Also
        --------
        set_sorted
        """
        return self.set_sorted(
            is_sorted=like.is_sorted,
            order=like.order,
            null_order=like.null_order,
        )

    def set_sorted(
        self,
        *,
        is_sorted: plc.types.Sorted,
        order: plc.types.Order,
        null_order: plc.types.NullOrder,
    ) -> Self:
        """
        Overwrite the sortedness metadata of this column in place.

        Parameters
        ----------
        is_sorted
            Whether the column is sorted.
        order
            Sort order, if sorted.
        null_order
            Placement of nulls, if sorted.

        Returns
        -------
        Self, with the metadata updated.
        """
        self.is_sorted = is_sorted
        self.order = order
        self.null_order = null_order
        return self

    def copy(self, *, new_name: str | None = None) -> Self:
        """
        Make a shallow copy of the column.

        Parameters
        ----------
        new_name
            Name for the copy; when None the current name is reused.

        Returns
        -------
        A new column wrapping the same ``obj`` as self and carrying the
        same sortedness metadata.
        """
        name = new_name if new_name is not None else self.name
        duplicate = type(self)(self.obj, name)
        return duplicate.sorted_like(self)

    def mask_nans(self) -> Self:
        """
        Return a copy of self with NaNs masked out.

        Raises
        ------
        NotImplementedError
            If the column contains any NaN values; only the NaN-free
            case (a plain copy) is handled.
        """
        has_nans = self.nan_count > 0
        if has_nans:
            raise NotImplementedError
        return self.copy()

    @functools.cached_property
    def nan_count(self) -> int:
        """
        Count the NaN values in the column.

        Columns whose type is neither FLOAT32 nor FLOAT64 report 0
        without inspecting the data. The value is computed on first
        access and cached on the instance.
        """
        floating = (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64)
        if self.obj.type().id() in floating:
            total = plc.reduce.reduce(
                plc.unary.is_nan(self.obj),
                plc.aggregation.sum(),
                # TODO: pylibcudf needs to have a SizeType DataType singleton
                plc.DataType(plc.TypeId.INT32),
            )
            return plc.interop.to_arrow(total).as_py()
        return 0
Loading
Loading