Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce NamedColumn concept in cudf-polars #15914

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/cudf_polars/cudf_polars/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from __future__ import annotations

__all__: list[str] = ["DataFrame", "Column", "Scalar"]
__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"]

from cudf_polars.containers.column import Column
from cudf_polars.containers.column import Column, NamedColumn
from cudf_polars.containers.dataframe import DataFrame
from cudf_polars.containers.scalar import Scalar
78 changes: 61 additions & 17 deletions python/cudf_polars/cudf_polars/containers/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,29 @@
if TYPE_CHECKING:
from typing_extensions import Self

__all__: list[str] = ["Column"]
__all__: list[str] = ["Column", "NamedColumn"]


class Column:
"""A column, a name, and sortedness."""
"""A column with sortedness metadata."""

obj: plc.Column
name: str
is_sorted: plc.types.Sorted
order: plc.types.Order
null_order: plc.types.NullOrder

def __init__(self, column: plc.Column, name: str):
def __init__(
self,
column: plc.Column,
*,
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
order: plc.types.Order = plc.types.Order.ASCENDING,
null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
):
self.obj = column
self.name = name
self.is_sorted = plc.types.Sorted.NO
self.order = plc.types.Order.ASCENDING
self.null_order = plc.types.NullOrder.BEFORE
self.is_sorted = is_sorted
self.order = order
self.null_order = null_order

def sorted_like(self, like: Column, /) -> Self:
"""
Expand Down Expand Up @@ -81,22 +86,20 @@ def set_sorted(
self.null_order = null_order
return self

def copy(self, *, new_name: str | None = None) -> Self:
def copy(self) -> Self:
"""
Return a shallow copy of the column.

Parameters
----------
new_name
Optional new name for the copied column.
A shallow copy of the column.

Returns
-------
New column sharing data with self.
"""
return type(self)(
self.obj, self.name if new_name is None else new_name
).sorted_like(self)
self.obj,
is_sorted=self.is_sorted,
order=self.order,
null_order=self.null_order,
)

def mask_nans(self) -> Self:
"""Return a copy of self with nans masked out."""
Expand All @@ -117,3 +120,44 @@ def nan_count(self) -> int:
plc.DataType(plc.TypeId.INT32),
)
).as_py()


class NamedColumn(Column):
    """
    A column with a name.

    Extends ``Column`` (a pylibcudf column plus sortedness metadata)
    with a ``name`` attribute.
    """

    name: str

    def __init__(
        self,
        column: plc.Column,
        name: str,
        *,
        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
        order: plc.types.Order = plc.types.Order.ASCENDING,
        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
    ) -> None:
        """
        Build a named column from a pylibcudf column.

        Parameters
        ----------
        column
            The underlying pylibcudf column.
        name
            Name to attach to the column.
        is_sorted
            Sortedness flag forwarded to ``Column``; defaults to
            ``Sorted.NO``.
        order
            Sort order forwarded to ``Column``; defaults to
            ``Order.ASCENDING``.
        null_order
            Null ordering forwarded to ``Column``; defaults to
            ``NullOrder.BEFORE``.
        """
        # The base class owns the data and the sortedness metadata; this
        # subclass only adds the name.
        super().__init__(
            column, is_sorted=is_sorted, order=order, null_order=null_order
        )
        self.name = name

    def copy(self, *, new_name: str | None = None) -> Self:
        """
        Return a shallow copy of the column.

        Parameters
        ----------
        new_name
            Optional new name for the copied column. If ``None``, the
            copy keeps the current name.

        Returns
        -------
        New column sharing data with self.
        """
        # Pass the sortedness metadata explicitly so the copy carries the
        # same flags as self; ``self.obj`` is reused, not duplicated.
        return type(self)(
            self.obj,
            self.name if new_name is None else new_name,
            is_sorted=self.is_sorted,
            order=self.order,
            null_order=self.null_order,
        )
51 changes: 23 additions & 28 deletions python/cudf_polars/cudf_polars/containers/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import cudf._lib.pylibcudf as plc

from cudf_polars.containers.column import Column
from cudf_polars.containers.column import NamedColumn

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence, Set
Expand All @@ -21,7 +21,7 @@

import cudf

from cudf_polars.containers.scalar import Scalar
from cudf_polars.containers import Column


__all__: list[str] = ["DataFrame"]
Expand All @@ -30,26 +30,20 @@
class DataFrame:
"""A representation of a dataframe."""

columns: list[Column]
scalars: list[Scalar]
columns: list[NamedColumn]
table: plc.Table | None

def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None:
def __init__(self, columns: Sequence[NamedColumn]) -> None:
self.columns = list(columns)
self._column_map = {c.name: c for c in self.columns}
self.scalars = list(scalars)
if len(scalars) == 0:
self.table = plc.Table([c.obj for c in columns])
else:
self.table = None
self.table = plc.Table([c.obj for c in columns])

def copy(self) -> Self:
"""Return a shallow copy of self."""
return type(self)(self.columns, self.scalars)
return type(self)(self.columns)

def to_polars(self) -> pl.DataFrame:
"""Convert to a polars DataFrame."""
assert len(self.scalars) == 0
return pl.from_arrow(
plc.interop.to_arrow(
self.table,
Expand Down Expand Up @@ -83,8 +77,10 @@ def num_rows(self) -> int:
def from_cudf(cls, df: cudf.DataFrame) -> Self:
"""Create from a cudf dataframe."""
return cls(
[Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()],
[],
[
NamedColumn(c.to_pylibcudf(mode="read"), name)
for name, c in df._data.items()
]
)

@classmethod
Expand All @@ -108,10 +104,12 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
ValueError if the number of provided names does not match the
number of columns in the table.
"""
# TODO: strict=True when we drop py39
if table.num_columns() != len(names):
raise ValueError("Mismatching name and table length.")
return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
return cls(
# TODO: strict=True when we drop py39
[NamedColumn(c, name) for c, name in zip(table.columns(), names)]
)

def sorted_like(
self, like: DataFrame, /, *, subset: Set[str] | None = None
Expand Down Expand Up @@ -139,11 +137,12 @@ def sorted_like(
subset = self.column_names_set if subset is None else subset
self.columns = [
c.sorted_like(other) if c.name in subset else c
# TODO: strict=True when we drop py39
for c, other in zip(self.columns, like.columns)
]
return self

def with_columns(self, columns: Sequence[Column]) -> Self:
def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
"""
Return a new dataframe with extra columns.

Expand All @@ -160,35 +159,31 @@ def with_columns(self, columns: Sequence[Column]) -> Self:
-----
If column names overlap, newer names replace older ones.
"""
return type(self)([*self.columns, *columns], self.scalars)
return type(self)([*self.columns, *columns])

def discard_columns(self, names: Set[str]) -> Self:
"""Drop columns by name."""
return type(self)(
[c for c in self.columns if c.name not in names], self.scalars
)
return type(self)([c for c in self.columns if c.name not in names])

def select(self, names: Sequence[str]) -> Self:
"""Select columns by name returning DataFrame."""
want = set(names)
if not want.issubset(self.column_names_set):
raise ValueError("Can't select missing names")
return type(self)([self._column_map[name] for name in names], self.scalars)
return type(self)([self._column_map[name] for name in names])

def replace_columns(self, *columns: Column) -> Self:
def replace_columns(self, *columns: NamedColumn) -> Self:
"""Return a new dataframe with columns replaced by name."""
new = {c.name: c for c in columns}
if not set(new).issubset(self.column_names_set):
raise ValueError("Cannot replace with non-existing names")
return type(self)([new.get(c.name, c) for c in self.columns], self.scalars)
return type(self)([new.get(c.name, c) for c in self.columns])

def rename_columns(self, mapping: Mapping[str, str]) -> Self:
"""Rename some columns."""
return type(self)(
[c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars
)
return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])

def select_columns(self, names: Set[str]) -> list[Column]:
def select_columns(self, names: Set[str]) -> list[NamedColumn]:
"""Select columns by name."""
return [c for c in self.columns if c.name in names]

Expand Down
Loading
Loading