From f7b159ab549b49f78ada634b225d1fb2e9ad92f0 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 12:07:35 +0100 Subject: [PATCH 01/16] make 'arrange' kwarg required for window functions --- generate_col_ops.py | 5 +++-- src/pydiverse/transform/_internal/ops/op.py | 11 ++++++++-- .../_internal/ops/ops/aggregation.py | 7 +++++-- .../transform/_internal/ops/ops/window.py | 21 ++++++++++++++++--- .../transform/_internal/pipe/functions.py | 16 ++++++++++---- .../transform/_internal/tree/col_expr.py | 2 +- .../transform/_internal/tree/types.py | 9 -------- tests/test_core.py | 19 ++++++++++++----- 8 files changed, 62 insertions(+), 28 deletions(-) diff --git a/generate_col_ops.py b/generate_col_ops.py index cfaa8f4..d5ddb13 100644 --- a/generate_col_ops.py +++ b/generate_col_ops.py @@ -78,7 +78,8 @@ def generate_fn_decl( } annotated_kwargs = "".join( - f", {kwarg}: {context_kwarg_annotation[kwarg]} | None = None" + f", {kwarg.name}: {context_kwarg_annotation[kwarg.name]}" + + f"{'' if kwarg.required else ' | None = None'}" for kwarg in op.context_kwargs ) @@ -116,7 +117,7 @@ def generate_fn_body( args = add_vararg_star(args) if op.context_kwargs is not None: - kwargs = "".join(f", {kwarg}={kwarg}" for kwarg in op.context_kwargs) + kwargs = "".join(f", {kwarg.name}={kwarg.name}" for kwarg in op.context_kwargs) else: kwargs = "" diff --git a/src/pydiverse/transform/_internal/ops/op.py b/src/pydiverse/transform/_internal/ops/op.py index 0030903..2013835 100644 --- a/src/pydiverse/transform/_internal/ops/op.py +++ b/src/pydiverse/transform/_internal/ops/op.py @@ -1,5 +1,6 @@ from __future__ import annotations +import dataclasses import enum from collections.abc import Sequence from typing import Any @@ -14,6 +15,12 @@ class Ftype(enum.IntEnum): WINDOW = 3 +@dataclasses.dataclass(slots=True) +class ContextKwarg: + name: str + required: bool + + class Operator: __slots__ = ( "name", @@ -31,7 +38,7 @@ class Operator: trie: SignatureTrie signatures: list[Signature] ftype: Ftype - context_kwargs: list[str] + context_kwargs: list[ContextKwarg] param_names: list[str] default_values: list[str] | None generate_expr_method: bool @@ -42,7 +49,7 @@ def __init__( name: str, *signatures: Signature, ftype: Ftype = Ftype.ELEMENT_WISE, - context_kwargs: list[str] | None = None, + context_kwargs: list[ContextKwarg] | None = None, param_names: list[str] | None = None, default_values: list[Any] | None = None, generate_expr_method: bool = True, diff --git a/src/pydiverse/transform/_internal/ops/ops/aggregation.py b/src/pydiverse/transform/_internal/ops/ops/aggregation.py index c7b53c0..4688aee 100644 --- a/src/pydiverse/transform/_internal/ops/ops/aggregation.py +++ b/src/pydiverse/transform/_internal/ops/ops/aggregation.py @@ -1,6 +1,6 @@ from __future__ import annotations -from pydiverse.transform._internal.ops.op import Ftype, Operator +from pydiverse.transform._internal.ops.op import ContextKwarg, Ftype, Operator from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import ( COMPARABLE, @@ -25,7 +25,10 @@ def __init__( name, *signatures, ftype=Ftype.AGGREGATE, - context_kwargs=["partition_by", "filter"], + context_kwargs=[ + ContextKwarg("partition_by", False), + ContextKwarg("filter", False), + ], generate_expr_method=generate_expr_method, doc=doc, ) diff --git a/src/pydiverse/transform/_internal/ops/ops/window.py b/src/pydiverse/transform/_internal/ops/ops/window.py index cff39e6..1479f4f 100644 --- a/src/pydiverse/transform/_internal/ops/ops/window.py +++ b/src/pydiverse/transform/_internal/ops/ops/window.py @@ -2,7 +2,7 @@ from typing import Any -from pydiverse.transform._internal.ops.op import Ftype, Operator +from pydiverse.transform._internal.ops.op import ContextKwarg, Ftype, Operator from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import D, Int, Tvar @@ -21,7 +21,10 @@ def __init__( name, *signatures, ftype=Ftype.WINDOW, - context_kwargs=["partition_by", "arrange"], + context_kwargs=[ + ContextKwarg("partition_by", False), + ContextKwarg("arrange", True), + ], param_names=param_names, default_values=default_values, generate_expr_method=generate_expr_method, @@ -39,6 +42,18 @@ def __init__( row_number = Window("row_number", Signature(return_type=Int())) -rank = Window("rank", Signature(return_type=Int())) +rank = Window( + "rank", + Signature(return_type=Int()), + doc=""" +The number of strictly smaller elements in the column, plus one. + +This is the same as `rank("min")` in polars. + +Examples +-------- +t = pdt.Table({"}) +""", +) dense_rank = Window("dense_rank", Signature(return_type=Int())) diff --git a/src/pydiverse/transform/_internal/pipe/functions.py b/src/pydiverse/transform/_internal/pipe/functions.py index 2e2428b..c2fb423 100644 --- a/src/pydiverse/transform/_internal/pipe/functions.py +++ b/src/pydiverse/transform/_internal/pipe/functions.py @@ -99,7 +99,7 @@ def count( def dense_rank( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: """""" @@ -169,9 +169,17 @@ def min(arg: ColExpr, *args: ColExpr) -> ColExpr: def rank( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: - """""" + """ + The number of strictly smaller elements in the column, plus one. + + This is the same as `rank("min")` in polars. + + Examples + -------- + t = pdt.Table({"}) + """ return ColFn(ops.rank, partition_by=partition_by, arrange=arrange) @@ -179,7 +187,7 @@ def rank( def row_number( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: """""" diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 226096d..6b0baf3 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -733,7 +733,7 @@ def shift( fill_value: ColExpr = None, *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr] | None = None, + arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr: """""" diff --git a/src/pydiverse/transform/_internal/tree/types.py b/src/pydiverse/transform/_internal/tree/types.py index 7f6d6d7..b9466d1 100644 --- a/src/pydiverse/transform/_internal/tree/types.py +++ b/src/pydiverse/transform/_internal/tree/types.py @@ -2,7 +2,6 @@ import datetime import inspect -import warnings class Dtype: @@ -16,14 +15,6 @@ def __eq__(self, rhs: Dtype | type[Dtype] | None) -> bool: return False if inspect.isclass(rhs) and issubclass(rhs, Dtype): rhs = rhs() - if is_supertype(rhs.without_const()) and not is_subtype(rhs.without_const()): - warnings.warn( - f"comparing to the supertype `{rhs}` for equality may yield unexpected " - "results.\n" - "hint: `==` is very strict, e.g. Int16() == Int() is False. Use the " - "`<=` operator to also match on subtypes (so Int16() <= Int() would " - "return True)." - ) if isinstance(rhs, Dtype): return self.const == rhs.const and type(self) is type(rhs) diff --git a/tests/test_core.py b/tests/test_core.py index f7b4d52..42abba3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2,20 +2,21 @@ import polars as pl import pytest +from polars.testing import assert_series_equal -from pydiverse.transform import C, Table -from pydiverse.transform._internal.pipe.pipeable import Pipeable, inverse_partial, verb -from pydiverse.transform._internal.pipe.verbs import mutate, select +import pydiverse.transform as pdt +from pydiverse.transform._internal.pipe.pipeable import Pipeable, inverse_partial +from pydiverse.transform.common import * @pytest.fixture def tbl1(): - return Table(pl.DataFrame({"col1": [0.1], "col2": [3.14]})) + return pdt.Table(pl.DataFrame({"col1": [0.1], "col2": [3.14]})) @pytest.fixture def tbl2(): - return Table(pl.DataFrame({"col1": [1], "col2": [2], "col3": [3]})) + return pdt.Table(pl.DataFrame({"col1": [1], "col2": [2], "col3": [3]})) class TestTable: @@ -59,6 +60,14 @@ def test_contains(self, tbl1, tbl2): assert all(col in tbl1 for col in tbl1) assert all(col in tbl2 for col in tbl2) + def test_dict_constructor(self): + t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) + assert_series_equal( + pdt.rank(arrange=t.a) >> export(Polars()), + pl.Series([3, 1, 4, 1, 6, 7, 4]), + check_names=False, + ) + class TestDispatchers: def test_inverse_partial(self): From bd900c3c861efe0fae8f6a6f784fc1a07c86f26f Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 12:25:56 +0100 Subject: [PATCH 02/16] document rank --- .../transform/_internal/backend/table_impl.py | 2 +- .../transform/_internal/ops/ops/window.py | 19 +++++++++++++++++-- .../transform/_internal/pipe/functions.py | 19 +++++++++++++++++-- .../transform/_internal/pipe/verbs.py | 2 +- 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/pydiverse/transform/_internal/backend/table_impl.py b/src/pydiverse/transform/_internal/backend/table_impl.py index fc5cdc9..8614ff0 100644 --- a/src/pydiverse/transform/_internal/backend/table_impl.py +++ b/src/pydiverse/transform/_internal/backend/table_impl.py @@ -75,7 +75,7 @@ def from_resource( if hasattr(resource, "name"): name = resource.name else: - name = "?" + name = "" if backend is None or isinstance(backend, Polars): from pydiverse.transform._internal.backend.polars import PolarsImpl diff --git a/src/pydiverse/transform/_internal/ops/ops/window.py b/src/pydiverse/transform/_internal/ops/ops/window.py index 1479f4f..248332d 100644 --- a/src/pydiverse/transform/_internal/ops/ops/window.py +++ b/src/pydiverse/transform/_internal/ops/ops/window.py @@ -46,13 +46,28 @@ def __init__( "rank", Signature(return_type=Int()), doc=""" -The number of strictly smaller elements in the column, plus one. +The number of strictly smaller elements in the column plus one. This is the same as `rank("min")` in polars. Examples -------- -t = pdt.Table({"}) +>>> t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) +>>> t >> mutate(b=pdt.rank(arrange=t.a)) >> export(Polars(lazy=False)) +shape: (7, 2) +┌─────┬─────┐ +│ a ┆ b │ +│ --- ┆ --- │ +│ i64 ┆ i64 │ +╞═════╪═════╡ +│ 3 ┆ 3 │ +│ 1 ┆ 1 │ +│ 4 ┆ 4 │ +│ 1 ┆ 1 │ +│ 5 ┆ 6 │ +│ 9 ┆ 7 │ +│ 4 ┆ 4 │ +└─────┴─────┘ """, ) diff --git a/src/pydiverse/transform/_internal/pipe/functions.py b/src/pydiverse/transform/_internal/pipe/functions.py index c2fb423..a86fa49 100644 --- a/src/pydiverse/transform/_internal/pipe/functions.py +++ b/src/pydiverse/transform/_internal/pipe/functions.py @@ -172,13 +172,28 @@ def rank( arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: """ - The number of strictly smaller elements in the column, plus one. + The number of strictly smaller elements in the column plus one. This is the same as `rank("min")` in polars. Examples -------- - t = pdt.Table({"}) + >>> t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) + >>> t >> mutate(b=pdt.rank(arrange=t.a)) >> export(Polars(lazy=False)) + shape: (7, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 3 ┆ 3 │ + │ 1 ┆ 1 │ + │ 4 ┆ 4 │ + │ 1 ┆ 1 │ + │ 5 ┆ 6 │ + │ 9 ┆ 7 │ + │ 4 ┆ 4 │ + └─────┴─────┘ """ return ColFn(ops.rank, partition_by=partition_by, arrange=arrange) diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index cb19317..f6498f2 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -219,7 +219,7 @@ def export( if isinstance(target, Polars): if isinstance(df, pl.LazyFrame): df = df.collect() - return df.get_column(str(uid)) + return df.get_column(str(uid)).rename("") elif isinstance(target, Pandas): assert isinstance(df, pd.DataFrame) return pd.Series(df[str(uid)]) From a289792b75977ff0670cb8bd52cb503486d2645c Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 14:03:53 +0100 Subject: [PATCH 03/16] document markers --- .../transform/_internal/ops/ops/markers.py | 56 +++++++++++++++---- .../transform/_internal/tree/col_expr.py | 36 ++++++++++-- 2 files changed, 78 insertions(+), 14 deletions(-) diff --git a/src/pydiverse/transform/_internal/ops/ops/markers.py b/src/pydiverse/transform/_internal/ops/ops/markers.py index 3a94478..17dfb37 100644 --- a/src/pydiverse/transform/_internal/ops/ops/markers.py +++ b/src/pydiverse/transform/_internal/ops/ops/markers.py @@ -7,13 +7,49 @@ class Marker(Operator): def __init__(self, name: str, doc: str = ""): - super().__init__(name, Signature(D, return_type=D), doc=doc) - - -nulls_first = Marker("nulls_first") - -nulls_last = Marker("nulls_last") - -ascending = Marker("ascending") - -descending = Marker("descending") + super().__init__( + name, + Signature(D, return_type=D), + doc=doc + + """ +Can only be used in expressions given to the `arrange` verb or as as an +`arrange` keyword argument. +""", + ) + + +nulls_first = Marker( + "nulls_first", + doc=""" +Specifies that nulls are placed at the beginning of the ordering. + +This does not mean that nulls are considered to be `less` than any other +element. I.e. if both `nulls_first` and `descending` are given, nulls will still +be placed at the beginning. +""", +) + +nulls_last = Marker( + "nulls_last", + doc=""" +Specifies that nulls are placed at the end of the ordering. + +This does not mean that nulls are considered to be `greater` than any other +element. I.e. if both `nulls_last` and `descending` are given, nulls will still +be placed at the end. +""", +) + +ascending = Marker( + "ascending", + doc=""" +The default ordering. +""", +) + +descending = Marker( + "descending", + doc=""" +Reverses the default ordering. +""", +) diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 6b0baf3..59cc3c6 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -217,7 +217,12 @@ def any( return ColFn(ops.any, self, partition_by=partition_by, filter=filter) def ascending(self: ColExpr) -> ColExpr: - """""" + """ + The default ordering. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.ascending, self) @@ -280,7 +285,12 @@ def count( return ColFn(ops.count, self, partition_by=partition_by, filter=filter) def descending(self: ColExpr) -> ColExpr: - """""" + """ + Reverses the default ordering. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.descending, self) @@ -662,12 +672,30 @@ def __ne__(self: ColExpr, rhs: ColExpr) -> ColExpr[Bool]: return ColFn(ops.not_equal, self, rhs) def nulls_first(self: ColExpr) -> ColExpr: - """""" + """ + Specifies that nulls are placed at the beginning of the ordering. + + This does not mean that nulls are considered to be `less` than any other + element. I.e. if both `nulls_first` and `descending` are given, nulls will still + be placed at the beginning. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.nulls_first, self) def nulls_last(self: ColExpr) -> ColExpr: - """""" + """ + Specifies that nulls are placed at the end of the ordering. + + This does not mean that nulls are considered to be `greater` than any other + element. I.e. if both `nulls_last` and `descending` are given, nulls will still + be placed at the end. + + Can only be used in expressions given to the `arrange` verb or as as an + `arrange` keyword argument. + """ return ColFn(ops.nulls_last, self) From 7beee074a40e93c63a5556203b3741563fa9330c Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 14:14:13 +0100 Subject: [PATCH 04/16] restructure column operations --- .../reference/operators/conditional_logic.rst | 13 +++++++++++++ docs/source/reference/operators/index.rst | 19 ++----------------- .../reference/operators/type_conversion.rst | 13 +++++++++++++ .../transform/_internal/ops/ops/markers.py | 6 ++++++ .../transform/_internal/tree/col_expr.py | 6 ++++++ 5 files changed, 40 insertions(+), 17 deletions(-) create mode 100644 docs/source/reference/operators/conditional_logic.rst create mode 100644 docs/source/reference/operators/type_conversion.rst diff --git a/docs/source/reference/operators/conditional_logic.rst b/docs/source/reference/operators/conditional_logic.rst new file mode 100644 index 0000000..2f781f7 --- /dev/null +++ b/docs/source/reference/operators/conditional_logic.rst @@ -0,0 +1,13 @@ +================= +Conditional Logic +================= + +.. currentmodule:: pydiverse.transform + +.. autosummary:: + :toctree: _generated/ + :template: autosummary/short_title.rst + :nosignatures: + + when + ColExpr.map diff --git a/docs/source/reference/operators/index.rst b/docs/source/reference/operators/index.rst index 427411c..7b39257 100644 --- a/docs/source/reference/operators/index.rst +++ b/docs/source/reference/operators/index.rst @@ -16,26 +16,11 @@ Column Operations window sorting_markers horizontal_aggregation - + conditional_logic + type_conversion .. currentmodule:: pydiverse.transform .. autoclass:: ColExpr :no-index: :members: dtype - -.. autosummary:: - :toctree: _generated/ - :template: autosummary/short_title.rst - :nosignatures: - - lit - when - -.. autosummary:: - :toctree: _generated/ - :template: autosummary/short_title.rst - :nosignatures: - - ColExpr.cast - ColExpr.map diff --git a/docs/source/reference/operators/type_conversion.rst b/docs/source/reference/operators/type_conversion.rst new file mode 100644 index 0000000..0d18bfc --- /dev/null +++ b/docs/source/reference/operators/type_conversion.rst @@ -0,0 +1,13 @@ +=============== +Type Conversion +=============== + +.. currentmodule:: pydiverse.transform + +.. autosummary:: + :toctree: _generated/ + :template: autosummary/short_title.rst + :nosignatures: + + lit + ColExpr.cast diff --git a/src/pydiverse/transform/_internal/ops/ops/markers.py b/src/pydiverse/transform/_internal/ops/ops/markers.py index 17dfb37..d6dbb82 100644 --- a/src/pydiverse/transform/_internal/ops/ops/markers.py +++ b/src/pydiverse/transform/_internal/ops/ops/markers.py @@ -26,6 +26,9 @@ def __init__(self, name: str, doc: str = ""): This does not mean that nulls are considered to be `less` than any other element. I.e. if both `nulls_first` and `descending` are given, nulls will still be placed at the beginning. + +If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is +backend-dependent. """, ) @@ -37,6 +40,9 @@ def __init__(self, name: str, doc: str = ""): This does not mean that nulls are considered to be `greater` than any other element. I.e. if both `nulls_last` and `descending` are given, nulls will still be placed at the end. + +If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is +backend-dependent. """, ) diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 59cc3c6..368ca97 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -679,6 +679,9 @@ def nulls_first(self: ColExpr) -> ColExpr: element. I.e. if both `nulls_first` and `descending` are given, nulls will still be placed at the beginning. + If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is + backend-dependent. + Can only be used in expressions given to the `arrange` verb or as as an `arrange` keyword argument. """ @@ -693,6 +696,9 @@ def nulls_last(self: ColExpr) -> ColExpr: element. I.e. if both `nulls_last` and `descending` are given, nulls will still be placed at the end. + If neither `nulls_first` nor `nulls_last` is specified, the position of nulls is + backend-dependent. + Can only be used in expressions given to the `arrange` verb or as as an `arrange` keyword argument. """ From f37c348fc2d257836130462a11c80955042b84dd Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 17:53:16 +0100 Subject: [PATCH 05/16] document join --- .../reference/operators/conditional_logic.rst | 1 + .../transform/_internal/backend/targets.py | 2 +- .../transform/_internal/pipe/verbs.py | 101 ++++++++++++++---- 3 files changed, 85 insertions(+), 19 deletions(-) diff --git a/docs/source/reference/operators/conditional_logic.rst b/docs/source/reference/operators/conditional_logic.rst index 2f781f7..86dbdf2 100644 --- a/docs/source/reference/operators/conditional_logic.rst +++ b/docs/source/reference/operators/conditional_logic.rst @@ -10,4 +10,5 @@ Conditional Logic :nosignatures: when + coalesce ColExpr.map diff --git a/src/pydiverse/transform/_internal/backend/targets.py b/src/pydiverse/transform/_internal/backend/targets.py index 7e1db3e..bf66519 100644 --- a/src/pydiverse/transform/_internal/backend/targets.py +++ b/src/pydiverse/transform/_internal/backend/targets.py @@ -11,7 +11,7 @@ class Target: ... class Polars(Target): - def __init__(self, *, lazy: bool = True) -> None: + def __init__(self, *, lazy: bool = False) -> None: self.lazy = lazy diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index f6498f2..63829f4 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -144,34 +144,20 @@ def export( ) -> Pipeable: """Convert a pydiverse.transform Table to a data frame. - Parameters - ---------- - target + :param target: Can currently be either a `Polars` or `Pandas` object. For polars, one can specify whether a DataFrame or LazyFrame is returned via the `lazy` kwarg. If `lazy=True`, no actual computations are performed, they just get stored in the LazyFrame. - schema_overrides + :param schema_overrides: A dictionary of columns to backend-specific data types. This controls which data types are used when writing to the backend. Because the data types are not constrained by pydiverse.transform's type system, this may sometimes be preferred over a `cast`. - Returns - ------- - A polars or pandas DataFrame / LazyFrame or a series. - - - Note - ---- - This verb can be used with a Table or column expression on the left. When applied to - a column expression, all tables containing columns in the expression tree are - gathered and the expression is exported with respect to the most recent table. This - means that there has to be one column in the expression tree, whose table is a - common ancestor of all others. If this is not the case, the export fails. Anonymous - columns (`C`-columns) can be used in the expression and are interpreted with respect - to this common ancestor table. + :return: + A polars or pandas DataFrame / LazyFrame or a series. """ if isinstance(data, ColExpr): # Find the common ancestor of all AST nodes of columns appearing in the @@ -554,6 +540,73 @@ def join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Joins two tables on a boolean expression. + + The left table in the join comes through the pipe `>>` operator from the + left. + + :param right: + The right table to join with. + + :param on: + The join condition. See the note below for more information on which expressions + are allowed here. + + :param how: + The join type. + + :param validate: + Only relevant for polars. When set to ``"m:m"``, this does nothing. If set to + ``"1:m"``, it is checked whether each right row matches at most one left row. In + case this does not hold, an error is raised. Symmetrically, if set to ``"m:1"`` + it is checked whether each left row matches at most one right row. If set to + ``"1:1"`` both ``"1:m"`` and ``"m:1"`` are checked. + + :param suffix: + A string that is appended to all column names from the right table. If no suffix + is specified and there are no column name collisions, columns will retain their + original name. If there are name collisions, the name of the right table is + appended to all columns of the right table. If this still does not resolve all + name collisions, additionally an integer is appended to the column names of the + right table. + + + Note + ---- + Not all backends can handle arbitrary boolean expressions in ``on`` with every join + type. + + :polars: + For everything except conjunctions of equalities, it depends on whether polars + ``join_asof`` can handle the join condition. + :postgres: + In full joins, the join condition must be hashable or mergeable. See + https://wiki.postgresql.org/wiki/PostgreSQL_vs_SQL_Standard#FULL_OUTER_JOIN_conditions. + + Examples + -------- + >>> t1 = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}, name="t1") + >>> t2 = pdt.Table({"a": [4, 4, 1, 7], "b": ["f", "g", "h", "i"]}, name="t2") + >>> t1 >> join(t2, t1.a == t2.a, how="left") >> export(Polars()) + shape: (9, 3) + ┌─────┬──────┬──────┐ + │ a ┆ a_t2 ┆ b_t2 │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ str │ + ╞═════╪══════╪══════╡ + │ 3 ┆ null ┆ null │ + │ 1 ┆ 1 ┆ h │ + │ 4 ┆ 4 ┆ f │ + │ 4 ┆ 4 ┆ g │ + │ 1 ┆ 1 ┆ h │ + │ 5 ┆ null ┆ null │ + │ 9 ┆ null ┆ null │ + │ 4 ┆ 4 ┆ f │ + │ 4 ┆ 4 ┆ g │ + └─────┴──────┴──────┘ + """ + errors.check_arg_type(Table, "join", "right", right) errors.check_arg_type(str | None, "join", "suffix", suffix) errors.check_literal_type(["inner", "left", "full"], "join", "how", how) @@ -636,6 +689,10 @@ def inner_join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Alias for the `join` verb with `how="inner"`. + """ + return left >> join(right, on, "inner", validate=validate, suffix=suffix) @@ -658,6 +715,10 @@ def left_join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Alias for the `join` verb with `how="left"`. + """ + return left >> join(right, on, "left", validate=validate, suffix=suffix) @@ -680,6 +741,10 @@ def full_join( validate: Literal["1:1", "1:m", "m:1", "m:m"] = "m:m", suffix: str | None = None, ) -> Pipeable: + """ + Alias for the `join` verb with `how="full"`. + """ + return left >> join(right, on, "full", validate=validate, suffix=suffix) From cced873f5f9291d19c6ec4213adcab050ddf6e5b Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 18:29:59 +0100 Subject: [PATCH 06/16] document more verbs --- docs/source/reference/verbs.rst | 1 + .../transform/_internal/pipe/verbs.py | 58 +++++++++++++++++-- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/verbs.rst b/docs/source/reference/verbs.rst index a719298..9368bf9 100644 --- a/docs/source/reference/verbs.rst +++ b/docs/source/reference/verbs.rst @@ -17,6 +17,7 @@ Verbs filter full_join group_by + inner_join join left_join mutate diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index 63829f4..8e50c37 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -69,6 +69,19 @@ def alias(new_name: str | None = None) -> Pipeable: ... @verb def alias(table: Table, new_name: str | None = None) -> Pipeable: + """ + Changes the name of the current table and resets column references. + + That column references are reset means that the resulting table is not + considered to be derived from the input table, i.e. one cannot use columns + from the input table in subsequent operations on the result table. However, + the reset of column references is necessary before performing a self-join. + + :param new_name: + The new name assigned to the table. If this is ``None``, the table + retains its previous name. + """ + if new_name is None: new_name = table._ast.name new = copy.copy(table) @@ -226,6 +239,13 @@ def build_query() -> Pipeable: ... @verb def build_query(table: Table) -> Pipeable: + """ + Compiles the operations accumulated on the current table to a SQL query. + + :returns: + The SQL query as a string. + """ + table = table >> alias() SourceBackend: type[TableImpl] = get_backend(table._ast) return SourceBackend.build_query(table._ast, table._cache.select) @@ -237,6 +257,10 @@ def show_query() -> Pipeable: ... @verb def show_query(table: Table) -> Pipeable: + """ + Prints the compiled SQL query to stdout. + """ + if query := table >> build_query(): print(query) else: @@ -251,6 +275,13 @@ def select(*cols: Col | ColName) -> Pipeable: ... @verb def select(table: Table, *cols: Col | ColName) -> Pipeable: + """ + Selects a subset of columns. + + :param cols: + The columns to be included in the resulting table. + """ + errors.check_vararg_type(Col | ColName, "select", *cols) new = copy.copy(table) @@ -505,6 +536,16 @@ def slice_head(n: int, *, offset: int = 0) -> Pipeable: ... @verb def slice_head(table: Table, n: int, *, offset: int = 0) -> Pipeable: + """ + Selects a subset of rows based on their index. + + :param n: + The number of rows to select. + + :param offset: + The index of the first row (0-based) that is included in the selection. + """ + errors.check_arg_type(int, "slice_head", "n", n) errors.check_arg_type(int, "slice_head", "offset", offset) @@ -581,8 +622,14 @@ def join( For everything except conjunctions of equalities, it depends on whether polars ``join_asof`` can handle the join condition. :postgres: - In full joins, the join condition must be hashable or mergeable. See - https://wiki.postgresql.org/wiki/PostgreSQL_vs_SQL_Standard#FULL_OUTER_JOIN_conditions. + For full joins, the join condition must be hashable or mergeable. See the + `postgres documentation `_ + for more details. + + Tip + --- + Two tables cannot be joined if one is derived from the other. In particular, before + a self-join, the ``alias`` verb has to be applied to one table. Examples -------- @@ -690,7 +737,7 @@ def inner_join( suffix: str | None = None, ) -> Pipeable: """ - Alias for the `join` verb with `how="inner"`. + Alias for the ``join`` verb with ``how="inner"``. """ return left >> join(right, on, "inner", validate=validate, suffix=suffix) @@ -716,7 +763,7 @@ def left_join( suffix: str | None = None, ) -> Pipeable: """ - Alias for the `join` verb with `how="left"`. + Alias for the ``join`` verb with `how="left"`. """ return left >> join(right, on, "left", validate=validate, suffix=suffix) @@ -754,6 +801,9 @@ def show() -> Pipeable: ... @verb def show(table: Table) -> Pipeable: + """ + Prints the table to stdout. + """ print(table) return table From f15fa9cd2d7f37eeb4d420e114cdf10e2c74a02d Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Wed, 18 Dec 2024 18:57:14 +0100 Subject: [PATCH 07/16] document more verbs --- .../transform/_internal/ops/ops/window.py | 2 +- .../transform/_internal/pipe/functions.py | 2 +- .../transform/_internal/pipe/verbs.py | 46 +++++++++++++++++-- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/src/pydiverse/transform/_internal/ops/ops/window.py b/src/pydiverse/transform/_internal/ops/ops/window.py index 248332d..1965b3f 100644 --- a/src/pydiverse/transform/_internal/ops/ops/window.py +++ b/src/pydiverse/transform/_internal/ops/ops/window.py @@ -48,7 +48,7 @@ def __init__( doc=""" The number of strictly smaller elements in the column plus one. -This is the same as `rank("min")` in polars. +This is the same as ``rank("min")`` in polars. Examples -------- diff --git a/src/pydiverse/transform/_internal/pipe/functions.py b/src/pydiverse/transform/_internal/pipe/functions.py index a86fa49..77f294e 100644 --- a/src/pydiverse/transform/_internal/pipe/functions.py +++ b/src/pydiverse/transform/_internal/pipe/functions.py @@ -174,7 +174,7 @@ def rank( """ The number of strictly smaller elements in the column plus one. - This is the same as `rank("min")` in polars. + This is the same as ``rank("min")`` in polars. Examples -------- diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index 8e50c37..cf9ca47 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -299,6 +299,12 @@ def drop(*cols: Col | ColName) -> Pipeable: ... @verb def drop(table: Table, *cols: Col | ColName) -> Pipeable: + """ + Removes a subset of the columns. + + :param cols: + The columns to be removed. + """ errors.check_vararg_type(Col | ColName, "drop", *cols) dropped_uuids = {preprocess_arg(col, table)._uuid for col in cols} @@ -314,6 +320,12 @@ def rename(name_map: dict[str, str]) -> Pipeable: ... @verb def rename(table: Table, name_map: dict[str, str]) -> Pipeable: + """ + Renames columns. + + :param name_map: + A dictionary assigning some columns (given by their name) new names. + """ errors.check_arg_type(dict, "rename", "name_map", name_map) if len(name_map) == 0: return table @@ -340,6 +352,33 @@ def mutate(**kwargs: ColExpr) -> Pipeable: ... @verb def mutate(table: Table, **kwargs: ColExpr) -> Pipeable: + """ + Adds new columns to the table. + + :param kwargs: + Each key is the name of a new column, and its value is the column + expression defining the new column. + + Examples + -------- + >>> t1 = pdt.Table( + ... dict(a=[3, 1, 4, 1, 5, 9], b=[2.465, 0.22, -4.477, 10.8, -81.2, 0.0]) + ... ) + >>> t1 >> mutate(u=t1.a * t1.b) >> export(Polars()) + shape: (6, 3) + ┌─────┬────────┬─────────┐ + │ a ┆ b ┆ u │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ f64 │ + ╞═════╪════════╪═════════╡ + │ 3 ┆ 2.465 ┆ 7.395 │ + │ 1 ┆ 0.22 ┆ 0.22 │ + │ 4 ┆ -4.477 ┆ -17.908 │ + │ 1 ┆ 10.8 ┆ 10.8 │ + │ 5 ┆ -81.2 ┆ -406.0 │ + │ 9 ┆ 0.0 ┆ 0.0 │ + └─────┴────────┴─────────┘ + """ if len(kwargs) == 0: return table return table >> _mutate(*map(list, zip(*kwargs.items(), strict=True))) @@ -377,6 +416,7 @@ def filter(*predicates: ColExpr[Bool]) -> Pipeable: ... @verb def filter(table: Table, *predicates: ColExpr[Bool]) -> Pipeable: + """ """ if len(predicates) == 0: return table @@ -737,7 +777,7 @@ def inner_join( suffix: str | None = None, ) -> Pipeable: """ - Alias for the ``join`` verb with ``how="inner"``. + Alias for the :doc:`pydiverse.transform.join` verb with ``how="inner"``. """ return left >> join(right, on, "inner", validate=validate, suffix=suffix) @@ -763,7 +803,7 @@ def left_join( suffix: str | None = None, ) -> Pipeable: """ - Alias for the ``join`` verb with `how="left"`. + Alias for the :doc:`pydiverse.transform.join` verb with ``how="left"``. """ return left >> join(right, on, "left", validate=validate, suffix=suffix) @@ -789,7 +829,7 @@ def full_join( suffix: str | None = None, ) -> Pipeable: """ - Alias for the `join` verb with `how="full"`. + Alias for the :doc:`pydiverse.transform.join` verb with ``how="full"``. """ return left >> join(right, on, "full", validate=validate, suffix=suffix) From 133fa8bb92839638dec5c9adf13bf3a5b406e323 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 09:34:47 +0100 Subject: [PATCH 08/16] simplify column export --- .../transform/_internal/backend/polars.py | 11 +- .../transform/_internal/backend/table_impl.py | 14 +- .../transform/_internal/pipe/verbs.py | 124 ++---------------- .../transform/_internal/tree/col_expr.py | 18 +++ tests/test_core.py | 2 +- tests/test_polars_table.py | 9 +- 6 files changed, 44 insertions(+), 134 deletions(-) diff --git a/src/pydiverse/transform/_internal/backend/polars.py b/src/pydiverse/transform/_internal/backend/polars.py index 73755dc..b6a5573 100644 --- a/src/pydiverse/transform/_internal/backend/polars.py +++ b/src/pydiverse/transform/_internal/backend/polars.py @@ -6,7 +6,7 @@ import polars as pl from pydiverse.transform._internal.backend.table_impl import TableImpl -from pydiverse.transform._internal.backend.targets import Pandas, Polars, Target +from pydiverse.transform._internal.backend.targets import Polars, Target from pydiverse.transform._internal.ops import ops from pydiverse.transform._internal.ops.op import Ftype from pydiverse.transform._internal.tree import types, verbs @@ -73,15 +73,6 @@ def export( raise AssertionError - @staticmethod - def export_col(expr: ColExpr, target: Target) -> pl.Series: - if isinstance(target, Polars): - ... - elif isinstance(target, Pandas): - ... - - raise AssertionError - def _clone(self) -> tuple[PolarsImpl, dict[AstNode, AstNode], dict[UUID, UUID]]: cloned = PolarsImpl(self.name, self.df.clone()) return ( diff --git a/src/pydiverse/transform/_internal/backend/table_impl.py b/src/pydiverse/transform/_internal/backend/table_impl.py index 8614ff0..d736aad 100644 --- a/src/pydiverse/transform/_internal/backend/table_impl.py +++ b/src/pydiverse/transform/_internal/backend/table_impl.py @@ -14,7 +14,7 @@ from pydiverse.transform._internal.ops import ops from pydiverse.transform._internal.ops.op import Ftype from pydiverse.transform._internal.tree.ast import AstNode -from pydiverse.transform._internal.tree.col_expr import Col, ColExpr +from pydiverse.transform._internal.tree.col_expr import Col from pydiverse.transform._internal.tree.types import Dtype try: @@ -120,9 +120,6 @@ def export( schema_overrides: dict[Col, Any], ) -> Any: ... - @classmethod - def export_col(cls, expr: ColExpr, target: Target): ... - @classmethod def get_impl(cls, op: Operator, sig: Sequence[Dtype]) -> Any: if (impl := cls.impl_store.get_impl(op, sig)) is not None: @@ -140,6 +137,15 @@ def get_impl(cls, op: Operator, sig: Sequence[Dtype]) -> Any: ) from err +def get_backend(nd: AstNode) -> type[TableImpl]: + from pydiverse.transform._internal.pipe.verbs import Verb + + if isinstance(nd, Verb): + return get_backend(nd.child) + assert isinstance(nd, TableImpl) and nd is not TableImpl + return nd.__class__ + + with TableImpl.impl_store.impl_manager as impl: @impl(ops.add) diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index cf9ca47..f349352 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -4,18 +4,13 @@ import uuid from typing import Any, Literal, overload -import pandas as pd -import polars as pl - from pydiverse.transform._internal import errors -from pydiverse.transform._internal.backend.table_impl import TableImpl -from pydiverse.transform._internal.backend.targets import Pandas, Polars, Target +from pydiverse.transform._internal.backend.table_impl import TableImpl, get_backend +from pydiverse.transform._internal.backend.targets import Polars, Target from pydiverse.transform._internal.errors import FunctionTypeError from pydiverse.transform._internal.ops.op import Ftype -from pydiverse.transform._internal.pipe.c import C from pydiverse.transform._internal.pipe.pipeable import Pipeable, verb from pydiverse.transform._internal.pipe.table import Table -from pydiverse.transform._internal.tree.ast import AstNode from pydiverse.transform._internal.tree.col_expr import ( Col, ColExpr, @@ -37,7 +32,6 @@ SliceHead, Summarize, Ungroup, - Verb, ) __all__ = [ @@ -150,10 +144,7 @@ def export(target: Target, *, schema_overrides: dict | None = None) -> Pipeable: @verb def export( - data: Table | ColExpr, - target: Target, - *, - schema_overrides: dict[Col, Any] | None = None, + data: Table, target: Target, *, schema_overrides: dict[Col, Any] | None = None ) -> Pipeable: """Convert a pydiverse.transform Table to a data frame. @@ -170,59 +161,8 @@ def export( preferred over a `cast`. :return: - A polars or pandas DataFrame / LazyFrame or a series. + A polars or pandas DataFrame / LazyFrame. """ - if isinstance(data, ColExpr): - # Find the common ancestor of all AST nodes of columns appearing in the - # expression. - nodes: dict[AstNode, int] = {} - for col in data.iter_subtree(): - if isinstance(col, Col): - nodes[col._ast] = 0 - - for nd in list(nodes.keys()): - gen = nd.iter_subtree_preorder() - try: - descendant = next(gen) - while True: - if descendant != nd and descendant in nodes: - nodes[descendant] += 1 - descendant = gen.send(True) - else: - descendant = next(gen) - except StopIteration: - ... - - indeg0 = (nd for nd, cnt in nodes.items() if cnt == 0) - root = next(indeg0) - try: - next(indeg0) - # The AST nodes have a common ancestor if and only if there is exactly one - # node that has not been reached by another one. - raise ValueError( - "cannot export a column expression containing columns without a common " - "ancestor" - ) - except StopIteration: - ... - - uid = uuid.uuid1() - table = from_ast(root) - df = ( - table - >> _mutate([str(uid)], [data], [uid]) - >> select(C[str(uid)]) - >> export(target) - ) - - if isinstance(target, Polars): - if isinstance(df, pl.LazyFrame): - df = df.collect() - return df.get_column(str(uid)).rename("") - elif isinstance(target, Pandas): - assert isinstance(df, pd.DataFrame) - return pd.Series(df[str(uid)]) - raise AssertionError # TODO: allow stuff like pdt.Int(): pl.Uint32() in schema_overrides and resolve that # to columns @@ -379,23 +319,13 @@ def mutate(table: Table, **kwargs: ColExpr) -> Pipeable: │ 9 ┆ 0.0 ┆ 0.0 │ └─────┴────────┴─────────┘ """ + if len(kwargs) == 0: return table - return table >> _mutate(*map(list, zip(*kwargs.items(), strict=True))) - -@verb -def _mutate( - table: Table, - names: list[str], - values: list[ColExpr], - uuids: list[uuid.UUID] | None = None, -) -> Pipeable: + names, values = map(list, zip(*kwargs.items(), strict=True)) + uuids = [uuid.uuid1() for _ in names] new = copy.copy(table) - if uuids is None: - uuids = [uuid.uuid1() for _ in names] - else: - assert len(uuids) == len(names) new._ast = Mutate( table._ast, @@ -513,23 +443,10 @@ def summarize(**kwargs: ColExpr) -> Pipeable: ... @verb def summarize(table: Table, **kwargs: ColExpr) -> Pipeable: - return table >> _summarize(*map(list, zip(*kwargs.items(), strict=True))) - - -@verb -def _summarize( - table: Table, - names: list[str], - values: list[ColExpr], - uuids: list[uuid.UUID] | None = None, -) -> Pipeable: + names, values = map(list, zip(*kwargs.items(), strict=True)) + uuids = [uuid.uuid1() for _ in names] new = copy.copy(table) - if uuids is None: - uuids = [uuid.uuid1() for _ in names] - else: - assert len(uuids) == len(names) - new._ast = Summarize( table._ast, names, @@ -895,26 +812,3 @@ def preprocess_arg(arg: Any, table: Table, *, update_partition_by: bool = True) expr.context_kwargs["partition_by"] = table._cache.partition_by return resolve_C_columns(arg, table) - - -def get_backend(nd: AstNode) -> type[TableImpl]: - if isinstance(nd, Verb): - return get_backend(nd.child) - assert isinstance(nd, TableImpl) and nd is not TableImpl - return nd.__class__ - - -def from_ast(root: AstNode) -> Table: - if isinstance(root, TableImpl): - return Table(root) - - assert isinstance(root, Verb) - child = from_ast(root.child) - if isinstance(root, Alias): - return child >> alias(root.name) - - child._cache.update( - root, from_ast(root.right)._cache if isinstance(root, Join) else None - ) - child._ast = root - return child diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 368ca97..dd61705 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -12,6 +12,10 @@ from typing import Any, Generic, TypeVar, overload from uuid import UUID +import pandas as pd +import polars as pl + +from pydiverse.transform._internal.backend.targets import Pandas, Polars, Target from pydiverse.transform._internal.errors import FunctionTypeError from pydiverse.transform._internal.ops import ops, signature from pydiverse.transform._internal.ops.op import Ftype, Operator @@ -1121,6 +1125,20 @@ def __str__(self) -> str: def __hash__(self) -> int: return hash(self._uuid) + def export(self, target: Target) -> Any: + from pydiverse.transform._internal.backend.table_impl import get_backend + from pydiverse.transform._internal.tree.verbs import Select + + ast = Select(self._ast, [self]) + df = get_backend(self._ast).export(ast, target, [self], {}) + if isinstance(target, Polars): + if isinstance(df, pl.LazyFrame): + df = df.collect() + return df.get_column(self.name) + else: + assert isinstance(target, Pandas) + return pd.Series(df[self.name]) + class ColName(ColExpr): def __init__( diff --git a/tests/test_core.py b/tests/test_core.py index 42abba3..77d2653 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -63,7 +63,7 @@ def test_contains(self, tbl1, tbl2): def test_dict_constructor(self): t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) assert_series_equal( - pdt.rank(arrange=t.a) >> export(Polars()), + (t >> mutate(r=pdt.rank(arrange=t.a))).r.export(Polars()), pl.Series([3, 1, 4, 1, 6, 7, 4]), check_names=False, ) diff --git a/tests/test_polars_table.py b/tests/test_polars_table.py index 6f76132..fb0a709 100644 --- a/tests/test_polars_table.py +++ b/tests/test_polars_table.py @@ -580,20 +580,21 @@ def test_duckdb_execution(self, tbl2, tbl3): check_row_order=False, ) - def test_col_export(self, tbl1, tbl2): - assert_equal(df1.get_column("col1"), tbl1.col1 >> export(Polars())) + def test_col_export(self, tbl1: pdt.Table, tbl2: pdt.Table): + assert_equal(df1.get_column("col1"), tbl1.col1.export(Polars())) t = tbl2 >> mutate(u=tbl2.col1 * tbl2.col2, v=-tbl2.col3) t_ex: pl.DataFrame = t >> export(Polars(lazy=False)) assert_equal( (t_ex["u"] + t_ex["col2"]).exp() - t_ex["v"], - ((t.u + C.col2).exp() - t.v) >> export(Polars()), + (t >> mutate(h=(t.u + C.col2).exp() - t.v)).h.export(Polars()), ) e = t >> inner_join( tbl1, tbl1.col1.cast(pdt.Float64()) <= tbl2.col1 + tbl2.col3 ) e_ex = e >> export(Polars(lazy=False)) assert_equal( - e_ex["col2"] + e_ex["col1_df1"], (e.col2 + tbl1.col1) >> export(Polars()) + e_ex["col2"] + e_ex["col1_df1"], + (e >> mutate(j=e.col2 + tbl1.col1)).j.export(Polars()), ) From b3bb43b265d4058873ed31d8eb581346faa0ad4a Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 10:28:11 +0100 Subject: [PATCH 09/16] delete generated files with 'make clean' --- docs/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/Makefile b/docs/Makefile index 880e1f5..b739a80 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,6 +14,10 @@ help: .PHONY: help Makefile +clean: + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm **/_generated/* + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile From 9e1b5d7677e2bc31bbc0cc5c91e48822116fe709 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 11:10:06 +0100 Subject: [PATCH 10/16] add Col and col export documentation --- docs/Makefile | 4 -- docs/source/conf.py | 3 ++ docs/source/reference/api.rst | 20 ++++++++- docs/source/reference/operators/index.rst | 4 -- src/pydiverse/transform/__init__.py | 5 +-- .../transform/_internal/backend/polars.py | 5 ++- .../transform/_internal/backend/table_impl.py | 2 +- .../transform/_internal/pipe/verbs.py | 41 +++++++++++++++++-- .../transform/_internal/tree/col_expr.py | 34 +++++++++++++++ 9 files changed, 99 insertions(+), 19 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index b739a80..880e1f5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,10 +14,6 @@ help: .PHONY: help Makefile -clean: - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - rm **/_generated/* - # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile diff --git a/docs/source/conf.py b/docs/source/conf.py index c49f083..308422f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,6 +55,9 @@ "member-order": "bysource", } +autodoc_class_signature = "separated" +autodoc_default_options = {"exclude-members": "__new__"} + autosectionlabel_prefix_document = True toc_object_entries_show_parents = "all" diff --git a/docs/source/reference/api.rst b/docs/source/reference/api.rst index 74e37de..2df837b 100644 --- a/docs/source/reference/api.rst +++ b/docs/source/reference/api.rst @@ -9,9 +9,25 @@ API operators/index targets + +.. currentmodule:: pydiverse.transform + Table ----- -.. currentmodule:: pydiverse.transform .. autoclass:: Table - :noindex: + +ColExpr +------- + +.. autoclass:: ColExpr + :members: dtype + :exclude-members: __new__, __init__ + +Col +--- + +.. autoclass:: Col + :no-index: + :members: export + :exclude-members: __new__, __init__ diff --git a/docs/source/reference/operators/index.rst b/docs/source/reference/operators/index.rst index 7b39257..15129fe 100644 --- a/docs/source/reference/operators/index.rst +++ b/docs/source/reference/operators/index.rst @@ -20,7 +20,3 @@ Column Operations type_conversion .. currentmodule:: pydiverse.transform - -.. autoclass:: ColExpr - :no-index: - :members: dtype diff --git a/src/pydiverse/transform/__init__.py b/src/pydiverse/transform/__init__.py index 0a76829..1c565cd 100644 --- a/src/pydiverse/transform/__init__.py +++ b/src/pydiverse/transform/__init__.py @@ -2,11 +2,10 @@ from ._internal.pipe.pipeable import verb from ._internal.pipe.table import Table -from ._internal.tree.col_expr import ColExpr +from ._internal.tree.col_expr import Col, ColExpr from .extended import * from .extended import __all__ as __extended from .types import * from .types import __all__ as __types -__all__ = ["Table", "ColExpr", "verb"] -# __all__ += __extended + __types +__all__ = ["Table", "ColExpr", "Col", "verb"] + __extended + __types diff --git a/src/pydiverse/transform/_internal/backend/polars.py b/src/pydiverse/transform/_internal/backend/polars.py index b6a5573..fc24095 100644 --- a/src/pydiverse/transform/_internal/backend/polars.py +++ b/src/pydiverse/transform/_internal/backend/polars.py @@ -6,7 +6,7 @@ import polars as pl from pydiverse.transform._internal.backend.table_impl import TableImpl -from pydiverse.transform._internal.backend.targets import Polars, Target +from pydiverse.transform._internal.backend.targets import Pandas, Polars, Target from pydiverse.transform._internal.ops import ops from pydiverse.transform._internal.ops.op import Ftype from pydiverse.transform._internal.tree import types, verbs @@ -71,6 +71,9 @@ def export( lf.name = nd.name return lf + elif isinstance(target, Pandas): + return lf.collect().to_pandas(use_pyarrow_extension_array=True) + raise AssertionError def _clone(self) -> tuple[PolarsImpl, dict[AstNode, AstNode], dict[UUID, UUID]]: diff --git a/src/pydiverse/transform/_internal/backend/table_impl.py b/src/pydiverse/transform/_internal/backend/table_impl.py index d736aad..be4e944 100644 --- a/src/pydiverse/transform/_internal/backend/table_impl.py +++ b/src/pydiverse/transform/_internal/backend/table_impl.py @@ -138,7 +138,7 @@ def get_impl(cls, op: Operator, sig: Sequence[Dtype]) -> Any: def get_backend(nd: AstNode) -> type[TableImpl]: - from pydiverse.transform._internal.pipe.verbs import Verb + from pydiverse.transform._internal.tree.verbs import Verb if isinstance(nd, Verb): return get_backend(nd.child) diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index f349352..d3243ee 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -149,19 +149,52 @@ def export( """Convert a pydiverse.transform Table to a data frame. :param target: - Can currently be either a `Polars` or `Pandas` object. For polars, one can - specify whether a DataFrame or LazyFrame is returned via the `lazy` kwarg. - If `lazy=True`, no actual computations are performed, they just get stored in + Can currently be either a ``Polars`` or ``Pandas`` object. For polars, one can + specify whether a DataFrame or LazyFrame is returned via the ``lazy`` kwarg. + If ``lazy=True``, no actual computations are performed, they just get stored in the LazyFrame. :param schema_overrides: A dictionary of columns to backend-specific data types. This controls which data types are used when writing to the backend. Because the data types are not constrained by pydiverse.transform's type system, this may sometimes be - preferred over a `cast`. + preferred over a cast. :return: A polars or pandas DataFrame / LazyFrame. + + Examples + -------- + + >>> t1 = pdt.Table( + ... { + ... "a": [3, 1, 4, 1, 5, 9], + ... "b": [2.465, 0.22, -4.477, 10.8, -81.2, 0.0], + ... "c": ["a,", "bcbc", "pydiverse", "transform", "' '", "-22"], + ... } + ... ) + >>> t1 >> export(Pandas()) + a b c + 0 3 2.465 a, + 1 1 0.22 bcbc + 2 4 -4.477 pydiverse + 3 1 10.8 transform + 4 5 -81.2 ' ' + 5 9 0.0 -22 + >>> t1 >> export(Polars()) + shape: (6, 3) + ┌─────┬────────┬───────────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪════════╪═══════════╡ + │ 3 ┆ 2.465 ┆ a, │ + │ 1 ┆ 0.22 ┆ bcbc │ + │ 4 ┆ -4.477 ┆ pydiverse │ + │ 1 ┆ 10.8 ┆ transform │ + │ 5 ┆ -81.2 ┆ ' ' │ + │ 9 ┆ 0.0 ┆ -22 │ + └─────┴────────┴───────────┘ """ # TODO: allow stuff like pdt.Int(): pl.Uint32() in schema_overrides and resolve that diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index dd61705..0febb52 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -1126,6 +1126,40 @@ def __hash__(self) -> int: return hash(self._uuid) def export(self, target: Target) -> Any: + """ + Exports a column to a polars or pandas Series. + + :param target: + The data frame library to export to. Can be a ``Polars`` or ``Pandas`` + object. The ``lazy`` kwarg for polars is ignored. + + :return: + A polars or pandas Series. + + Examples + -------- + >>> t1 = pdt.Table({"h": [2.465, 0.22, -4.477, 10.8, -81.2, 0.0]}) + >>> t1.h.export(Polars()) + shape: (6,) + Series: 'h' [f64] + [ + 2.465 + 0.22 + -4.477 + 10.8 + -81.2 + 0.0 + ] + >>> t1.h.export(Pandas()) + 0 2.465 + 1 0.22 + 2 -4.477 + 3 10.8 + 4 -81.2 + 5 0.0 + Name: h, dtype: double[pyarrow] + """ + from pydiverse.transform._internal.backend.table_impl import get_backend from pydiverse.transform._internal.tree.verbs import Select From 5932fde5017d478818c5b306fbb153b6856aa1ef Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 12:00:25 +0100 Subject: [PATCH 11/16] add examples for alias, collect, filter --- .../transform/_internal/pipe/verbs.py | 96 ++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index d3243ee..64b1e28 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -74,6 +74,40 @@ def alias(table: Table, new_name: str | None = None) -> Pipeable: :param new_name: The new name assigned to the table. If this is ``None``, the table retains its previous name. + + Examples + -------- + + A self join without applying ``alias`` before raises an exception: + + >>> t = pdt.Table({"a": [4, 2, 1, 4], "b": ["l", "g", "uu", "-- r"]}) + >>> t >> join(t, t.a == t.a, how="inner", suffix="_right") + ValueError: table `` occurs twice in the table tree. + hint: To join two tables derived from a common table, apply `>> alias()` to one of + them before the join. + + By applying ``alias`` to the right table and storing the result in a new variable, + the self join succeeds. + + >>> ( + ... t + ... >> join(s := t >> alias(), t.a == s.a, how="inner", suffix="_right") + ... >> export(Polars()) + ... ) + shape: (6, 4) + ┌─────┬────────┬─────────┬─────────┐ + │ a ┆ b ┆ a_right ┆ b_right │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪════════╪═════════╪═════════╡ + │ 4 ┆ l ┆ 4 ┆ l │ + │ 4 ┆ -- r ┆ 4 ┆ l │ + │ 2 ┆ g ┆ 2 ┆ g │ + │ 1 ┆ uu ┆ 1 ┆ uu │ + │ 4 ┆ l ┆ 4 ┆ -- r │ + │ 4 ┆ -- r ┆ 4 ┆ -- r │ + └─────┴────────┴─────────┴─────────┘ """ if new_name is None: @@ -115,6 +149,46 @@ def collect(target: Target | None = None) -> Pipeable: ... @verb def collect(table: Table, target: Target | None = None) -> Pipeable: + """ + Execute all accumulated operations and write the result to a new Table. + + This verb is only for polars-backed tables. All operations lazily stored in the + table are executed and a new table containing the result is returned. The returned + table always stored the data in a polars LazyFrame. One can choose whether the + following operations on the table are executed via polars or DuckDB on the + LazyFrame (see also :doc:`/examples/duckdb_polars_parquet`). + + :param target: + The execution engine to be used from here on. Can be either ``Polars`` or + ``DuckDb``. + + Examples + -------- + Here, ``collect`` does not change anything in the result, but the ``mutate`` is + executed on the DataFrame when ``collect`` is called, whereas the ``arrange`` is + only executed when ``export`` is called. Without ``collect``, the ``mutate`` would + only have been executed with the ``export``, too. + + >>> t = pdt.Table({"a": [4, 2, 1, 4], "b": ["l", "g", "uu", "-- r"]}) + >>> ( + ... t + ... >> mutate(z=t.a + t.b.str.len()) + ... >> collect() + ... >> arrange(C.z, t.a) + ... >> export(Polars()) + ... ) + shape: (4, 3) + ┌─────┬────────┬─────┐ + │ a ┆ b ┆ z │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 │ + ╞═════╪════════╪═════╡ + │ 1 ┆ uu ┆ 3 │ + │ 2 ┆ g ┆ 3 │ + │ 4 ┆ l ┆ 5 │ + │ 4 ┆ -- r ┆ 10 │ + └─────┴────────┴─────┘ + """ errors.check_arg_type(Target | None, "collect", "target", target) df = table >> select(*table._cache.all_cols.values()) >> export(Polars(lazy=False)) @@ -379,7 +453,27 @@ def filter(*predicates: ColExpr[Bool]) -> Pipeable: ... @verb def filter(table: Table, *predicates: ColExpr[Bool]) -> Pipeable: - """ """ + """ + Selects a subset of rows based on some condition. + + :param predicates: + Column expressions of boolean type to filter by. Only rows where all expressions + are true are included in the result. + + Examples + -------- + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> filter(t.a <= 4, ~t.b.str.contains("_")) >> export(Polars()) + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 3 ┆ lll │ + │ 2 ┆ g │ + └─────┴─────┘ + """ if len(predicates) == 0: return table From 72cbccd4830f333b0ff2aacbf077ffd6dbd48ab1 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 12:32:10 +0100 Subject: [PATCH 12/16] add more examples --- .../transform/_internal/pipe/verbs.py | 136 +++++++++++++++++- 1 file changed, 134 insertions(+), 2 deletions(-) diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index 64b1e28..5dee5d3 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -327,6 +327,22 @@ def select(table: Table, *cols: Col | ColName) -> Pipeable: :param cols: The columns to be included in the resulting table. + + Examples + -------- + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> select(t.a) >> export(Polars()) + shape: (4, 1) + ┌─────┐ + │ a │ + │ --- │ + │ i64 │ + ╞═════╡ + │ 3 │ + │ 2 │ + │ 6 │ + │ 4 │ + └─────┘ """ errors.check_vararg_type(Col | ColName, "select", *cols) @@ -351,12 +367,27 @@ def drop(table: Table, *cols: Col | ColName) -> Pipeable: :param cols: The columns to be removed. + + Examples + -------- + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> drop(t.a) >> export(Polars()) + shape: (4, 1) + ┌───────┐ + │ b │ + │ --- │ + │ str │ + ╞═══════╡ + │ lll │ + │ g │ + │ u0 │ + │ __**_ │ + └───────┘ """ errors.check_vararg_type(Col | ColName, "drop", *cols) dropped_uuids = {preprocess_arg(col, table)._uuid for col in cols} - return select( - table, + return table >> select( *(col for col in table._cache.select if col._uuid not in dropped_uuids), ) @@ -372,6 +403,60 @@ def rename(table: Table, name_map: dict[str, str]) -> Pipeable: :param name_map: A dictionary assigning some columns (given by their name) new names. + + Examples + -------- + Renaming one column: + + >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) + >>> t >> rename({"a": "h"}) >> export(Polars()) + shape: (4, 2) + ┌─────┬───────┐ + │ h ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═══════╡ + │ 3 ┆ lll │ + │ 2 ┆ g │ + │ 6 ┆ u0 │ + │ 4 ┆ __**_ │ + └─────┴───────┘ + + Here is a more subtle example: As long as there are no two equal column names in the + result table, one can give names to columns that already exist in the table. In the + following example, the names of columns *a* and *b* are swapped. + + >>> s = t >> rename({"a": "b", "b": "a"}) >> show() + Table , backend: PolarsImpl + shape: (4, 2) + ┌─────┬───────┐ + │ b ┆ a │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═══════╡ + │ 3 ┆ lll │ + │ 2 ┆ g │ + │ 6 ┆ u0 │ + │ 4 ┆ __**_ │ + └─────┴───────┘ + + When using the column ``t.a`` in an expression derived from *s* now, it + still refers to the same column, which now has the name *b*. The anonymous + column ``C.a``, however, refers to the column with name *a* in the *current* + table. + + >>> s >> mutate(u=t.a, v=C.a) >> export(Polars()) + shape: (4, 4) + ┌─────┬───────┬─────┬───────┐ + │ b ┆ a ┆ u ┆ v │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞═════╪═══════╪═════╪═══════╡ + │ 3 ┆ lll ┆ 3 ┆ lll │ + │ 2 ┆ g ┆ 2 ┆ g │ + │ 6 ┆ u0 ┆ 6 ┆ u0 │ + │ 4 ┆ __**_ ┆ 4 ┆ __**_ │ + └─────┴───────┴─────┴───────┘ """ errors.check_arg_type(dict, "rename", "name_map", name_map) if len(name_map) == 0: @@ -511,6 +596,53 @@ def arrange(*order_by: ColExpr) -> Pipeable: ... @verb def arrange(table: Table, *order_by: ColExpr) -> Pipeable: + """ + Sorts the rows of the table. + + :param order_by: + Column expressions to sort by. The order of the expressions determines + the priority. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "r": [2, 7, 3, 2, 6, None, 4], + ... "s": ["l", "o", "a", "c", "s", "---", "3"], + ... "p": [0.655, -4.33, None, 143.6, 0.0, 1.0, 4.5], + ... } + ... ) + >>> t >> arrange(t.r.nulls_first(), t.p) >> export(Polars()) + shape: (7, 3) + ┌──────┬─────┬───────┐ + │ r ┆ s ┆ p │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞══════╪═════╪═══════╡ + │ null ┆ --- ┆ 1.0 │ + │ 2 ┆ l ┆ 0.655 │ + │ 2 ┆ c ┆ 143.6 │ + │ 3 ┆ a ┆ null │ + │ 4 ┆ 3 ┆ 4.5 │ + │ 6 ┆ s ┆ 0.0 │ + │ 7 ┆ o ┆ -4.33 │ + └──────┴─────┴───────┘ + >>> t >> arrange(t.p.nulls_last().descending(), t.s) >> export(Polars()) + shape: (7, 3) + ┌──────┬─────┬───────┐ + │ r ┆ s ┆ p │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ f64 │ + ╞══════╪═════╪═══════╡ + │ 2 ┆ c ┆ 143.6 │ + │ 4 ┆ 3 ┆ 4.5 │ + │ null ┆ --- ┆ 1.0 │ + │ 2 ┆ l ┆ 0.655 │ + │ 6 ┆ s ┆ 0.0 │ + │ 7 ┆ o ┆ -4.33 │ + │ 3 ┆ a ┆ null │ + └──────┴─────┴───────┘ + """ if len(order_by) == 0: return table From 901eee97adb89d721f73614ea90a117b9e528b2f Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 14:52:25 +0100 Subject: [PATCH 13/16] generate summary, add alias for rank / dense_rank --- docs/source/examples/aggregations.md | 4 +- docs/source/reference/operators/index.rst | 102 ++++++++++++++++++ generate_col_ops.py | 53 +++++++++ .../transform/_internal/tree/col_expr.py | 18 ++++ 4 files changed, 175 insertions(+), 2 deletions(-) diff --git a/docs/source/examples/aggregations.md b/docs/source/examples/aggregations.md index c7d5104..5d4185e 100644 --- a/docs/source/examples/aggregations.md +++ b/docs/source/examples/aggregations.md @@ -8,8 +8,8 @@ from pydiverse.transform.extended import * tbl1 = pdt.Table(dict(a=[1, 1, 2], b=[4, 5, 6])) -tbl1 >> summarize(sum_a=sum(a), sum_b=sum(b)) >> show() -tbl1 >> group_by(tbl1.a) >> summarize(sum_b=sum(b)) >> show() +tbl1 >> summarize(sum_a=a.sum(), sum_b=b.sum()) >> show() +tbl1 >> group_by(tbl1.a) >> summarize(sum_b=b.sum()) >> show() ``` Typical aggregation functions are `sum()`, `mean()`, `count()`, `min()`, `max()`, `any()`, and `all()`. diff --git a/docs/source/reference/operators/index.rst b/docs/source/reference/operators/index.rst index 15129fe..dba9af8 100644 --- a/docs/source/reference/operators/index.rst +++ b/docs/source/reference/operators/index.rst @@ -19,4 +19,106 @@ Column Operations conditional_logic type_conversion + +Expression methods +------------------ + +.. currentmodule:: pydiverse.transform.ColExpr + +.. autosummary:: + :nosignatures: + + __add__ + __and__ + __eq__ + __floordiv__ + __ge__ + __gt__ + __invert__ + __le__ + __lt__ + __mod__ + __mul__ + __ne__ + __neg__ + __or__ + __pos__ + __pow__ + __sub__ + __truediv__ + __xor__ + abs + all + any + ascending + cast + ceil + count + dense_rank + descending + dt.day + dt.day_of_week + dt.day_of_year + dt.hour + dt.microsecond + dt.millisecond + dt.minute + dt.month + dt.second + dt.year + dur.days + dur.hours + dur.microseconds + dur.milliseconds + dur.minutes + dur.seconds + exp + fill_null + floor + is_in + is_inf + is_nan + is_not_inf + is_not_nan + is_not_null + is_null + log + map + max + mean + min + nulls_first + nulls_last + rank + round + shift + str.contains + str.ends_with + str.len + str.lower + str.replace_all + str.slice + str.starts_with + str.strip + str.to_date + str.to_datetime + str.upper + sum + +Global functions +---------------- + .. currentmodule:: pydiverse.transform + +.. autosummary:: + :nosignatures: + + coalesce + count + dense_rank + lit + max + min + rank + row_number + when diff --git a/generate_col_ops.py b/generate_col_ops.py index d5ddb13..b516e20 100644 --- a/generate_col_ops.py +++ b/generate_col_ops.py @@ -15,6 +15,7 @@ COL_EXPR_PATH = "./src/pydiverse/transform/_internal/tree/col_expr.py" FNS_PATH = "./src/pydiverse/transform/_internal/pipe/functions.py" +API_DOCS_PATH = "./docs/source/reference/operators/index.rst" NAMESPACES = ["str", "dt", "dur"] @@ -247,3 +248,55 @@ def indent(s: str, by: int) -> str: file.truncate() os.system(f"ruff format {FNS_PATH}") + +with open(API_DOCS_PATH, "r+") as file: + new_file_contents = "" + + for line in file: + new_file_contents += line + if line.startswith("Expression methods"): + new_file_contents += ( + "------------------\n\n" + ".. currentmodule:: pydiverse.transform.ColExpr\n\n" + ".. autosummary::\n" + " :nosignatures:\n\n " + ) + + new_file_contents += "\n ".join( + sorted( + [ + op.name + for op in ops.__dict__.values() + if isinstance(op, Operator) and op.generate_expr_method + ] + + ["rank", "dense_rank", "map", "cast"] + ) + ) + + new_file_contents += ( + "\n\nGlobal functions\n" + "----------------\n\n" + ".. currentmodule:: pydiverse.transform\n\n" + ".. autosummary::\n" + " :nosignatures:\n\n " + ) + + new_file_contents += ( + "\n ".join( + sorted( + [ + op.name + for op in ops.__dict__.values() + if isinstance(op, Operator) and not op.generate_expr_method + ] + + ["when", "lit"] + ) + ) + + "\n" + ) + + break + + file.seek(0) + file.write(new_file_contents) + file.truncate() diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 0febb52..be1c794 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -140,6 +140,24 @@ def __rshift__(self, rhs): ) return rhs(self) + def rank( + self: ColExpr, + partition_by: Col | ColName | Iterable[Col | ColName] | None = None, + ) -> ColExpr[Int]: + """ + Alias for :doc:`/reference/operators/_generated/pydiverse.transform.rank`. + """ + return ColFn(ops.rank, partition_by=partition_by, arrange=self) + + def dense_rank( + self: ColExpr, + partition_by: Col | ColName | Iterable[Col | ColName] | None = None, + ) -> ColExpr[Int]: + """ + Alias for :doc:`/reference/operators/_generated/pydiverse.transform.dense_rank`. + """ + return ColFn(ops.dense_rank, partition_by=partition_by, arrange=self) + # --- generated code starts here, do not delete this comment --- @overload From 6438e841085da4a8b6181c6e4b63eaf1959d7238 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 16:31:11 +0100 Subject: [PATCH 14/16] document % and // behavior --- docs/source/examples.md | 1 - .../reference/operators/aggregation.rst | 10 ++ .../source/reference/operators/arithmetic.rst | 1 + .../transform/_internal/ops/ops/arithmetic.py | 81 +++++++++- .../transform/_internal/ops/ops/numeric.py | 3 - .../transform/_internal/tree/col_expr.py | 146 +++++++++++++++++- 6 files changed, 233 insertions(+), 9 deletions(-) diff --git a/docs/source/examples.md b/docs/source/examples.md index 50790ac..37be5b4 100644 --- a/docs/source/examples.md +++ b/docs/source/examples.md @@ -11,7 +11,6 @@ Some examples how to use pydiverse.transform: * [Best practices / beware the flatfile & embrace working with entities](/examples/best_practices_entities) ```{toctree} -/quickstart /examples/joining /examples/aggregations /examples/window_functions diff --git a/docs/source/reference/operators/aggregation.rst b/docs/source/reference/operators/aggregation.rst index f79e182..dfd9e23 100644 --- a/docs/source/reference/operators/aggregation.rst +++ b/docs/source/reference/operators/aggregation.rst @@ -2,6 +2,16 @@ Aggregation =========== +Aggregation functions take a ``partition_by`` and ``filter`` keyword argument. The +``partition_by`` argument can only be given when used within ``mutate``. If a +``partition_by`` argument is given and there is a surrounding ``group_by`` / +``ungroup``, the ``group_by`` is ignored and the value of ``partition_by`` is used. + +.. warning:: + The ``filter`` argument works similar to ``Expr.filter`` in polars. But in contrast + to polars, if all values in a group are ``null`` or the group becomes empty after + filtering, the value of every aggregation function for that group is ``null``, too. + .. currentmodule:: pydiverse.transform.ColExpr .. autosummary:: :toctree: _generated/ diff --git a/docs/source/reference/operators/arithmetic.rst b/docs/source/reference/operators/arithmetic.rst index 9e965e6..ade1eb9 100644 --- a/docs/source/reference/operators/arithmetic.rst +++ b/docs/source/reference/operators/arithmetic.rst @@ -10,6 +10,7 @@ Arithmetic __add__ __floordiv__ + __mod__ __mul__ __neg__ __pos__ diff --git a/src/pydiverse/transform/_internal/ops/ops/arithmetic.py b/src/pydiverse/transform/_internal/ops/ops/arithmetic.py index 8c5ec74..e044af8 100644 --- a/src/pydiverse/transform/_internal/ops/ops/arithmetic.py +++ b/src/pydiverse/transform/_internal/ops/ops/arithmetic.py @@ -40,4 +40,83 @@ Signature(Decimal(), Decimal(), return_type=Decimal()), ) -floordiv = Operator("__floordiv__", Signature(Int(), Int(), return_type=Int())) +floordiv = Operator( + "__floordiv__", + Signature(Int(), Int(), return_type=Int()), + doc=""" +Integer division. + +Warning +------- +The behavior of this operator differs from polars and python. Polars and python +always round towards negative infinity, whereas pydiverse.transform always +rounds towards zero, regardless of the sign. This behavior matches the one of C, +C++ and all currently supported SQL backends. + +See also +-------- +__mod__ + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [65, -65, 65, -65], +... "b": [7, 7, -7, -7], +... } +... ) +>>> t >> mutate(r=t.a // t.b) >> export(Polars()) +shape: (4, 3) +┌─────┬─────┬─────┐ +│ a ┆ b ┆ r │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 │ +╞═════╪═════╪═════╡ +│ 65 ┆ 7 ┆ 9 │ +│ -65 ┆ 7 ┆ -9 │ +│ 65 ┆ -7 ┆ -9 │ +│ -65 ┆ -7 ┆ 9 │ +└─────┴─────┴─────┘ +""", +) + +mod = Operator( + "__mod__", + Signature(Int(), Int(), return_type=Int()), + doc=""" +Computes the remainder of integer division. + +Warning +------- +This operator behaves differently than in polars. There are at least two +conventions how `%` and :doc:`// ` +should behave for negative inputs. We follow the one that C, C++ and all +currently supported SQL backends follow. This means that the output has the same +sign as the left hand side of the input, regardless of the right hand side. + +See also +-------- +__floordiv__ + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [65, -65, 65, -65], +... "b": [7, 7, -7, -7], +... } +... ) +>>> t >> mutate(r=t.a % t.b) >> export(Polars()) +shape: (4, 3) +┌─────┬─────┬─────┐ +│ a ┆ b ┆ r │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 │ +╞═════╪═════╪═════╡ +│ 65 ┆ 7 ┆ 2 │ +│ -65 ┆ 7 ┆ -2 │ +│ 65 ┆ -7 ┆ 2 │ +│ -65 ┆ -7 ┆ -2 │ +└─────┴─────┴─────┘ +""", +) diff --git a/src/pydiverse/transform/_internal/ops/ops/numeric.py b/src/pydiverse/transform/_internal/ops/ops/numeric.py index bc7662a..6260323 100644 --- a/src/pydiverse/transform/_internal/ops/ops/numeric.py +++ b/src/pydiverse/transform/_internal/ops/ops/numeric.py @@ -4,9 +4,6 @@ from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import NUMERIC, Bool, Decimal, Float, Int -mod = Operator("__mod__", Signature(Int(), Int(), return_type=Int())) - - pow = Operator( "__pow__", Signature(Int(), Int(), return_type=Float()), diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index be1c794..42fecb7 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -343,12 +343,80 @@ def floor(self: ColExpr) -> ColExpr: return ColFn(ops.floor, self) def __floordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Integer division. + + Warning + ------- + The behavior of this operator differs from polars and python. Polars and python + always round towards negative infinity, whereas pydiverse.transform always + rounds towards zero, regardless of the sign. This behavior matches the one of C, + C++ and all currently supported SQL backends. + + See also + -------- + __mod__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... ) + >>> t >> mutate(r=t.a // t.b) >> export(Polars()) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ r │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 65 ┆ 7 ┆ 9 │ + │ -65 ┆ 7 ┆ -9 │ + │ 65 ┆ -7 ┆ -9 │ + │ -65 ┆ -7 ┆ 9 │ + └─────┴─────┴─────┘ + """ return ColFn(ops.floordiv, self, rhs) def __rfloordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Integer division. + + Warning + ------- + The behavior of this operator differs from polars and python. Polars and python + always round towards negative infinity, whereas pydiverse.transform always + rounds towards zero, regardless of the sign. This behavior matches the one of C, + C++ and all currently supported SQL backends. + + See also + -------- + __mod__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... ) + >>> t >> mutate(r=t.a // t.b) >> export(Polars()) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ r │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 65 ┆ 7 ┆ 9 │ + │ -65 ┆ 7 ┆ -9 │ + │ 65 ┆ -7 ┆ -9 │ + │ -65 ┆ -7 ┆ 9 │ + └─────┴─────┴─────┘ + """ return ColFn(ops.floordiv, rhs, self) @@ -637,12 +705,82 @@ def min( return ColFn(ops.min, self, partition_by=partition_by, filter=filter) def __mod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Computes the remainder of integer division. + + Warning + ------- + This operator behaves differently than in polars. There are at least two + conventions how `%` and :doc:`// ` + should behave for negative inputs. We follow the one that C, C++ and all + currently supported SQL backends follow. This means that the output has the same + sign as the left hand side of the input, regardless of the right hand side. + + See also + -------- + __floordiv__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... ) + >>> t >> mutate(r=t.a % t.b) >> export(Polars()) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ r │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 65 ┆ 7 ┆ 2 │ + │ -65 ┆ 7 ┆ -2 │ + │ 65 ┆ -7 ┆ 2 │ + │ -65 ┆ -7 ┆ -2 │ + └─────┴─────┴─────┘ + """ return ColFn(ops.mod, self, rhs) def __rmod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: - """""" + """ + Computes the remainder of integer division. + + Warning + ------- + This operator behaves differently than in polars. There are at least two + conventions how `%` and :doc:`// ` + should behave for negative inputs. We follow the one that C, C++ and all + currently supported SQL backends follow. This means that the output has the same + sign as the left hand side of the input, regardless of the right hand side. + + See also + -------- + __floordiv__ + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, -65, 65, -65], + ... "b": [7, 7, -7, -7], + ... } + ... ) + >>> t >> mutate(r=t.a % t.b) >> export(Polars()) + shape: (4, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ r │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 65 ┆ 7 ┆ 2 │ + │ -65 ┆ 7 ┆ -2 │ + │ 65 ┆ -7 ┆ 2 │ + │ -65 ┆ -7 ┆ -2 │ + └─────┴─────┴─────┘ + """ return ColFn(ops.mod, rhs, self) From b9beea2263648b10067e1be873f783de5aa2e2d1 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Sat, 28 Dec 2024 16:54:04 +0100 Subject: [PATCH 15/16] document group_by, ungroup, slice_head --- .../transform/_internal/pipe/verbs.py | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index 5dee5d3..9882b78 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -664,6 +664,24 @@ def group_by(table: Table, *cols: Col | ColName, add=False) -> Pipeable: ... @verb def group_by(table: Table, *cols: Col | ColName, add=False) -> Pipeable: + """ + Add a grouping state to the table. + + :param cols: + The columns to group by. + + :param add: + If ``add=True``, the given columns are added to the set of columns the table is + currently grouped by. If ``add=False``, the current grouping state is replaced + by the given columns. + + This verb does not modify the table itself, but only adds a grouping in the + background. The number of rows is only reduced when + :doc:`summarize ` + is called. The + :doc:`ungroup ` + verb can be used to clear the grouping state. + """ if len(cols) == 0: return table @@ -689,6 +707,46 @@ def ungroup() -> Pipeable: ... @verb def ungroup(table: Table) -> Pipeable: + """ + Clear the grouping state of the table. + + Examples + -------- + In the following example, ``group_by`` and ``ungroup`` are used to specify that each + aggregation function between them uses the column ``t.c`` for grouping. + + >>> t = pdt.Table( + ... { + ... "a": [1.2, 5.077, -2.29, -0.0, 3.0, -7.7], + ... "b": ["a ", "transform", "pipedag", "cdegh", " -ade ", " pq"], + ... "c": [True, False, None, None, True, True], + ... "d": [4, 4, 2, 0, 1, 0], + ... } + ... ) + >>> ( + ... t + ... >> group_by(t.c) + ... >> mutate( + ... u=t.b.str.len().max() + t.a.min(), + ... v=t.d.mean(filter=t.a >= 0), + ... ) + ... >> ungroup() + ... >> export(Polars()) + ... ) + shape: (6, 6) + ┌───────┬───────────┬───────┬─────┬────────┬─────┐ + │ a ┆ b ┆ c ┆ d ┆ u ┆ v │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ f64 ┆ str ┆ bool ┆ i64 ┆ f64 ┆ f64 │ + ╞═══════╪═══════════╪═══════╪═════╪════════╪═════╡ + │ 1.2 ┆ a ┆ true ┆ 4 ┆ -0.7 ┆ 2.5 │ + │ 5.077 ┆ transform ┆ false ┆ 4 ┆ 14.077 ┆ 4.0 │ + │ -2.29 ┆ pipedag ┆ null ┆ 2 ┆ 4.71 ┆ 0.0 │ + │ -0.0 ┆ cdegh ┆ null ┆ 0 ┆ 4.71 ┆ 0.0 │ + │ 3.0 ┆ -ade ┆ true ┆ 1 ┆ -0.7 ┆ 2.5 │ + │ -7.7 ┆ pq ┆ true ┆ 0 ┆ -0.7 ┆ 2.5 │ + └───────┴───────────┴───────┴─────┴────────┴─────┘ + """ new = copy.copy(table) new._ast = Ungroup(table._ast) new._cache = copy.copy(table._cache) @@ -702,6 +760,7 @@ def summarize(**kwargs: ColExpr) -> Pipeable: ... @verb def summarize(table: Table, **kwargs: ColExpr) -> Pipeable: + """ """ names, values = map(list, zip(*kwargs.items(), strict=True)) uuids = [uuid.uuid1() for _ in names] new = copy.copy(table) @@ -760,6 +819,27 @@ def slice_head(table: Table, n: int, *, offset: int = 0) -> Pipeable: :param offset: The index of the first row (0-based) that is included in the selection. + + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [65, 5, 312, -55, 0], + ... "b": ["l", "r", "srq", "---", " "], + ... } + ... ) + >>> t >> slice_head(3, offset=1) >> export(Polars()) + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 5 ┆ r │ + │ 312 ┆ srq │ + │ -55 ┆ --- │ + └─────┴─────┘ """ errors.check_arg_type(int, "slice_head", "n", n) From d0903828bdeea53ff7c1b979973ea478b924ecb5 Mon Sep 17 00:00:00 2001 From: Finn Rudolph Date: Thu, 2 Jan 2025 09:53:59 +0100 Subject: [PATCH 16/16] Document summarize and data types --- docs/source/reference/api.rst | 1 + docs/source/reference/types.rst | 28 +++++++++++ .../transform/_internal/pipe/verbs.py | 46 ++++++++++++++++++- .../transform/_internal/tree/types.py | 10 ++++ src/pydiverse/transform/types.py | 2 + 5 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 docs/source/reference/types.rst diff --git a/docs/source/reference/api.rst b/docs/source/reference/api.rst index 2df837b..a3ade8d 100644 --- a/docs/source/reference/api.rst +++ b/docs/source/reference/api.rst @@ -8,6 +8,7 @@ API verbs operators/index targets + types .. currentmodule:: pydiverse.transform diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst new file mode 100644 index 0000000..97b69fc --- /dev/null +++ b/docs/source/reference/types.rst @@ -0,0 +1,28 @@ +===== +Types +===== + +.. currentmodule:: pydiverse.transform +.. autosummary:: + :toctree: _generated/ + :nosignatures: + :template: autosummary/short_title.rst + + Dtype + Bool + Date + Datetime + Decimal + Float + Float32 + Float64 + Int + Int8 + Int16 + Int32 + Int64 + String + Uint8 + Uint16 + Uint32 + Uint64 diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index 9882b78..ce1f6ab 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -760,7 +760,51 @@ def summarize(**kwargs: ColExpr) -> Pipeable: ... @verb def summarize(table: Table, **kwargs: ColExpr) -> Pipeable: - """ """ + """ + Computes aggregates over groups of rows. + + :param kwargs: + Each key is the name of a new column, and its value is the column + expression defining the new column. The column expression may not contain + columns that are neither part of the grouping columns nor wrapped in an + aggregation function. + + + In contrast to :doc:`pydiverse.transform.mutate`, this verb in general reduces the + number of rows and only keeps the grouping columns and the new columns defined in + the kwargs. One row for each unique combination of values in the grouping columns + is created. + + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [1.2, 5.077, -2.29, -0.0, 3.0, -7.7], + ... "b": ["a ", "transform", "pipedag", "cdegh", " -ade ", " pq"], + ... "c": [True, False, None, None, True, True], + ... } + ... ) + >>> ( + ... t + ... >> group_by(t.c) + ... >> summarize( + ... u=t.b.str.len().mean(), + ... v=t.a.sum(filter=t.a >= 0), + ... ) + ... >> export(Polars()) + ... ) + shape: (3, 3) + ┌───────┬──────────┬───────┐ + │ c ┆ u ┆ v │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ f64 ┆ f64 │ + ╞═══════╪══════════╪═══════╡ + │ true ┆ 4.666667 ┆ 4.2 │ + │ null ┆ 6.0 ┆ -0.0 │ + │ false ┆ 9.0 ┆ 5.077 │ + └───────┴──────────┴───────┘ + """ names, values = map(list, zip(*kwargs.items(), strict=True)) uuids = [uuid.uuid1() for _ in names] new = copy.copy(table) diff --git a/src/pydiverse/transform/_internal/tree/types.py b/src/pydiverse/transform/_internal/tree/types.py index b9466d1..ba94124 100644 --- a/src/pydiverse/transform/_internal/tree/types.py +++ b/src/pydiverse/transform/_internal/tree/types.py @@ -5,6 +5,10 @@ class Dtype: + """ + Base class for all data types. + """ + __slots__ = ("const",) def __init__(self, *, const: bool = False): @@ -37,9 +41,15 @@ def __repr__(self) -> str: return ("const " if self.const else "") + self.__class__.__name__ def with_const(self) -> Dtype: + """ + Adds a `const` modifier from the data type. + """ return type(self)(const=True) def without_const(self) -> Dtype: + """ + Removes a `const` modifier from the data type (if present). + """ return type(self)() def converts_to(self, target: Dtype) -> bool: diff --git a/src/pydiverse/transform/types.py b/src/pydiverse/transform/types.py index 4e2e7b5..8f2383c 100644 --- a/src/pydiverse/transform/types.py +++ b/src/pydiverse/transform/types.py @@ -5,6 +5,7 @@ Date, Datetime, Decimal, + Dtype, Float, Float32, Float64, @@ -21,6 +22,7 @@ ) __all__ = [ + "Dtype", "Bool", "Date", "Datetime",