From edda56c697b1c848daa658b5660bf0d8199c855c Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 9 Oct 2024 11:21:21 +0000 Subject: [PATCH] Remove superclass init calls This is marginally faster, and makes it clearer that the base classes are abstract. --- python/cudf_polars/cudf_polars/dsl/expr.py | 39 +++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 80 ++++++++++------------ 2 files changed, 55 insertions(+), 64 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index b4434de8c5d..7099a781e4b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -79,10 +79,6 @@ class Expr(Node): """Data type of the expression.""" children: tuple[Expr, ...] = () - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - def do_evaluate( self, df: DataFrame, @@ -271,7 +267,7 @@ class Literal(Expr): children: tuple[()] def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) + self.dtype = dtype assert value.type == plc.interop.to_arrow(dtype) self.value = value @@ -298,7 +294,7 @@ class LiteralColumn(Expr): children: tuple[()] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) + self.dtype = dtype data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) @@ -355,6 +351,9 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Len(Expr): children: tuple[()] + def __init__(self, dtype: plc.DataType) -> None: + self.dtype = dtype + def do_evaluate( self, df: DataFrame, @@ -392,7 +391,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children @@ -631,7 +630,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children @@ -887,7 +886,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children @@ -992,7 +991,7 @@ class UnaryFunction(Expr): def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.name = name self.options = options self.children = children @@ -1231,7 +1230,7 @@ class Sort(Expr): def __init__( self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (column,) @@ -1271,7 +1270,7 @@ def __init__( column: Expr, *by: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (column, *by) @@ -1304,7 +1303,7 @@ class Gather(Expr): children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (values, indices) def do_evaluate( @@ -1346,7 +1345,7 @@ class Filter(Expr): children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) + self.dtype = dtype self.children = (values, indices) def do_evaluate( @@ -1373,7 +1372,7 @@ class RollingWindow(Expr): children: tuple[Expr] def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (agg,) raise NotImplementedError("Rolling window not implemented") @@ -1385,7 +1384,7 @@ class GroupedRollingWindow(Expr): children: tuple[Expr, ...] def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (agg, *by) raise NotImplementedError("Grouped rolling window not implemented") @@ -1397,7 +1396,7 @@ class Cast(Expr): children: tuple[Expr] def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (value,) if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( @@ -1431,7 +1430,7 @@ class Agg(Expr): def __init__( self, dtype: plc.DataType, name: str, options: Any, *children: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.name = name self.options = options self.children = children @@ -1631,7 +1630,7 @@ class Ternary(Expr): def __init__( self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (when, then, otherwise) def do_evaluate( @@ -1663,7 +1662,7 @@ def __init__( left: Expr, right: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype if plc.traits.is_boolean(self.dtype): # For boolean output types, bitand and bitor implement # boolean logic, so translate. bitxor also does, but the diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index b2edf0084bc..ee47f154a35 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -127,12 +127,9 @@ class IR(Node): __slots__ = ("schema",) _non_child: ClassVar[tuple[str, ...]] = ("schema",) - children: tuple[IR, ...] = () schema: Schema """Mapping from column names to their data types.""" - - def __init__(self, schema: Schema) -> None: - self.schema = schema + children: tuple[IR, ...] = () def get_hash(self) -> int: """Hash of node, treating schema dictionary.""" @@ -179,7 +176,7 @@ class PythonScan(IR): """Filter to apply to the constructed dataframe before returning it.""" def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None): - super().__init__(schema) + self.schema = schema self.options = options self.predicate = predicate raise NotImplementedError("PythonScan not implemented") @@ -230,23 +227,6 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def get_hash(self) -> int: - """Hash of the node.""" - return hash( - ( - type(self), - self.typ, - json.dumps(self.reader_options), - json.dumps(self.cloud_options), - tuple(self.paths), - tuple(self.with_columns) if self.with_columns is not None else None, - self.skip_rows, - self.n_rows, - self.row_index, - self.predicate, - ) - ) - def __init__( self, schema: Schema, @@ -260,7 +240,7 @@ def __init__( row_index: tuple[str, int] | None, predicate: expr.NamedExpr | None, ): - super().__init__(schema) + self.schema = schema self.typ = typ self.reader_options = reader_options self.cloud_options = cloud_options @@ -329,6 +309,23 @@ def __init__( "Reading only parquet metadata to produce row index." ) + def get_hash(self) -> int: + """Hash of the node.""" + return hash( + ( + type(self), + self.typ, + json.dumps(self.reader_options), + json.dumps(self.cloud_options), + tuple(self.paths), + tuple(self.with_columns) if self.with_columns is not None else None, + self.skip_rows, + self.n_rows, + self.row_index, + self.predicate, + ) + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" with_columns = self.with_columns @@ -482,14 +479,13 @@ class Cache(IR): __slots__ = ("key", "children") _non_child = ("schema", "key") children: tuple[IR] - key: int """The cache key.""" value: IR """The unevaluated node to cache.""" def __init__(self, schema: Schema, key: int, value: IR): - super().__init__(schema) + self.schema = schema self.key = key self.children = (value,) @@ -511,7 +507,6 @@ class DataFrameScan(IR): __slots__ = ("df", "projection", "predicate") _non_child = ("schema", "df", "projection", "predicate") - df: Any """Polars LazyFrame object.""" projection: tuple[str, ...] | None @@ -526,7 +521,7 @@ def __init__( projection: Sequence[str] | None, predicate: expr.NamedExpr | None, ): - super().__init__(schema) + self.schema = schema self.df = df self.projection = tuple(projection) if projection is not None else None self.predicate = predicate @@ -562,7 +557,6 @@ class Select(IR): __slots__ = ("exprs", "children", "should_broadcast") _non_child = ("schema", "exprs", "should_broadcast") children: tuple[IR] - df: IR """Input dataframe.""" exprs: tuple[expr.NamedExpr, ...] @@ -577,7 +571,7 @@ def __init__( should_broadcast: bool, # noqa: FBT001 df: IR, ): - super().__init__(schema) + self.schema = schema self.exprs = tuple(exprs) self.should_broadcast = should_broadcast self.children = (df,) @@ -611,7 +605,7 @@ class Reduce(IR): def __init__( self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR ): # pragma: no cover; polars doesn't emit this node yet - super().__init__(schema) + self.schema = schema self.exprs = tuple(exprs) self.children = (df,) @@ -649,7 +643,7 @@ def __init__( options: Any, df: IR, ): - super().__init__(schema) + self.schema = schema self.keys = tuple(keys) self.agg_requests = tuple(agg_requests) self.maintain_order = maintain_order @@ -819,7 +813,7 @@ def __init__( left: IR, right: IR, ): - super().__init__(schema) + self.schema = schema self.left_on = tuple(left_on) self.right_on = tuple(right_on) self.options = options @@ -1026,7 +1020,7 @@ def __init__( should_broadcast: bool, # noqa: FBT001 df: IR, ): - super().__init__(schema) + self.schema = schema self.columns = tuple(columns) self.should_broadcast = should_broadcast self.children = (df,) @@ -1066,7 +1060,7 @@ def __init__( stable: bool, # noqa: FBT001 df: IR, ): - super().__init__(schema) + self.schema = schema self.keep = keep self.subset = subset self.zlice = zlice @@ -1139,7 +1133,7 @@ def __init__( zlice: tuple[int, int] | None, df: IR, ): - super().__init__(schema) + self.schema = schema self.by = tuple(by) self.order = tuple(order) self.null_order = tuple(null_order) @@ -1189,16 +1183,14 @@ class Slice(IR): __slots__ = ("offset", "length", "children") _non_child = ("schema", "offset", "length") - - df: IR - """Input.""" + children: tuple[IR] offset: int """Start of the slice.""" length: int """Length of the slice.""" def __init__(self, schema: Schema, offset: int, length: int, df: IR): - super().__init__(schema) + self.schema = schema self.offset = offset self.length = length self.children = (df,) @@ -1218,7 +1210,7 @@ class Filter(IR): children: tuple[IR] def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): - super().__init__(schema) + self.schema = schema self.mask = mask self.children = (df,) @@ -1238,7 +1230,7 @@ class Projection(IR): children: tuple[IR] def __init__(self, schema: Schema, df: IR): - super().__init__(schema) + self.schema = schema self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: @@ -1274,7 +1266,7 @@ class MapFunction(IR): ) def __init__(self, schema: Schema, name: str, options: Any, df: IR): - super().__init__(schema) + self.schema = schema self.name = name self.options = options self.children = (df,) @@ -1380,7 +1372,7 @@ class Union(IR): _non_child = ("schema", "zlice") def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): - super().__init__(schema) + self.schema = schema self.zlice = zlice self.children = children schema = self.children[0].schema @@ -1403,7 +1395,7 @@ class HConcat(IR): _non_child = ("schema",) def __init__(self, schema: Schema, *children: IR): - super().__init__(schema) + self.schema = schema self.children = children @staticmethod