From 8f2ced12282de4043262c7d317ce709e226b1f19 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 11 Oct 2023 07:43:34 -0400 Subject: [PATCH] feat(api): move analytic window functions to top-level --- ibis/backends/base/sql/alchemy/registry.py | 8 +- ibis/expr/api.py | 92 ++++++++++++++++++++++ ibis/expr/operations/analytic.py | 9 +-- ibis/expr/types/generic.py | 58 ++++++++++---- 4 files changed, 142 insertions(+), 25 deletions(-) diff --git a/ibis/backends/base/sql/alchemy/registry.py b/ibis/backends/base/sql/alchemy/registry.py index 65bd7221e22c..cbd17837cf82 100644 --- a/ibis/backends/base/sql/alchemy/registry.py +++ b/ibis/backends/base/sql/alchemy/registry.py @@ -675,10 +675,10 @@ class array_filter(FunctionElement): ops.FirstValue: unary(sa.func.first_value), ops.LastValue: unary(sa.func.last_value), ops.RowNumber: fixed_arity(sa.func.row_number, 0), - ops.DenseRank: unary(lambda _: sa.func.dense_rank()), - ops.MinRank: unary(lambda _: sa.func.rank()), - ops.PercentRank: unary(lambda _: sa.func.percent_rank()), - ops.CumeDist: unary(lambda _: sa.func.cume_dist()), + ops.DenseRank: fixed_arity(sa.func.dense_rank, 0), + ops.MinRank: fixed_arity(sa.func.rank, 0), + ops.PercentRank: fixed_arity(sa.func.percent_rank, 0), + ops.CumeDist: fixed_arity(sa.func.cume_dist, 0), ops.NthValue: _nth_value, ops.WindowFunction: _window_function, } diff --git a/ibis/expr/api.py b/ibis/expr/api.py index 3303562da1ab..6b384566f0ae 100644 --- a/ibis/expr/api.py +++ b/ibis/expr/api.py @@ -57,6 +57,11 @@ "connect", "cross_join", "cumulative_window", + "cume_dist", + "rank", + "ntile", + "dense_rank", + "percent_rank", "date", "desc", "decompile", @@ -1021,6 +1026,93 @@ def now() -> ir.TimestampScalar: return ops.TimestampNow().to_expr() +def rank() -> ir.IntegerColumn: + """Compute position of first element within each equal-value group in sorted order. + + Equivalent to SQL's `RANK()` window function. + + Returns + ------- + Int64Column + The min rank + + Examples + -------- + >>> import ibis + >>> ibis.options.interactive = True + >>> t = ibis.memtable({"values": [1, 2, 1, 2, 3, 2]}) + >>> t.mutate(rank=ibis.rank().over(order_by=t.values)) + ┏━━━━━━━━┳━━━━━━━┓ + ┃ values ┃ rank ┃ + ┡━━━━━━━━╇━━━━━━━┩ + │ int64 │ int64 │ + ├────────┼───────┤ + │ 1 │ 0 │ + │ 1 │ 0 │ + │ 2 │ 2 │ + │ 2 │ 2 │ + │ 2 │ 2 │ + │ 3 │ 5 │ + └────────┴───────┘ + """ + return ops.MinRank().to_expr() + + +def dense_rank() -> ir.IntegerColumn: + """Position of first element within each group of equal values. + + Values are returned in sorted order and duplicate values are ignored. + + Equivalent to SQL's `DENSE_RANK()`. + + Returns + ------- + IntegerColumn + The rank + + Examples + -------- + >>> import ibis + >>> ibis.options.interactive = True + >>> t = ibis.memtable({"values": [1, 2, 1, 2, 3, 2]}) + >>> t.mutate(rank=ibis.dense_rank().over(order_by=t.values)) + ┏━━━━━━━━┳━━━━━━━┓ + ┃ values ┃ rank ┃ + ┡━━━━━━━━╇━━━━━━━┩ + │ int64 │ int64 │ + ├────────┼───────┤ + │ 1 │ 0 │ + │ 1 │ 0 │ + │ 2 │ 1 │ + │ 2 │ 1 │ + │ 2 │ 1 │ + │ 3 │ 2 │ + └────────┴───────┘ + """ + return ops.DenseRank().to_expr() + + +def percent_rank() -> ir.FloatingColumn: + """Return the relative rank of the values in the column.""" + return ops.PercentRank().to_expr() + + +def cume_dist() -> ir.FloatingColumn: + """Return the cumulative distribution over a window.""" + return ops.CumeDist().to_expr() + + +def ntile(buckets: int | ir.IntegerValue) -> ir.IntegerColumn: + """Return the integer number of a partitioning of the column values. + + Parameters + ---------- + buckets + Number of buckets to partition into + """ + return ops.NTile(buckets).to_expr() + + def row_number() -> ir.IntegerColumn: """Return an analytic function expression for the current row number. diff --git a/ibis/expr/operations/analytic.py b/ibis/expr/operations/analytic.py index 444dcfec930c..37b75834b7ef 100644 --- a/ibis/expr/operations/analytic.py +++ b/ibis/expr/operations/analytic.py @@ -45,12 +45,12 @@ class RankBase(Analytic): @public class MinRank(RankBase): - arg: Column[dt.Any] + pass @public class DenseRank(RankBase): - arg: Column[dt.Any] + pass @public @@ -76,21 +76,16 @@ class RowNumber(RankBase): @public class PercentRank(Analytic): - arg: Column[dt.Any] - dtype = dt.double @public class CumeDist(Analytic): - arg: Column[dt.Any] - dtype = dt.double @public class NTile(Analytic): - arg: Column[dt.Any] buckets: Scalar[dt.Integer] dtype = dt.int64 diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index d6fc93a655cd..e9501273613e 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -1765,7 +1765,13 @@ def rank(self) -> ir.IntegerColumn: │ 3 │ 5 │ └────────┴───────┘ """ - return ops.MinRank(self).to_expr() + import ibis.expr.analysis as an + + return ( + ibis.rank() + .over(order_by=self) + .resolve(an.find_first_base_table(self.op()).to_expr()) + ) def dense_rank(self) -> ir.IntegerColumn: """Position of first element within each group of equal values. @@ -1798,15 +1804,49 @@ def dense_rank(self) -> ir.IntegerColumn: │ 3 │ 2 │ └────────┴───────┘ """ - return ops.DenseRank(self).to_expr() + import ibis.expr.analysis as an + + return ( + ibis.dense_rank() + .over(order_by=self) + .resolve(an.find_first_base_table(self.op()).to_expr()) + ) def percent_rank(self) -> Column: """Return the relative rank of the values in the column.""" - return ops.PercentRank(self).to_expr() + import ibis.expr.analysis as an + + return ( + ibis.percent_rank() + .over(order_by=self) + .resolve(an.find_first_base_table(self.op()).to_expr()) + ) def cume_dist(self) -> Column: """Return the cumulative distribution over a window.""" - return ops.CumeDist(self).to_expr() + import ibis.expr.analysis as an + + return ( + ibis.cume_dist() + .over(order_by=self) + .resolve(an.find_first_base_table(self.op()).to_expr()) + ) + + def ntile(self, buckets: int | ir.IntegerValue) -> ir.IntegerColumn: + """Return the integer number of a partitioning of the column values. + + Parameters + ---------- + buckets + Number of buckets to partition into + """ + import ibis.expr.analysis as an + + return ( + ibis.ntile(buckets) + .over(order_by=self) + .resolve(an.find_first_base_table(self.op()).to_expr()) + ) def cummin(self, *, where=None, group_by=None, order_by=None) -> Column: """Return the cumulative min over a window.""" @@ -1852,16 +1892,6 @@ def lead( """ return ops.Lead(self, offset, default).to_expr() - def ntile(self, buckets: int | ir.IntegerValue) -> ir.IntegerColumn: - """Return the integer number of a partitioning of the column values. - - Parameters - ---------- - buckets - Number of buckets to partition into - """ - return ops.NTile(self, buckets).to_expr() - def nth(self, n: int | ir.IntegerValue) -> Column: """Return the `n`th value (0-indexed) over a window.