Skip to content

Commit

Permalink
Normalize sorting options (#771)
Browse files Browse the repository at this point in the history
* first attempt at normalizing sorting options

* use consistent name

* use the same wording in all docs

* actually use df_arrange

* note about `:parallel` in docs

* remove unused multithreaded option from _with funs

* change the defaults on :parallel and :stable

* update the option docs to reflect the new defaults

also simplify the wording on the parallel option
  • Loading branch information
billylanchantin authored Dec 14, 2023
1 parent 6a8d4d4 commit b80fd1a
Show file tree
Hide file tree
Showing 15 changed files with 265 additions and 123 deletions.
5 changes: 3 additions & 2 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,9 @@ defmodule Explorer.Backend.DataFrame do
df,
out_df :: df(),
directions :: [{:asc | :desc, lazy_series()}],
nulls_last :: boolean(),
maintain_order :: boolean()
maintain_order? :: boolean(),
multithreaded? :: boolean(),
nulls_last? :: boolean()
) :: df
@callback distinct(df, out_df :: df(), columns :: [column_name()]) :: df
@callback rename(df, out_df :: df(), [{old :: column_name(), new :: column_name()}]) :: df
Expand Down
12 changes: 6 additions & 6 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ defmodule Explorer.Backend.LazySeries do
# Transformation
column: 1,
reverse: 1,
argsort: 3,
sort: 3,
argsort: 5,
sort: 5,
distinct: 1,
unordered_distinct: 1,
slice: 2,
Expand Down Expand Up @@ -221,16 +221,16 @@ defmodule Explorer.Backend.LazySeries do
end

@impl true
def argsort(%Series{} = s, descending?, nils_last?) do
args = [lazy_series!(s), descending?, nils_last?]
# Records an `:argsort` operation on the lazy series instead of returning
# concrete values. The wrapped series and the four sort flags are stored, in
# this order, as the operation's args. The result is tagged with the
# `:integer` dtype both on the lazy node and on the returned series.
def argsort(%Series{} = s, descending?, maintain_order?, multithreaded?, nulls_last?) do
args = [lazy_series!(s), descending?, maintain_order?, multithreaded?, nulls_last?]
data = new(:argsort, args, :integer, aggregations?(args))

Backend.Series.new(data, :integer)
end

@impl true
def sort(%Series{} = s, descending?, nils_last?) do
args = [lazy_series!(s), descending?, nils_last?]
def sort(%Series{} = s, descending?, maintain_order?, multithreaded?, nulls_last?) do
args = [lazy_series!(s), descending?, maintain_order?, multithreaded?, nulls_last?]
data = new(:sort, args, s.dtype, aggregations?(args))

Backend.Series.new(data, s.dtype)
Expand Down
16 changes: 14 additions & 2 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,20 @@ defmodule Explorer.Backend.Series do

# Sort

@callback sort(s, descending? :: boolean(), nils_last :: boolean()) :: s
@callback argsort(s, descending? :: boolean(), nils_last :: boolean()) :: s
# Backend contract for sorting a series. All four flags are positional
# booleans and must be passed in exactly this order.
# NOTE(review): the flags presumably correspond to the user-facing
# `:direction`, `:stable`, `:parallel` and `:nils` options - confirm against
# the shared sort-option validation helper.
@callback sort(
s,
descending? :: boolean(),
maintain_order? :: boolean(),
multithreaded? :: boolean(),
nulls_last? :: boolean()
) :: s
# Backend contract for argsort. It takes the same four positional boolean
# flags, in the same order, as the `sort` callback.
# NOTE(review): presumably returns the indices that would sort the series -
# confirm against the backend implementations.
@callback argsort(
s,
descending? :: boolean(),
maintain_order? :: boolean(),
multithreaded? :: boolean(),
nulls_last? :: boolean()
) :: s
@callback reverse(s) :: s

# Distinct
Expand Down
55 changes: 24 additions & 31 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3052,14 +3052,20 @@ defmodule Explorer.DataFrame do
## Options
* `:nils` - `:first | :last`.
Determines if `nil`s get sorted first or last in the result.
Default is `:last`.
* `:nils` - `:first` or `:last`.
By default it is `:last` if direction is `:asc`, and `:first` otherwise.
* `:stable` - `boolean()`.
* `:parallel` - boolean.
Whether to parallelize the sorting.
By default it is `true`.
Parallel sort isn't available on certain lazy operations.
In those situations this option is ignored.
* `:stable` - boolean.
Determines if the sorting is stable (ties are guaranteed to maintain their order) or not.
Unstable sorting may be more performant.
Default is `true`.
By default it is `false`.
## Examples
Expand Down Expand Up @@ -3157,14 +3163,20 @@ defmodule Explorer.DataFrame do
## Options
* `:nils` - `:first | :last`.
Determines if `nil`s get sorted first or last in the result.
Default is `:last`.
* `:nils` - `:first` or `:last`.
By default it is `:last` if direction is `:asc`, and `:first` otherwise.
* `:parallel` - boolean.
Whether to parallelize the sorting.
By default it is `true`.
Parallel sort isn't available on certain lazy operations.
In those situations this option is ignored.
* `:stable` - `boolean()`.
* `:stable` - boolean.
Determines if the sorting is stable (ties are guaranteed to maintain their order) or not.
Unstable sorting may be more performant.
Default is `true`.
By default it is `false`.
## Examples
Expand Down Expand Up @@ -3230,21 +3242,7 @@ defmodule Explorer.DataFrame do
opts :: [nils: :first | :last, stable: boolean()]
) :: DataFrame.t()
def arrange_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun, 1) do
opts = Keyword.validate!(opts, nils: :last, stable: true)

nulls_last =
case opts[:nils] do
:first -> false
:last -> true
_ -> raise ArgumentError, "`nils` must be `:first` or `:last`"
end

maintain_order =
case opts[:stable] do
true -> true
false -> false
_ -> raise ArgumentError, "`stable` must be `true` or `false`"
end
[_descending? | opts] = Shared.validate_sort_options!(opts)

ldf = Explorer.Backend.LazyFrame.new(df)

Expand All @@ -3270,12 +3268,7 @@ defmodule Explorer.DataFrame do
raise "not a valid lazy series or arrange instruction: #{inspect(other)}"
end)

Shared.apply_impl(df, :arrange_with, [
df,
dir_and_lazy_series_pairs,
nulls_last,
maintain_order
])
Shared.apply_impl(df, :arrange_with, [df, dir_and_lazy_series_pairs] ++ opts)
end

@doc """
Expand Down
54 changes: 38 additions & 16 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -643,22 +643,44 @@ defmodule Explorer.PolarsBackend.DataFrame do
end

@impl true
def arrange_with(%DataFrame{} = df, out_df, column_pairs, nulls_last, maintain_order) do
{directions, expressions} =
column_pairs
|> Enum.map(fn {direction, lazy_series} ->
expr = to_expr(lazy_series)
{direction == :desc, expr}
end)
|> Enum.unzip()

Shared.apply_dataframe(df, out_df, :df_arrange_with, [
expressions,
directions,
nulls_last,
maintain_order,
df.groups
])
# Sorts the data frame by the given `{direction, lazy_series}` pairs.
#
# Each direction is converted to a boolean with `dir == :desc`. The three
# option flags must already be booleans (enforced by the guard below).
def arrange_with(
%DataFrame{} = df,
out_df,
column_pairs,
maintain_order?,
multithreaded?,
nulls_last?
)
when is_boolean(maintain_order?) and is_boolean(multithreaded?) and
is_boolean(nulls_last?) do
# When every sort key is a bare column reference (`op == :column`), sort by
# column name through `:df_arrange`. This is the only branch that forwards
# `multithreaded?`.
if Enum.all?(column_pairs, fn {_, %{op: op}} -> op == :column end) do
{directions, column_names} =
column_pairs
|> Enum.map(fn {dir, %{args: [col]}} -> {dir == :desc, col} end)
|> Enum.unzip()

Shared.apply_dataframe(df, out_df, :df_arrange, [
column_names,
directions,
maintain_order?,
multithreaded?,
nulls_last?,
df.groups
])
else
# At least one key is a computed expression: convert every lazy series with
# `to_expr/1` and call `:df_arrange_with`. `multithreaded?` is not passed
# here, so it has no effect on this branch.
{directions, expressions} =
column_pairs
|> Enum.map(fn {dir, lazy_series} -> {dir == :desc, to_expr(lazy_series)} end)
|> Enum.unzip()

Shared.apply_dataframe(df, out_df, :df_arrange_with, [
expressions,
directions,
maintain_order?,
nulls_last?,
df.groups
])
end
end

@impl true
Expand Down
4 changes: 2 additions & 2 deletions lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ defmodule Explorer.PolarsBackend.Expression do

@first_only_expressions [
quantile: 2,
argsort: 3,
sort: 3,
argsort: 5,
sort: 5,
head: 2,
tail: 2,
peaks: 2,
Expand Down
17 changes: 13 additions & 4 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,16 @@ defmodule Explorer.PolarsBackend.LazyFrame do
end

@impl true
def arrange_with(%DF{groups: []} = df, out_df, column_pairs, nulls_last, maintain_order) do
def arrange_with(
%DF{groups: []} = df,
out_df,
column_pairs,
maintain_order?,
multithreaded?,
nulls_last?
)
when is_boolean(maintain_order?) and is_boolean(multithreaded?) and
is_boolean(nulls_last?) do
{directions, expressions} =
column_pairs
|> Enum.map(fn {direction, lazy_series} -> {direction == :desc, to_expr(lazy_series)} end)
Expand All @@ -363,13 +372,13 @@ defmodule Explorer.PolarsBackend.LazyFrame do
Shared.apply_dataframe(df, out_df, :lf_arrange_with, [
expressions,
directions,
nulls_last,
maintain_order
maintain_order?,
nulls_last?
])
end

@impl true
def arrange_with(_df, _out_df, _directions, _nulls_last, _maintain_order) do
# Fallback clause: always raises. It is reached when the preceding clause's
# `groups: []` pattern or its boolean guards do not match.
# NOTE(review): the message assumes the mismatch is caused by groups; a
# non-boolean flag would presumably land here too - confirm.
def arrange_with(_df, _out_df, _directions, _maintain_order?, _multithreaded?, _nulls_last?) do
raise "arrange_with/2 with groups is not supported yet for lazy frames"
end

Expand Down
28 changes: 23 additions & 5 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,20 @@ defmodule Explorer.PolarsBackend.Native do
defstruct [:inner]

def df_from_arrow_stream_pointer(_stream_ptr), do: err()
def df_arrange(_df, _by, _reverse, _nulls_last, _maintain_order, _groups), do: err()

def df_arrange_with(_df, _expressions, _directions, _nulls_last, _maintain_order, _groups),
# Stub for the native `df_arrange`; the Elixir body only calls `err()`.
# The flags come in the order maintain_order?, multithreaded?, nulls_last?,
# followed by the groups.
# NOTE(review): presumably replaced by the native implementation when the
# library is loaded - confirm.
def df_arrange(_df, _by, _reverse, _maintain_order?, _multithreaded?, _nulls_last?, _groups),
do: err()

# Stub for the native `df_arrange_with`; the Elixir body only calls `err()`.
# Unlike `df_arrange`, this signature has no multithreaded flag: only
# maintain_order? and nulls_last? are accepted before the groups.
def df_arrange_with(
_df,
_expressions,
_directions,
_maintain_order?,
_nulls_last?,
_groups
),
do: err()

def df_concat_columns(_df, _others), do: err()
def df_concat_rows(_df, _others), do: err()
def df_distinct(_df, _subset, _selection), do: err()
Expand Down Expand Up @@ -236,7 +245,16 @@ defmodule Explorer.PolarsBackend.Native do
do: err()

def lf_filter_with(_df, _expression), do: err()
def lf_arrange_with(_df, _expressions, _directions, _nulls_last, _maintain_order), do: err()

# Stub for the native lazy-frame `lf_arrange_with`; the Elixir body only
# calls `err()`. It takes maintain_order? and nulls_last? (in that order) and
# has neither a multithreaded flag nor a groups argument.
def lf_arrange_with(
_df,
_expressions,
_directions,
_maintain_order?,
_nulls_last?
),
do: err()

def lf_distinct(_df, _subset, _selection), do: err()
def lf_mutate_with(_df, _exprs), do: err()
def lf_summarise_with(_df, _groups, _aggs), do: err()
Expand All @@ -258,7 +276,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_any(_s), do: err()
def s_argmax(_s), do: err()
def s_argmin(_s), do: err()
def s_argsort(_s, _descending?, _nils_last?), do: err()
# Stub for the native `s_argsort`; the Elixir body only calls `err()`.
# Flag order: descending?, maintain_order?, multithreaded?, nulls_last?.
def s_argsort(_s, _descending?, _maintain_order?, _multithreaded?, _nulls_last?), do: err()
def s_cast(_s, _dtype), do: err()
def s_categories(_s), do: err()
def s_categorise(_s, _s_categories), do: err()
Expand Down Expand Up @@ -367,7 +385,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_slice(_s, _offset, _length), do: err()
def s_slice_by_indices(_s, _indices), do: err()
def s_slice_by_series(_s, _series), do: err()
def s_sort(_s, _descending?, _nils_last?), do: err()
# Stub for the native `s_sort`; the Elixir body only calls `err()`.
# Flag order: descending?, maintain_order?, multithreaded?, nulls_last?.
def s_sort(_s, _descending?, _maintain_order?, _multithreaded?, _nulls_last?), do: err()
def s_standard_deviation(_s, _ddof), do: err()
def s_strip(_s, _string), do: err()
def s_subtract(_s, _other), do: err()
Expand Down
24 changes: 18 additions & 6 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -407,15 +407,27 @@ defmodule Explorer.PolarsBackend.Series do
# Sort

@impl true
def sort(series, descending?, nils_last?)
when is_boolean(descending?) and is_boolean(nils_last?) do
Shared.apply_series(series, :s_sort, [descending?, nils_last?])
# Sorts the series by delegating to the native `:s_sort` through
# `Shared.apply_series/3`. The four flags are forwarded unchanged and in the
# same order they are received. The guard only admits boolean flags.
def sort(series, descending?, maintain_order?, multithreaded?, nulls_last?)
when is_boolean(descending?) and is_boolean(maintain_order?) and is_boolean(multithreaded?) and
is_boolean(nulls_last?) do
Shared.apply_series(series, :s_sort, [
descending?,
maintain_order?,
multithreaded?,
nulls_last?
])
end

@impl true
def argsort(series, descending?, nils_last?)
when is_boolean(descending?) and is_boolean(nils_last?) do
Shared.apply_series(series, :s_argsort, [descending?, nils_last?])
# Delegates to the native `:s_argsort` through `Shared.apply_series/3`.
# The four flags are forwarded unchanged and in the same order they are
# received. The guard only admits boolean flags.
def argsort(series, descending?, maintain_order?, multithreaded?, nulls_last?)
when is_boolean(descending?) and is_boolean(maintain_order?) and is_boolean(multithreaded?) and
is_boolean(nulls_last?) do
Shared.apply_series(series, :s_argsort, [
descending?,
maintain_order?,
multithreaded?,
nulls_last?
])
end

@impl true
Expand Down
Loading

0 comments on commit b80fd1a

Please sign in to comment.