Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filter and filter_with to Series #728

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1405,6 +1405,117 @@ defmodule Explorer.Series do
@spec at_every(series :: Series.t(), every_n :: integer()) :: Series.t()
def at_every(series, every_n) do
  # Hands the call off to `apply_series` under the `:at_every` operation name,
  # forwarding `every_n` as the only extra argument.
  apply_series(series, :at_every, [every_n])
end

@doc """
Picks values based on an `Explorer.Query`.

The query is compiled and runs efficiently against the series.
The query must return a boolean expression or a list of boolean expressions.
When a list is returned, they are joined as `and` expressions.

> #### Notice {: .notice}
>
> This is a macro. You must `require Explorer.Series` before using it.

Besides element-wise series operations, you can also use window functions
and aggregations inside comparisons.

See `filter_with/2` for a callback version of this function without
`Explorer.Query`.

## Syntax

> #### Notice {: .notice}
>
> This macro uses the special `_` syntax.

DataFrames have named columns, so their queries use column names as variables:

    iex> require Explorer.DataFrame
    iex> df = Explorer.DataFrame.new(col_name: [1, 2, 3])
    iex> Explorer.DataFrame.filter(df, col_name > 2)
    #Explorer.DataFrame<
      Polars[1 x 1]
      col_name integer [3]
    >

Series have no named columns.
(A series constitutes a single column, so no name is required.)
This means their queries can't use column names as variables.
Instead, series queries use the special `_` variable like so:

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, _ > 2)
    #Explorer.Series<
      Polars[1]
      integer [3]
    >

## Examples

    iex> s = Explorer.Series.from_list(["a", "b", "c"])
    iex> Explorer.Series.filter(s, _ == "b")
    #Explorer.Series<
      Polars[1]
      string ["b"]
    >

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, remainder(_, 2) == 1)
    #Explorer.Series<
      Polars[2]
      integer [1, 3]
    >

Returning a non-boolean expression errors:

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, cumulative_max(_))
    ** (ArgumentError) expecting the function to return a boolean LazySeries, but instead it returned a LazySeries of type :integer

Which can be addressed by converting it to boolean:

    iex> s = Explorer.Series.from_list([1, 2, 3])
    iex> Explorer.Series.filter(s, cumulative_max(_) == 1)
    #Explorer.Series<
      Polars[1]
      integer [1]
    >
"""
@doc type: :element_wise
defmacro filter(series, query) do
  quote do
    require Explorer.Query

    # The series travels through a one-column dataframe whose column is
    # named `_`; the filtered column is pulled back out under that same name.
    Explorer.DataFrame.new([{:_, unquote(series)}])
    |> Explorer.DataFrame.filter_with(Explorer.Query.query(unquote(query)))
    |> Explorer.DataFrame.pull(:_)
  end
end

@doc """
Keeps the entries of a series selected by a callback.

The callback is given the series and must return a boolean lazy series;
the result is the series restricted by that callback's return value.

## Examples

    iex> series = Explorer.Series.from_list([1, 2, 3])
    iex> is_odd = fn s -> s |> Explorer.Series.remainder(2) |> Explorer.Series.equal(1) end
    iex> Explorer.Series.filter_with(series, is_odd)
    #Explorer.Series<
      Polars[2]
      integer [1, 3]
    >
"""
@doc type: :element_wise
@spec filter_with(
        series :: Series.t(),
        fun :: (Series.t() -> Series.lazy_t())
      ) :: Series.t()
def filter_with(%Series{} = series, fun) when is_function(fun, 1) do
  # Wrap the series in a one-column dataframe so the dataframe-level
  # `filter_with/2` can do the work, then unwrap the surviving column.
  frame = Explorer.DataFrame.new(series: series)

  kept =
    Explorer.DataFrame.filter_with(frame, fn df ->
      fun.(df[:series])
    end)

  Explorer.DataFrame.pull(kept, :series)
end

@doc """
Filters a series with a mask.

Expand Down
48 changes: 48 additions & 0 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2727,6 +2727,54 @@ defmodule Explorer.SeriesTest do
end
end

describe "filter/2" do
  # `Series.filter/2` is a macro, so every test requires the module first.

  test "basic example" do
    require Explorer.Series

    series = Series.from_list([1, 2, 3, 4])
    kept = Series.to_list(Series.filter(series, _ > 2))

    assert kept == [3, 4]
  end

  test "aggregation" do
    require Explorer.Series

    # Only the entry equal to the series' count (4) should survive.
    series = Series.from_list([1, 2, 3, 4])
    kept = Series.to_list(Series.filter(series, _ == count(_)))

    assert kept == [4]
  end

  test "mismatched columns" do
    require Explorer.Series

    series = Series.from_list([1, 2, 3, 4])

    # Any name other than `_` is not a known column inside a series query.
    expected_error = "could not find column name \"n\". The available entries are: [\"_\"]"

    assert_raise ArgumentError, expected_error, fn ->
      Series.filter(series, n > 2)
    end
  end
end

describe "filter_with/2" do
  test "basic example" do
    series = Series.from_list([1, 2, 3, 4])
    above_two = fn s -> Series.greater(s, 2) end

    kept = Series.to_list(Series.filter_with(series, above_two))

    assert kept == [3, 4]
  end

  test "raise an error if the function is not returning a lazy series" do
    series = Series.from_list([1, 2, 3, 4])

    expected_error =
      "expecting the function to return a single or a list of boolean LazySeries, but instead it contains:\ntrue"

    # `Kernel.>/2` yields a plain boolean rather than a lazy series.
    plain_comparison = fn s -> s > 2 end

    assert_raise ArgumentError, expected_error, fn ->
      Series.filter_with(series, plain_comparison)
    end
  end
end

describe "sample/2" do
test "sample taking 10 elements" do
s = 1..100 |> Enum.to_list() |> Series.from_list()
Expand Down
Loading