Skip to content

Commit

Permalink
Support more integer dtypes in Series (#824)
Browse files Browse the repository at this point in the history
* Support signed/unsigned dtypes in aggregation for series

This is part of #794

* WIP: need to fix casting before subtract

* Fix issues after rebase

* Fix min/max, categorise and subtract from Series

This is to make them work with integers.

* Divide will always give a f64

* Make Series.subtract/2 work by mixing int dtypes

* More tests to `Series.divide/2`

* Fix `Series.in/2` to support mixing integer dtypes

* Fix Series.peaks/2 to support more numeric dtypes

* Fix Series.quotient/2 to work with mixed int dtypes

* Make `Series.rank/2` return a u32 series for ordinal ranking

* Document mixing series of different dtypes in select/2

* Ensure that we are "targeting" the right dtype

This slightly changes the algorithm for the "out_dtype" in some
arithmetic operations.

* Fix `Series.remainder/2` to work with more int dtypes

* Fix "DF.dummies/2" to use :u8 columns instead of :s64

* Refactor to use "right" int types in Series' min/max/sum

* Use u32 in Series.argsort/2

* Update lib/explorer/series.ex

Co-authored-by: Billy Lanchantin <[email protected]>

* Simplify calculation of out_dtype in arithmetic ops

* Move cast to rust code in "Series.subtract/2"

The idea is to have fewer series in memory.

* Fix cast_to_divide

* Simplify rule for subtract unsigned integers

Overflow may occur, but it's fine.
  • Loading branch information
philss authored Jan 16, 2024
1 parent b3e3dd5 commit e30207e
Show file tree
Hide file tree
Showing 6 changed files with 483 additions and 112 deletions.
26 changes: 13 additions & 13 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3732,9 +3732,9 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.dummies(df, "col_x")
#Explorer.DataFrame<
Polars[4 x 3]
col_x_a s64 [1, 0, 1, 0]
col_x_b s64 [0, 1, 0, 0]
col_x_c s64 [0, 0, 0, 1]
col_x_a u8 [1, 0, 1, 0]
col_x_b u8 [0, 1, 0, 0]
col_x_c u8 [0, 0, 0, 1]
>
Or multiple columns:
Expand All @@ -3743,12 +3743,12 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.dummies(df, ["col_x", "col_y"])
#Explorer.DataFrame<
Polars[4 x 6]
col_x_a s64 [1, 0, 1, 0]
col_x_b s64 [0, 1, 0, 0]
col_x_c s64 [0, 0, 0, 1]
col_y_b s64 [1, 0, 1, 0]
col_y_a s64 [0, 1, 0, 0]
col_y_d s64 [0, 0, 0, 1]
col_x_a u8 [1, 0, 1, 0]
col_x_b u8 [0, 1, 0, 0]
col_x_c u8 [0, 0, 0, 1]
col_y_b u8 [1, 0, 1, 0]
col_y_a u8 [0, 1, 0, 0]
col_y_d u8 [0, 0, 0, 1]
>
Or all string columns:
Expand All @@ -3757,9 +3757,9 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.dummies(df, fn _name, type -> type == :string end)
#Explorer.DataFrame<
Polars[4 x 3]
col_y_b s64 [1, 0, 1, 0]
col_y_a s64 [0, 1, 0, 0]
col_y_d s64 [0, 0, 0, 1]
col_y_b u8 [1, 0, 1, 0]
col_y_a u8 [0, 1, 0, 0]
col_y_d u8 [0, 0, 0, 1]
>
Ranges, regexes, and functions are also accepted in column names, as in `select/2`.
Expand All @@ -3779,7 +3779,7 @@ defmodule Explorer.DataFrame do
value <- Series.to_list(Series.distinct(df[column])),
do: column <> "_#{value}"

out_dtypes = for new_column <- out_columns, into: %{}, do: {new_column, {:s, 64}}
out_dtypes = for new_column <- out_columns, into: %{}, do: {new_column, {:u, 8}}

out_df = %{df | groups: [], names: out_columns, dtypes: out_dtypes}
Shared.apply_impl(df, :dummies, [out_df, columns])
Expand Down
11 changes: 7 additions & 4 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,11 @@ defmodule Explorer.PolarsBackend.Series do
def mode(series), do: Shared.apply_series(series, :s_mode)

@impl true
def variance(series, ddof), do: Shared.apply_series(series, :s_variance, [ddof])
def variance(series, ddof), do: series |> Shared.apply_series(:s_variance, [ddof]) |> at(0)

@impl true
def standard_deviation(series, ddof),
do: Shared.apply_series(series, :s_standard_deviation, [ddof])
do: series |> Shared.apply_series(:s_standard_deviation, [ddof]) |> at(0)

@impl true
def quantile(series, quantile),
Expand Down Expand Up @@ -271,8 +271,11 @@ defmodule Explorer.PolarsBackend.Series do
do: Shared.apply_series(matching_size!(left, right), :s_add, [right.data])

@impl true
def subtract(_out_dtype, left, right),
do: Shared.apply_series(matching_size!(left, right), :s_subtract, [right.data])
def subtract(_out_dtype, left, right) do
left = matching_size!(left, right)

Shared.apply_series(left, :s_subtract, [right.data])
end

@impl true
def multiply(out_dtype, left, right) do
Expand Down
91 changes: 50 additions & 41 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1234,11 +1234,11 @@ defmodule Explorer.Series do
"""
@doc type: :element_wise
def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: dtype} = categories)
when K.and(K.in(l_dtype, [{:s, 64}, :string]), K.in(dtype, [:string, :category])),
when K.and(K.in(l_dtype, [:string | @integer_types]), K.in(dtype, [:string, :category])),
do: apply_series(series, :categorise, [categories])

def categorise(%Series{dtype: l_dtype} = series, [head | _] = categories)
when K.and(K.in(l_dtype, [{:s, 64}, :string]), is_binary(head)),
when K.and(K.in(l_dtype, [:string | @integer_types]), is_binary(head)),
do: apply_series(series, :categorise, [from_list(categories, dtype: :string)])

# Slice and dice
Expand Down Expand Up @@ -1337,6 +1337,10 @@ defmodule Explorer.Series do
`predicate` must be a boolean series. `on_true` and `on_false` must be
a series of the same size as `predicate` or a series of size 1.
It is possible to mix numeric series in the `on_true` and `on_false`,
and the resultant series will have the dtype of the greater side.
For example, `:u8` and `:s16` is going to result in `:s16` series.
"""
@doc type: :element_wise
@spec select(
Expand Down Expand Up @@ -1840,7 +1844,7 @@ defmodule Explorer.Series do
iex> Explorer.Series.rank(s, method: :ordinal)
#Explorer.Series<
Polars[3]
s64 [1, 2, 3]
u32 [1, 2, 3]
>
iex> s = Explorer.Series.from_list([ ~N[2022-07-07 17:44:13.020548], ~N[2022-07-07 17:43:08.473561], ~N[2022-07-07 17:45:00.116337] ])
Expand Down Expand Up @@ -3095,6 +3099,7 @@ defmodule Explorer.Series do
|> enforce_highest_precision()
end

# TODO: maybe we can move this casting to Rust.
defp enforce_highest_precision([
%Series{dtype: {left_base, left_timeunit}} = left,
%Series{dtype: {right_base, right_timeunit}} = right
Expand Down Expand Up @@ -3167,22 +3172,29 @@ defmodule Explorer.Series do
end
end

# TODO: fix the logic for integer dtypes
defp cast_to_add({:s, left}, {:s, right}), do: {:s, max(left, right)}
defp cast_to_add({:s, _}, {:f, _} = float), do: float
defp cast_to_add({:f, _} = float, {:s, _}), do: float
defp cast_to_add({:f, _}, {:f, _}), do: {:f, 64}
defp cast_to_add(:date, {:duration, _}), do: :date
defp cast_to_add({:duration, _}, :date), do: :date
defp cast_to_add({:datetime, p}, {:duration, p}), do: {:datetime, p}
defp cast_to_add({:duration, p}, {:datetime, p}), do: {:datetime, p}
defp cast_to_add({:duration, p}, {:duration, p}), do: {:duration, p}
defp cast_to_add(_, _), do: nil
defp cast_to_add(left, right), do: cast_numeric(left, right)

# Resolves the output dtype when two numeric dtypes are mixed in an arithmetic
# operation. Returns `nil` when the pair is not a supported numeric combination.
# The result does not depend on the order of the arguments.

# Same signedness: keep the signedness and use the wider of the two sizes.
defp cast_numeric({int_type, left}, {int_type, right}) when K.in(int_type, [:s, :u]),
  do: {int_type, max(left, right)}

# Signed mixed with unsigned: the result is signed. An unsigned integer of N bits
# needs a signed integer of 2N bits (capped at 64) to hold all of its values, and
# the result must also be at least as wide as the signed operand.
defp cast_numeric({:s, s_size}, {:u, u_size}), do: {:s, max(min(64, u_size * 2), s_size)}
defp cast_numeric({:u, u_size}, {:s, s_size}), do: {:s, max(min(64, u_size * 2), s_size)}

# Integer mixed with float: the float dtype is used as-is.
defp cast_numeric({int_type, _}, {:f, _} = float) when K.in(int_type, [:s, :u]), do: float
defp cast_numeric({:f, _} = float, {int_type, _}) when K.in(int_type, [:s, :u]), do: float

# Two floats: use the wider of the two sizes.
defp cast_numeric({:f, left}, {:f, right}), do: {:f, max(left, right)}

# Anything else is not a numeric combination this function handles.
defp cast_numeric(_, _), do: nil

@doc """
Subtracts right from left, element-wise.
When mixing floats and integers, the resulting series will have dtype `{:f, 64}`.
In case both series are of unsigned integers, we will try to subtract,
but an exception is raised if overflow occurs.
At least one of the arguments must be a series. If both
sides are series, the series must have the same size or
Expand Down Expand Up @@ -3234,18 +3246,12 @@ defmodule Explorer.Series do
end
end

# TODO: fix the logic for new integer dtypes
defp cast_to_subtract({:s, left}, {:s, right}), do: {:s, max(left, right)}
defp cast_to_subtract({:s, _}, {:f, _} = float), do: float
defp cast_to_subtract({:f, _} = float, {:s, _}), do: float
defp cast_to_subtract({:f, _}, {:f, _}), do: {:f, 64}

defp cast_to_subtract(:date, :date), do: {:duration, :millisecond}
defp cast_to_subtract(:date, {:duration, _}), do: :date
defp cast_to_subtract({:datetime, p}, {:datetime, p}), do: {:duration, p}
defp cast_to_subtract({:datetime, p}, {:duration, p}), do: {:datetime, p}
defp cast_to_subtract({:duration, p}, {:duration, p}), do: {:duration, p}
defp cast_to_subtract(_, _), do: nil
defp cast_to_subtract(left, right), do: cast_numeric(left, right)

@doc """
Multiplies left and right, element-wise.
Expand Down Expand Up @@ -3293,16 +3299,11 @@ defmodule Explorer.Series do
end
end

# TODO: fix the logic for new dtypes
defp cast_to_multiply({:s, left}, {:s, right}), do: {:s, max(left, right)}
defp cast_to_multiply({:s, _}, {:f, _} = float), do: float
defp cast_to_multiply({:f, _} = float, {:s, _}), do: float
defp cast_to_multiply({:f, _}, {:f, _}), do: {:f, 64}
defp cast_to_multiply({:s, _}, {:duration, p}), do: {:duration, p}
defp cast_to_multiply({:duration, p}, {:s, _}), do: {:duration, p}
defp cast_to_multiply({:f, _}, {:duration, p}), do: {:duration, p}
defp cast_to_multiply({:duration, p}, {:f, _}), do: {:duration, p}
defp cast_to_multiply(_, _), do: nil
defp cast_to_multiply(left, right), do: cast_numeric(left, right)

@doc """
Divides left by right, element-wise.
Expand Down Expand Up @@ -3368,11 +3369,13 @@ defmodule Explorer.Series do
end
end

# Fix the logic for new integer dtypes
defp cast_to_divide({:s, _}, {:s, _}), do: {:f, 64}
defp cast_to_divide({:s, _}, {:f, _} = float), do: float
defp cast_to_divide({:f, _} = float, {:s, _}), do: float
defp cast_to_divide({:f, _}, {:f, _}), do: {:f, 64}
# Review the size needed for this operation.
defp cast_to_divide({int_type, _}, {int_type, _}) when K.in(int_type, [:s, :u]), do: {:f, 64}
defp cast_to_divide({:s, _}, {:u, _}), do: {:f, 64}
defp cast_to_divide({:u, _}, {:s, _}), do: {:f, 64}
defp cast_to_divide({int_type, _}, {:f, _} = float) when K.in(int_type, [:s, :u]), do: float
defp cast_to_divide({:f, _} = float, {int_type, _}) when K.in(int_type, [:s, :u]), do: float
defp cast_to_divide({:f, left}, {:f, right}), do: {:f, max(left, right)}
defp cast_to_divide({:duration, p}, {:s, _}), do: {:duration, p}
defp cast_to_divide({:duration, p}, {:f, _}), do: {:duration, p}
defp cast_to_divide(_, _), do: nil
Expand Down Expand Up @@ -3534,14 +3537,17 @@ defmodule Explorer.Series do
"""
@doc type: :element_wise
@spec quotient(left :: Series.t(), right :: Series.t() | integer()) :: Series.t()
def quotient(%Series{dtype: {:s, 64}} = left, %Series{dtype: {:s, 64}} = right),
do: apply_series_list(:quotient, [left, right])
def quotient(%Series{dtype: l_dtype} = left, %Series{dtype: r_dtype} = right)
when K.and(K.in(l_dtype, @integer_types), K.in(r_dtype, @integer_types)),
do: apply_series_list(:quotient, [left, right])

def quotient(%Series{dtype: {:s, 64}} = left, right) when is_integer(right),
do: apply_series_list(:quotient, [left, from_list([right])])
def quotient(%Series{dtype: l_dtype} = left, right)
when K.and(K.in(l_dtype, @integer_types), is_integer(right)),
do: apply_series_list(:quotient, [left, from_list([right])])

def quotient(left, %Series{dtype: {:s, 64}} = right) when is_integer(left),
do: apply_series_list(:quotient, [from_list([left]), right])
def quotient(left, %Series{dtype: r_dtype} = right)
when K.and(K.in(r_dtype, @integer_types), is_integer(left)),
do: apply_series_list(:quotient, [from_list([left]), right])

@doc """
Computes the remainder of an element-wise integer division.
Expand Down Expand Up @@ -3584,14 +3590,17 @@ defmodule Explorer.Series do
"""
@doc type: :element_wise
@spec remainder(left :: Series.t(), right :: Series.t() | integer()) :: Series.t()
def remainder(%Series{dtype: {:s, 64}} = left, %Series{dtype: {:s, 64}} = right),
do: apply_series_list(:remainder, [left, right])
def remainder(%Series{dtype: l_dtype} = left, %Series{dtype: r_dtype} = right)
when K.and(K.in(l_dtype, @integer_types), K.in(r_dtype, @integer_types)),
do: apply_series_list(:remainder, [left, right])

def remainder(%Series{dtype: {:s, 64}} = left, right) when is_integer(right),
do: apply_series_list(:remainder, [left, from_list([right])])
def remainder(%Series{dtype: l_dtype} = left, right)
when K.and(K.in(l_dtype, @integer_types), is_integer(right)),
do: apply_series_list(:remainder, [left, from_list([right])])

def remainder(left, %Series{dtype: {:s, 64}} = right) when is_integer(left),
do: apply_series_list(:remainder, [from_list([left]), right])
def remainder(left, %Series{dtype: r_dtype} = right)
when K.and(K.in(r_dtype, @integer_types), is_integer(left)),
do: apply_series_list(:remainder, [from_list([left]), right])

@doc """
Computes the sine of a number (in radians).
Expand Down Expand Up @@ -4401,14 +4410,14 @@ defmodule Explorer.Series do
iex> Explorer.Series.argsort(s)
#Explorer.Series<
Polars[4]
s64 [3, 1, 2, 0]
u32 [3, 1, 2, 0]
>
iex> s = Explorer.Series.from_list([9, 3, 7, 1])
iex> Explorer.Series.argsort(s, direction: :desc)
#Explorer.Series<
Polars[4]
s64 [0, 2, 1, 3]
u32 [0, 2, 1, 3]
>
"""
Expand Down
7 changes: 2 additions & 5 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -477,11 +477,8 @@ pub fn df_to_dummies(df: ExDataFrame, selection: Vec<&str>) -> Result<ExDataFram
let dummies = df
.select(selection)
.and_then(|df| df.to_dummies(None, drop_first))?;
let series = dummies
.iter()
.map(|series| series.cast(&DataType::Int64).unwrap())
.collect();
Ok(ExDataFrame::new(DataFrame::new(series)?))

Ok(ExDataFrame::new(dummies))
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand Down
Loading

0 comments on commit e30207e

Please sign in to comment.