Support more integer dtypes in Series (#824)

* Support signed/unsigned dtypes in aggregation for series This is part of #794 * WIP: need to fix casting before subtract * Fix issues after rebase * Fix min/max, categorise and subtract from Series This is to make them work with integers. * Divide will always give an f64 * Make Series.subtract/2 work by mixing int dtypes * More tests to `Series.divide/2` * Fix `Series.in/2` to support mixing integer dtypes * Fix Series.peaks/2 to support more numeric dtypes * Fix Series.quotient/2 to work with mixed int dtypes * Make `Series.rank/2` return a u32 series for ordinal ranking * Document about mixing series of different dtypes on select/2 * Ensure that we are "targeting" the right dtype This changes a little bit the algorithm for the "out_dtype" in some arithmetic operations. * Fix `Series.remainder/2` to work with more int dtypes * Fix "DF.dummies/2" to use :u8 columns instead of :s64 * Refactor to use "right" int types in Series' min/max/sum * Use u32 in Series.argsort/2 * Update lib/explorer/series.ex Co-authored-by: Billy Lanchantin <[email protected]> * Simplify calculation of out_dtype in arithmetic ops * Move cast to rust code in "Series.subtract/2" The idea is to have fewer series in memory. * Fix cast_to_divide * Simplify rule for subtract unsigned integers Overflow may occur, but it's fine.
elixir-explorer · Jan 16, 2024 · e30207e · e30207e
1 parent b3e3dd5
commit e30207e
Show file tree

Hide file tree

Showing 6 changed files with 483 additions and 112 deletions.
diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
@@ -3732,9 +3732,9 @@ defmodule Explorer.DataFrame do
       iex> Explorer.DataFrame.dummies(df, "col_x")
       #Explorer.DataFrame<
         Polars[4 x 3]
-        col_x_a s64 [1, 0, 1, 0]
-        col_x_b s64 [0, 1, 0, 0]
-        col_x_c s64 [0, 0, 0, 1]
+        col_x_a u8 [1, 0, 1, 0]
+        col_x_b u8 [0, 1, 0, 0]
+        col_x_c u8 [0, 0, 0, 1]
       >
 
   Or multiple columns:
@@ -3743,12 +3743,12 @@ defmodule Explorer.DataFrame do
       iex> Explorer.DataFrame.dummies(df, ["col_x", "col_y"])
       #Explorer.DataFrame<
         Polars[4 x 6]
-        col_x_a s64 [1, 0, 1, 0]
-        col_x_b s64 [0, 1, 0, 0]
-        col_x_c s64 [0, 0, 0, 1]
-        col_y_b s64 [1, 0, 1, 0]
-        col_y_a s64 [0, 1, 0, 0]
-        col_y_d s64 [0, 0, 0, 1]
+        col_x_a u8 [1, 0, 1, 0]
+        col_x_b u8 [0, 1, 0, 0]
+        col_x_c u8 [0, 0, 0, 1]
+        col_y_b u8 [1, 0, 1, 0]
+        col_y_a u8 [0, 1, 0, 0]
+        col_y_d u8 [0, 0, 0, 1]
       >
 
   Or all string columns:
@@ -3757,9 +3757,9 @@ defmodule Explorer.DataFrame do
       iex> Explorer.DataFrame.dummies(df, fn _name, type -> type == :string end)
       #Explorer.DataFrame<
         Polars[4 x 3]
-        col_y_b s64 [1, 0, 1, 0]
-        col_y_a s64 [0, 1, 0, 0]
-        col_y_d s64 [0, 0, 0, 1]
+        col_y_b u8 [1, 0, 1, 0]
+        col_y_a u8 [0, 1, 0, 0]
+        col_y_d u8 [0, 0, 0, 1]
       >
 
   Ranges, regexes, and functions are also accepted in column names, as in `select/2`.
@@ -3779,7 +3779,7 @@ defmodule Explorer.DataFrame do
           value <- Series.to_list(Series.distinct(df[column])),
           do: column <> "_#{value}"
 
-    out_dtypes = for new_column <- out_columns, into: %{}, do: {new_column, {:s, 64}}
+    out_dtypes = for new_column <- out_columns, into: %{}, do: {new_column, {:u, 8}}
 
     out_df = %{df | groups: [], names: out_columns, dtypes: out_dtypes}
     Shared.apply_impl(df, :dummies, [out_df, columns])

diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex
@@ -208,11 +208,11 @@ defmodule Explorer.PolarsBackend.Series do
   def mode(series), do: Shared.apply_series(series, :s_mode)
 
   @impl true
-  def variance(series, ddof), do: Shared.apply_series(series, :s_variance, [ddof])
+  def variance(series, ddof), do: series |> Shared.apply_series(:s_variance, [ddof]) |> at(0)
 
   @impl true
   def standard_deviation(series, ddof),
-    do: Shared.apply_series(series, :s_standard_deviation, [ddof])
+    do: series |> Shared.apply_series(:s_standard_deviation, [ddof]) |> at(0)
 
   @impl true
   def quantile(series, quantile),
@@ -271,8 +271,11 @@ defmodule Explorer.PolarsBackend.Series do
     do: Shared.apply_series(matching_size!(left, right), :s_add, [right.data])
 
   @impl true
-  def subtract(_out_dtype, left, right),
-    do: Shared.apply_series(matching_size!(left, right), :s_subtract, [right.data])
+  def subtract(_out_dtype, left, right) do
+    left = matching_size!(left, right)
+
+    Shared.apply_series(left, :s_subtract, [right.data])
+  end
 
   @impl true
   def multiply(out_dtype, left, right) do

diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex
@@ -1234,11 +1234,11 @@ defmodule Explorer.Series do
   """
   @doc type: :element_wise
   def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: dtype} = categories)
-      when K.and(K.in(l_dtype, [{:s, 64}, :string]), K.in(dtype, [:string, :category])),
+      when K.and(K.in(l_dtype, [:string | @integer_types]), K.in(dtype, [:string, :category])),
       do: apply_series(series, :categorise, [categories])
 
   def categorise(%Series{dtype: l_dtype} = series, [head | _] = categories)
-      when K.and(K.in(l_dtype, [{:s, 64}, :string]), is_binary(head)),
+      when K.and(K.in(l_dtype, [:string | @integer_types]), is_binary(head)),
       do: apply_series(series, :categorise, [from_list(categories, dtype: :string)])
 
   # Slice and dice
@@ -1337,6 +1337,10 @@ defmodule Explorer.Series do
 
   `predicate` must be a boolean series. `on_true` and `on_false` must be
   a series of the same size as `predicate` or a series of size 1.
+
+  It is possible to mix numeric series in `on_true` and `on_false`;
+  the resulting series takes the wider of the two dtypes.
+  For example, mixing `:u8` and `:s16` results in a `:s16` series.
   """
   @doc type: :element_wise
   @spec select(
@@ -1840,7 +1844,7 @@ defmodule Explorer.Series do
       iex> Explorer.Series.rank(s, method: :ordinal)
       #Explorer.Series<
         Polars[3]
-        s64 [1, 2, 3]
+        u32 [1, 2, 3]
       >
 
       iex> s = Explorer.Series.from_list([ ~N[2022-07-07 17:44:13.020548], ~N[2022-07-07 17:43:08.473561], ~N[2022-07-07 17:45:00.116337] ])
@@ -3095,6 +3099,7 @@ defmodule Explorer.Series do
     |> enforce_highest_precision()
   end
 
+  # TODO: maybe we can move this casting to Rust.
   defp enforce_highest_precision([
          %Series{dtype: {left_base, left_timeunit}} = left,
          %Series{dtype: {right_base, right_timeunit}} = right
@@ -3167,22 +3172,29 @@ defmodule Explorer.Series do
     end
   end
 
-  # TODO: fix the logic for integer dtypes
-  defp cast_to_add({:s, left}, {:s, right}), do: {:s, max(left, right)}
-  defp cast_to_add({:s, _}, {:f, _} = float), do: float
-  defp cast_to_add({:f, _} = float, {:s, _}), do: float
-  defp cast_to_add({:f, _}, {:f, _}), do: {:f, 64}
   defp cast_to_add(:date, {:duration, _}), do: :date
   defp cast_to_add({:duration, _}, :date), do: :date
   defp cast_to_add({:datetime, p}, {:duration, p}), do: {:datetime, p}
   defp cast_to_add({:duration, p}, {:datetime, p}), do: {:datetime, p}
   defp cast_to_add({:duration, p}, {:duration, p}), do: {:duration, p}
-  defp cast_to_add(_, _), do: nil
+  defp cast_to_add(left, right), do: cast_numeric(left, right)
+
+  # Picks the output dtype for an operation that mixes two numeric dtypes.
+  #
+  #   * Same signedness: the wider of the two integer sizes.
+  #   * Signed mixed with unsigned (in either order): a signed dtype wide
+  #     enough for the unsigned side (twice its size, capped at 64 bits) and
+  #     never narrower than the signed side.
+  #   * Integer mixed with float: the float dtype, unchanged.
+  #   * Two floats: the wider float.
+  #   * Anything else: `nil`.
+  defp cast_numeric({int_type, left}, {int_type, right}) when K.in(int_type, [:s, :u]),
+    do: {int_type, max(left, right)}
 
+  defp cast_numeric({:s, s_size}, {:u, u_size}), do: {:s, max(min(64, u_size * 2), s_size)}
+  # Mirror of the clause above: `u_size` must bind the unsigned operand so the
+  # result does not depend on argument order.
+  defp cast_numeric({:u, u_size}, {:s, s_size}), do: {:s, max(min(64, u_size * 2), s_size)}
+  defp cast_numeric({int_type, _}, {:f, _} = float) when K.in(int_type, [:s, :u]), do: float
+  defp cast_numeric({:f, _} = float, {int_type, _}) when K.in(int_type, [:s, :u]), do: float
+  defp cast_numeric({:f, left}, {:f, right}), do: {:f, max(left, right)}
+  defp cast_numeric(_, _), do: nil
 
   @doc """
   Subtracts right from left, element-wise.
 
   When mixing floats and integers, the resulting series will have dtype `{:f, 64}`.
+  If both series hold unsigned integers, the subtraction is attempted as-is,
+  and an exception is raised if it overflows.
 
   At least one of the arguments must be a series. If both
   sizes are series, the series must have the same size or
@@ -3234,18 +3246,12 @@ defmodule Explorer.Series do
     end
   end
 
-  # TODO: fix the logic for new integer dtypes
-  defp cast_to_subtract({:s, left}, {:s, right}), do: {:s, max(left, right)}
-  defp cast_to_subtract({:s, _}, {:f, _} = float), do: float
-  defp cast_to_subtract({:f, _} = float, {:s, _}), do: float
-  defp cast_to_subtract({:f, _}, {:f, _}), do: {:f, 64}
-
   defp cast_to_subtract(:date, :date), do: {:duration, :millisecond}
   defp cast_to_subtract(:date, {:duration, _}), do: :date
   defp cast_to_subtract({:datetime, p}, {:datetime, p}), do: {:duration, p}
   defp cast_to_subtract({:datetime, p}, {:duration, p}), do: {:datetime, p}
   defp cast_to_subtract({:duration, p}, {:duration, p}), do: {:duration, p}
-  defp cast_to_subtract(_, _), do: nil
+  defp cast_to_subtract(left, right), do: cast_numeric(left, right)
 
   @doc """
   Multiplies left and right, element-wise.
@@ -3293,16 +3299,11 @@ defmodule Explorer.Series do
     end
   end
 
-  # TODO: fix the logic for new dtypes
-  defp cast_to_multiply({:s, left}, {:s, right}), do: {:s, max(left, right)}
-  defp cast_to_multiply({:s, _}, {:f, _} = float), do: float
-  defp cast_to_multiply({:f, _} = float, {:s, _}), do: float
-  defp cast_to_multiply({:f, _}, {:f, _}), do: {:f, 64}
   defp cast_to_multiply({:s, _}, {:duration, p}), do: {:duration, p}
   defp cast_to_multiply({:duration, p}, {:s, _}), do: {:duration, p}
   defp cast_to_multiply({:f, _}, {:duration, p}), do: {:duration, p}
   defp cast_to_multiply({:duration, p}, {:f, _}), do: {:duration, p}
-  defp cast_to_multiply(_, _), do: nil
+  defp cast_to_multiply(left, right), do: cast_numeric(left, right)
 
   @doc """
   Divides left by right, element-wise.
@@ -3368,11 +3369,13 @@ defmodule Explorer.Series do
     end
   end
 
-  # Fix the logic for new integer dtypes
-  defp cast_to_divide({:s, _}, {:s, _}), do: {:f, 64}
-  defp cast_to_divide({:s, _}, {:f, _} = float), do: float
-  defp cast_to_divide({:f, _} = float, {:s, _}), do: float
-  defp cast_to_divide({:f, _}, {:f, _}), do: {:f, 64}
+  # Review the size needed for this operation.
+  defp cast_to_divide({int_type, _}, {int_type, _}) when K.in(int_type, [:s, :u]), do: {:f, 64}
+  defp cast_to_divide({:s, _}, {:u, _}), do: {:f, 64}
+  defp cast_to_divide({:u, _}, {:s, _}), do: {:f, 64}
+  defp cast_to_divide({int_type, _}, {:f, _} = float) when K.in(int_type, [:s, :u]), do: float
+  defp cast_to_divide({:f, _} = float, {int_type, _}) when K.in(int_type, [:s, :u]), do: float
+  defp cast_to_divide({:f, left}, {:f, right}), do: {:f, max(left, right)}
   defp cast_to_divide({:duration, p}, {:s, _}), do: {:duration, p}
   defp cast_to_divide({:duration, p}, {:f, _}), do: {:duration, p}
   defp cast_to_divide(_, _), do: nil
@@ -3534,14 +3537,17 @@ defmodule Explorer.Series do
   """
   @doc type: :element_wise
   @spec quotient(left :: Series.t(), right :: Series.t() | integer()) :: Series.t()
-  def quotient(%Series{dtype: {:s, 64}} = left, %Series{dtype: {:s, 64}} = right),
-    do: apply_series_list(:quotient, [left, right])
+  def quotient(%Series{dtype: l_dtype} = left, %Series{dtype: r_dtype} = right)
+      when K.and(K.in(l_dtype, @integer_types), K.in(r_dtype, @integer_types)),
+      do: apply_series_list(:quotient, [left, right])
 
-  def quotient(%Series{dtype: {:s, 64}} = left, right) when is_integer(right),
-    do: apply_series_list(:quotient, [left, from_list([right])])
+  def quotient(%Series{dtype: l_dtype} = left, right)
+      when K.and(K.in(l_dtype, @integer_types), is_integer(right)),
+      do: apply_series_list(:quotient, [left, from_list([right])])
 
-  def quotient(left, %Series{dtype: {:s, 64}} = right) when is_integer(left),
-    do: apply_series_list(:quotient, [from_list([left]), right])
+  def quotient(left, %Series{dtype: r_dtype} = right)
+      when K.and(K.in(r_dtype, @integer_types), is_integer(left)),
+      do: apply_series_list(:quotient, [from_list([left]), right])
 
   @doc """
   Computes the remainder of an element-wise integer division.
@@ -3584,14 +3590,17 @@ defmodule Explorer.Series do
   """
   @doc type: :element_wise
   @spec remainder(left :: Series.t(), right :: Series.t() | integer()) :: Series.t()
-  def remainder(%Series{dtype: {:s, 64}} = left, %Series{dtype: {:s, 64}} = right),
-    do: apply_series_list(:remainder, [left, right])
+  def remainder(%Series{dtype: l_dtype} = left, %Series{dtype: r_dtype} = right)
+      when K.and(K.in(l_dtype, @integer_types), K.in(r_dtype, @integer_types)),
+      do: apply_series_list(:remainder, [left, right])
 
-  def remainder(%Series{dtype: {:s, 64}} = left, right) when is_integer(right),
-    do: apply_series_list(:remainder, [left, from_list([right])])
+  def remainder(%Series{dtype: l_dtype} = left, right)
+      when K.and(K.in(l_dtype, @integer_types), is_integer(right)),
+      do: apply_series_list(:remainder, [left, from_list([right])])
 
-  def remainder(left, %Series{dtype: {:s, 64}} = right) when is_integer(left),
-    do: apply_series_list(:remainder, [from_list([left]), right])
+  def remainder(left, %Series{dtype: r_dtype} = right)
+      when K.and(K.in(r_dtype, @integer_types), is_integer(left)),
+      do: apply_series_list(:remainder, [from_list([left]), right])
 
   @doc """
   Computes the sine of a number (in radians).
@@ -4401,14 +4410,14 @@ defmodule Explorer.Series do
       iex> Explorer.Series.argsort(s)
       #Explorer.Series<
         Polars[4]
-        s64 [3, 1, 2, 0]
+        u32 [3, 1, 2, 0]
       >
 
       iex> s = Explorer.Series.from_list([9, 3, 7, 1])
       iex> Explorer.Series.argsort(s, direction: :desc)
       #Explorer.Series<
         Polars[4]
-        s64 [0, 2, 1, 3]
+        u32 [0, 2, 1, 3]
       >
 
   """

diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs
@@ -477,11 +477,8 @@ pub fn df_to_dummies(df: ExDataFrame, selection: Vec<&str>) -> Result<ExDataFram
     let dummies = df
         .select(selection)
         .and_then(|df| df.to_dummies(None, drop_first))?;
-    let series = dummies
-        .iter()
-        .map(|series| series.cast(&DataType::Int64).unwrap())
-        .collect();
-    Ok(ExDataFrame::new(DataFrame::new(series)?))
+
+    Ok(ExDataFrame::new(dummies))
 }
 
 #[rustler::nif(schedule = "DirtyCpu")]