Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bump Polars 0.37 #861

Merged
merged 4 commits into from
Feb 25, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion datasets/iris.csv
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,3 @@ sepal_length,sepal_width,petal_length,petal_width,species
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The trailing empty line causes an extra row to be added to the data frame, and the related tests are failing.

7 changes: 6 additions & 1 deletion lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,12 @@ defmodule Explorer.Backend.LazySeries do

@impl true
def format(list) do
series_list = Enum.map(list, &series_or_lazy_series!/1)
series_list =
Enum.map(list, fn
s when is_binary(s) -> s
s -> series_or_lazy_series!(s)
end)

data = new(:format, [series_list], :string, aggregations?(series_list))

Backend.Series.new(data, :string)
Expand Down
1 change: 0 additions & 1 deletion lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,6 @@ defmodule Explorer.PolarsBackend.Native do
def s_fill_missing_with_atom(_s, _value), do: err()
def s_fill_missing_with_date(_s, _value), do: err()
def s_fill_missing_with_datetime(_s, _value), do: err()
def s_format(_series_list), do: err()
def s_greater(_s, _rhs), do: err()
def s_greater_equal(_s, _rhs), do: err()
def s_head(_s, _length), do: err()
Expand Down
22 changes: 19 additions & 3 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,26 @@ defmodule Explorer.PolarsBackend.Series do

@impl true
def format(list) do
polars_series = for s <- list, do: s.data
{_, df_args, params} =
Enum.reduce(list, {0, [], []}, fn s, {counter, df_args, params} ->
if is_binary(s) or Kernel.is_nil(s) do
{counter, df_args, [s | params]}
else
counter = counter + 1
name = "#{counter}"
column = Explorer.Backend.LazySeries.new(:column, [name], :string)
{counter, [{name, s} | df_args], [column | params]}
end
end)

Shared.apply(:s_format, [polars_series])
|> Shared.create_series()
df = Explorer.PolarsBackend.DataFrame.from_series(df_args)
format_expr = Explorer.Backend.LazySeries.new(:format, [Enum.reverse(params)], :string)
out_dtypes = Map.put(df.dtypes, "result", :string)
out_names = df.names ++ ["result"]
lkarthee marked this conversation as resolved.
Show resolved Hide resolved
out_df = %{df | dtypes: out_dtypes, names: out_names}

Explorer.PolarsBackend.DataFrame.mutate_with(df, out_df, [{"result", format_expr}])
|> Explorer.PolarsBackend.DataFrame.pull("result")
end

@impl true
Expand Down
41 changes: 34 additions & 7 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1274,13 +1274,33 @@ defmodule Explorer.Series do

"""
@doc type: :element_wise
def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: dtype} = categories)
when K.and(K.in(l_dtype, [:string | @integer_types]), K.in(dtype, [:string, :category])),
def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: :category} = categories)
when K.in(l_dtype, [:string | @integer_types]),
do: apply_series(series, :categorise, [categories])

def categorise(%Series{dtype: l_dtype} = series, %Series{dtype: :string} = categories)
when K.in(l_dtype, [:string | @integer_types]) do
if nil_count(categories) != 0,
do:
raise(
ArgumentError,
"categories as strings cannot have nil values"
)

if count(categories) != n_distinct(categories),
do:
raise(
ArgumentError,
"categories as strings cannot have duplicated values"
)
lkarthee marked this conversation as resolved.
Show resolved Hide resolved

categories = cast(categories, :category)
apply_series(series, :categorise, [categories])
end

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved the categorise handling for string series and lists of strings here.

  • instead of throwing errors, it applies distinct to categories series
  • the nil check is missing - should we add a nil_count check?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of throwing errors, it applies distinct to categories series

I don't think we should do this because we are mapping indexes into the list. If you remove duplicates, the indexes are shifted, and the result changes. Is there a reason we removed the Rust code responsible for this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok.

https://pola.rs/posts/polars-string-type/

This caused errors in Series.categorise and encoding of strings/binary.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hit a wall getting RevMapping to work in Series.categorise, so I tried to see whether it can be fixed in Elixir.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can do it in Elixir, but we need to get the unique count and raise if it differs from the size, and get the nil count and raise if it is not zero.

def categorise(%Series{dtype: l_dtype} = series, [head | _] = categories)
when K.and(K.in(l_dtype, [:string | @integer_types]), is_binary(head)),
do: apply_series(series, :categorise, [from_list(categories, dtype: :string)])
do: categorise(series, from_list(categories, dtype: :string))

# Slice and dice

Expand Down Expand Up @@ -2086,13 +2106,20 @@ defmodule Explorer.Series do
iex> s1 = Explorer.Series.from_list([<<1>>, <<239, 191, 19>>], dtype: :binary)
iex> s2 = Explorer.Series.from_list([<<3>>, <<4>>], dtype: :binary)
iex> Explorer.Series.format([s1, s2])
** (RuntimeError) Polars Error: invalid utf-8 sequence
** (RuntimeError) Polars Error: invalid utf8
"""
@doc type: :shape
@spec format([Series.t() | String.t()]) :: Series.t()
def format([_ | _] = list) do
list = cast_to_string(list)
impl!(list).format(list)

if impl = impl!(list) do
impl.format(list)
else
[hd | rest] = list
s = Series.from_list([hd], dtype: :string)
impl!([s]).format([s | rest])
end
end

defp cast_to_string(list) do
Expand All @@ -2103,8 +2130,8 @@ defmodule Explorer.Series do
%Series{} = s ->
cast(s, :string)

value when is_binary(value) ->
from_list([value], dtype: :string)
value when K.or(is_binary(value), K.is_nil(value)) ->
value

other ->
raise ArgumentError,
Expand Down
Loading
Loading