Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic support for categorical dtype #464

Merged
merged 7 commits into from
Jan 9, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,17 @@ defmodule Explorer.Backend.Series do
The behaviour for series backends.
"""

@valid_dtypes [:integer, :float, :boolean, :string, :date, :datetime, :list, :binary]
@valid_dtypes [
:integer,
:float,
:boolean,
:string,
:date,
:datetime,
:list,
:binary,
:categorical
]

@type t :: struct()

Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_from_list_u32(_name, _val), do: err()
def s_from_list_str(_name, _val), do: err()
def s_from_list_binary(_name, _val), do: err()
def s_from_list_categories(_name, _val), do: err()
def s_from_binary_f64(_name, _val), do: err()
def s_from_binary_i32(_name, _val), do: err()
def s_from_binary_i64(_name, _val), do: err()
Expand Down
3 changes: 3 additions & 0 deletions lib/explorer/polars_backend/shared.ex
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ defmodule Explorer.PolarsBackend.Shared do
:float -> Native.s_from_list_f64(name, list)
:boolean -> Native.s_from_list_bool(name, list)
:string -> Native.s_from_list_str(name, list)
:categorical -> Native.s_from_list_categories(name, list)
:date -> Native.s_from_list_date(name, list)
:datetime -> Native.s_from_list_datetime(name, list)
:binary -> Native.s_from_list_binary(name, list)
Expand All @@ -130,6 +131,7 @@ defmodule Explorer.PolarsBackend.Shared do
def normalise_dtype("f64"), do: :float
def normalise_dtype("bool"), do: :boolean
def normalise_dtype("str"), do: :string
def normalise_dtype("cat"), do: :categorical
def normalise_dtype("binary"), do: :binary
def normalise_dtype("date"), do: :date
def normalise_dtype("datetime[ms]"), do: :datetime
Expand All @@ -141,6 +143,7 @@ defmodule Explorer.PolarsBackend.Shared do
def internal_from_dtype(:float), do: "f64"
def internal_from_dtype(:boolean), do: "bool"
def internal_from_dtype(:string), do: "str"
def internal_from_dtype(:categorical), do: "cat"
def internal_from_dtype(:date), do: "date"
def internal_from_dtype(:datetime), do: "datetime[μs]"

Expand Down
36 changes: 30 additions & 6 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ defmodule Explorer.Series do

A series can be of the following data types:

* `:float` - 64-bit floating point number
* `:integer` - 64-bit signed integer
* `:boolean` - Boolean
* `:string` - UTF-8 encoded binary
* `:binary` - Binary
* `:boolean` - Boolean
* `:categorical` - UTF-8 encoded binary, but as categories
* `:date` - Date type that unwraps to `Elixir.Date`
* `:datetime` - DateTime type that unwraps to `Elixir.NaiveDateTime`
* `:float` - 64-bit floating point number
* `:integer` - 64-bit signed integer
* `:string` - UTF-8 encoded binary

A series must consist of a single data type only. Series may have `nil` values in them.

Expand Down Expand Up @@ -56,7 +57,16 @@ defmodule Explorer.Series do

@valid_dtypes Explorer.Shared.dtypes()

@type dtype :: :integer | :float | :boolean | :string | :date | :datetime | :binary
@type dtype ::
:binary
| :boolean
| :categorical
| :date
| :datetime
| :float
| :integer
| :string

@type t :: %Series{data: Explorer.Backend.Series.t(), dtype: dtype()}
@type lazy_t :: %Series{data: Explorer.Backend.LazySeries.t(), dtype: dtype()}

Expand Down Expand Up @@ -194,7 +204,15 @@ defmodule Explorer.Series do
binary [<<228, 146, 51>>, "Elixir"]
>

It is possible to create a series of `:datetime` from a list of microseconds since Unix Epoch:
Another option is to create a categorical series from a list of strings:

iex> Explorer.Series.from_list(["EUA", "Brazil", "Poland"], dtype: :categorical)
#Explorer.Series<
Polars[3]
categorical ["EUA", "Brazil", "Poland"]
>

It is possible to create a series of `:datetime` from a list of microseconds since Unix Epoch:

iex> Explorer.Series.from_list([1649883642 * 1_000 * 1_000], dtype: :datetime)
#Explorer.Series<
Expand Down Expand Up @@ -460,6 +478,12 @@ defmodule Explorer.Series do
iex> Explorer.Series.to_iovec(series)
[<<-62135596800000000::signed-64-native, 0::signed-64-native, 529550625987654::signed-64-native>>]

Categorical series are encoded as u32 values, using the internal index of each category:

iex> series = Explorer.Series.from_list(["a", "b", "c", "b"], dtype: :categorical)
iex> Explorer.Series.to_iovec(series)
[<<0::unsigned-32-native, 1::unsigned-32-native, 2::unsigned-32-native, 1::unsigned-32-native>>]

"""
@doc type: :conversion
@spec to_iovec(series :: Series.t()) :: [binary]
Expand Down
6 changes: 4 additions & 2 deletions lib/explorer/shared.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ defmodule Explorer.Shared do
@doc """
All supported dtypes.
"""
def dtypes, do: [:float, :integer, :boolean, :string, :date, :datetime, :binary]
def dtypes do
  # Kept in alphabetical order.
  ~w(binary boolean categorical date datetime float integer string)a
end

@doc """
Gets the backend from a `Keyword.t()` or `nil`.
Expand Down Expand Up @@ -107,7 +107,8 @@ defmodule Explorer.Shared do
without the need to cast it later.
"""
def check_types!(list, preferable_type \\ nil) do
initial_type = if preferable_type in [:binary, :float, :integer], do: preferable_type
initial_type =
if preferable_type in [:binary, :float, :integer, :categorical], do: preferable_type

type =
Enum.reduce(list, initial_type, fn el, type ->
Expand Down Expand Up @@ -138,6 +139,7 @@ defmodule Explorer.Shared do
defp type(item, _type) when is_boolean(item), do: :boolean

defp type(item, :binary) when is_binary(item), do: :binary
defp type(item, :categorical) when is_binary(item), do: :categorical
defp type(item, _type) when is_binary(item), do: :string

defp type(%Date{} = _item, _type), do: :date
Expand Down
1 change: 1 addition & 0 deletions native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ features = [
"dtype-date",
"dtype-datetime",
"dtype-binary",
"dtype-categorical",
"ipc",
"ipc_streaming",
# JSON won't compile on RISCV and ARM 32 bits targets, so we disable in the "release" workflow.
Expand Down
42 changes: 41 additions & 1 deletion native/explorer/src/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use chrono::prelude::*;
use polars::export::arrow::array::GenericBinaryArray;
use polars::prelude::*;
use rustler::{Encoder, Env, OwnedBinary, ResourceArc, Term};
use rustler::{Encoder, Env, NewBinary, OwnedBinary, ResourceArc, Term};
use std::collections::HashMap;
use std::{mem, slice};

use crate::atoms::{
Expand Down Expand Up @@ -249,6 +250,38 @@ where
Ok(unsafe { Term::new(env, list) })
}

/// Converts a categorical series into a list term of category names.
///
/// Null entries are emitted as the `nil` atom. Every other entry is emitted as
/// a binary holding the bytes of the string that `mapping` returns for its
/// category id.
///
/// Returns an error if `s.categorical()` fails.
fn categorical_series_to_list<'b>(
s: &Series,
env: Env<'b>,
mapping: &Arc<RevMapping>,
) -> Result<Term<'b>, ExplorerError> {
let env_as_c_arg = env.as_c_arg();
let nil_as_c_arg = atom::nil().to_term(env).as_c_arg();
// Accumulator for the result, seeded from an empty slice of terms.
let mut list = unsafe { list::make_list(env_as_c_arg, &[]) };

// Cache of the term built for each category id, so a category that shows up
// many times is copied into a new binary only once and then reused.
let mut terms: HashMap<u32, NIF_TERM> = HashMap::new();

let logical = s.categorical()?.logical();

// The ids are walked in reverse. Assuming `make_list_cell` puts the new term
// at the head of `list` (TODO confirm against the rustler/erl_nif docs), the
// finished list ends up in the same order as the series.
for maybe_id in &logical.reverse() {
let term_as_c_arg = match maybe_id {
None => &nil_as_c_arg,
Some(id) => terms.entry(id).or_insert_with(|| {
// First time this id is seen: copy the category string into a fresh binary.
let values = mapping.get(id);
let mut binary = NewBinary::new(env, values.len());
binary.copy_from_slice(values.as_bytes());

let binary_term: Term = binary.into();
binary_term.as_c_arg()
}),
};

list = unsafe { list::make_list_cell(env_as_c_arg, *term_as_c_arg, list) }
}

Ok(unsafe { Term::new(env, list) })
}

// Convert f64 series taking into account NaN and Infinity floats (they are encoded as atoms).
#[inline]
fn float64_series_to_list<'b>(s: &Series, env: Env<'b>) -> Result<Term<'b>, ExplorerError> {
Expand Down Expand Up @@ -334,6 +367,7 @@ pub fn term_from_value<'b>(v: AnyValue, env: Env<'b>) -> Result<Term<'b>, Explor
AnyValue::Float64(v) => Ok(Some(v).encode(env)),
AnyValue::Date(v) => encode_date(v, env),
AnyValue::Datetime(v, time_unit, None) => encode_datetime(v, time_unit, env),
AnyValue::Categorical(idx, mapping) => Ok(mapping.get(idx).encode(env)),
dt => panic!("cannot encode value {dt:?} to term"),
}
}
Expand All @@ -354,6 +388,7 @@ pub fn list_from_series(data: ExSeries, env: Env) -> Result<Term, ExplorerError>
DataType::Binary => {
generic_binary_series_to_list(&data.resource, s.binary()?.downcast_iter(), env)
}
DataType::Categorical(Some(mapping)) => categorical_series_to_list(s, env, mapping),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is tricky. I wonder if we should return the categories (as strings) or their indexes in the mapping. And I think term_from_value should return the string.

iovec_from_series should definitely return indexes though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After further thought: let's keep it as above. This will remain as is, term_from_value will return binaries too, and iovec_from_series will return the indexes. We will need a function to return the mappings either as a tuple or as a map. I will change to_iovec and to_binary docs to mention they work exclusively with fixed-width binaries.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will need a function to return the mappings either as a tuple or as a map

Do you have a name in mind for that function? We could also return a dataframe for that. WDYT?

Also, I added the branch in iovec_from_series, but the original data is using UInt32, which we want to avoid.
Should I cast it to Int64?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to avoid UInt32. But we want to avoid multiple representations for the same dtype. I.e. integers should have only a single representation. Therefore it is fine to return UInt32 for categorical types. :)

dt => panic!("to_list/1 not implemented for {dt:?}"),
}
}
Expand All @@ -379,6 +414,11 @@ pub fn iovec_from_series(data: ExSeries, env: Env) -> Result<Term, ExplorerError
DataType::Datetime(TimeUnit::Microseconds, None) => {
series_to_iovec!(resource, s, env, datetime, i64)
}
DataType::Categorical(Some(_)) => {
let cat_series = s.cast(&DataType::UInt32)?;

series_to_iovec!(resource, cat_series, env, u32, u32)
}
dt => panic!("to_iovec/1 not implemented for {dt:?}"),
}
}
1 change: 1 addition & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ rustler::init!(
s_from_list_u32,
s_from_list_str,
s_from_list_binary,
s_from_list_categories,
s_from_binary_f64,
s_from_binary_i64,
s_from_binary_i32,
Expand Down
9 changes: 9 additions & 0 deletions native/explorer/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,15 @@ pub fn s_from_list_binary(name: &str, val: Vec<Option<Binary>>) -> ExSeries {
))
}

/// Builds a categorical series named `name` from a list of optional strings.
///
/// The values are first loaded into a regular series and then cast to
/// `DataType::Categorical(None)`. Panics if that cast fails.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_from_list_categories(name: &str, val: Vec<Option<String>>) -> ExSeries {
    let strings = Series::new(name, val.as_slice());
    let categorical = strings.cast(&DataType::Categorical(None)).unwrap();

    ExSeries::new(categorical)
}

macro_rules! from_binary {
($name:ident, $type:ty, $bytes:expr) => {
#[rustler::nif(schedule = "DirtyCpu")]
Expand Down
45 changes: 40 additions & 5 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,48 @@ defmodule Explorer.SeriesTest do
Series.from_list([<<228, 146, 51>>, <<22, 197, 116>>, <<42, 209, 236>>])
end
end

test "with strings as categories" do
  categories = ["a", "b", "c"]
  series = Series.from_list(categories, dtype: :categorical)

  # The values read back as plain strings while the dtype stays categorical.
  assert Series.to_list(series) === ["a", "b", "c"]
  assert Series.dtype(series) == :categorical
end
end

test "fetch/2" do
s = Series.from_list([1, 2, 3])
assert s[0] === 1
assert s[0..1] |> Series.to_list() === [1, 2]
assert s[[0, 1]] |> Series.to_list() === [1, 2]
describe "fetch/2" do
  test "integer series" do
    series = Series.from_list([1, 2, 3, nil, 5])

    # Access by single index, by range and by list of indexes.
    assert series[0] === 1
    assert Series.to_list(series[0..1]) === [1, 2]
    assert Series.to_list(series[[0, 1]]) === [1, 2]

    # A missing entry reads as nil; a negative index counts from the end.
    assert series[3] == nil
    assert series[-1] == 5
  end

  test "float series" do
    series = Series.from_list([1.2, 2.3, 3.4, nil, 5.6])

    # Access by single index, by range and by list of indexes.
    assert series[0] === 1.2
    assert Series.to_list(series[0..1]) === [1.2, 2.3]
    assert Series.to_list(series[[0, 1]]) === [1.2, 2.3]

    # A missing entry reads as nil; a negative index counts from the end.
    assert series[3] == nil
    assert series[-1] == 5.6
  end

  test "string series" do
    series = Series.from_list(["a", "b", nil, "d"])

    assert series[0] === "a"
    assert series[2] == nil
    assert series[-1] == "d"
  end

  test "categorical series" do
    # Same expectations as the string series: entries read back as strings.
    series = Series.from_list(["a", "b", nil, "d"], dtype: :categorical)

    assert series[0] === "a"
    assert series[2] == nil
    assert series[-1] == "d"
  end
end

test "pop/2" do
Expand Down