Skip to content

Commit

Permalink
Fix representation of time series from us to ns (#596)
Browse files Browse the repository at this point in the history
Since Polars represents Time as nanoseconds internally, we should also
do the same. This is because some conversions, like casting to string,
cannot work properly if we treat the time as microseconds.
  • Loading branch information
philss authored May 11, 2023
1 parent dfea0a9 commit 25f8cd9
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 65 deletions.
2 changes: 1 addition & 1 deletion lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1320,7 +1320,7 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.new(%{
...> floats: Nx.tensor([1.0, 2.0], type: :f64),
...> times: Nx.tensor([3, 4])
...> times: Nx.tensor([3_000, 4_000])
...> }, dtypes: [times: :time])
#Explorer.DataFrame<
Polars[2 x 2]
Expand Down
37 changes: 20 additions & 17 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -253,9 +253,9 @@ defmodule Explorer.Series do
datetime [2022-04-13 21:00:42.000000]
>
It is possible to create a series of `:time` from a list of microseconds since midnight.
It is possible to create a series of `:time` from a list of nanoseconds since midnight.
iex> Explorer.Series.from_list([123 * 1_000 * 1_000], dtype: :time)
iex> Explorer.Series.from_list([123 * 1_000 * 1_000 * 1_000], dtype: :time)
#Explorer.Series<
Polars[1]
time [00:02:03.000000]
Expand Down Expand Up @@ -333,9 +333,9 @@ defmodule Explorer.Series do
date [0001-01-01, 1970-01-01, 1986-10-13]
>
Times are encoded as i64 representing microseconds from midnight:
Times are encoded as i64 representing nanoseconds from midnight:
iex> binary = <<0::signed-64-native, 86399999999::signed-64-native>>
iex> binary = <<0::signed-64-native, 86399999999000::signed-64-native>>
iex> Explorer.Series.from_binary(binary, :time)
#Explorer.Series<
Polars[2]
Expand Down Expand Up @@ -416,9 +416,10 @@ defmodule Explorer.Series do
date [0001-01-01, 1970-01-01, 1986-10-13]
>
Times are signed 64-bit and therefore must have their dtype explicitly given:
Times are signed 64-bit integers representing nanoseconds from midnight and
therefore must have their dtype explicitly given:
iex> tensor = Nx.tensor([0, 86399999999])
iex> tensor = Nx.tensor([0, 86399999999000])
iex> Explorer.Series.from_tensor(tensor, dtype: :time)
#Explorer.Series<
Polars[2]
Expand Down Expand Up @@ -597,11 +598,11 @@ defmodule Explorer.Series do
iex> Explorer.Series.to_iovec(series)
[<<-719162::signed-32-native, 0::signed-32-native, 6129::signed-32-native>>]
Times are encoded as i64 representing microseconds from midnight:
Times are encoded as i64 representing nanoseconds from midnight:
iex> series = Explorer.Series.from_list([~T[00:00:00.000000], ~T[23:59:59.999999]])
iex> Explorer.Series.to_iovec(series)
[<<0::signed-64-native, 86399999999::signed-64-native>>]
[<<0::signed-64-native, 86399999999000::signed-64-native>>]
Datetimes are encoded as i64 representing microseconds from the Unix epoch (1970-01-01):
Expand Down Expand Up @@ -729,16 +730,18 @@ defmodule Explorer.Series do
date [1970-01-02, 1970-01-03, 1970-01-04]
>
Note that `time` is represented as an integer of microseconds since midnight.
Note that `time` is represented as an integer of nanoseconds since midnight.
Elixir cannot represent nanoseconds, only microseconds, so be aware that any
sub-microsecond precision is lost when a conversion is needed (e.g. calling `to_list/1`).
iex> s = Explorer.Series.from_list([1, 2, 3])
iex> s = Explorer.Series.from_list([1_000, 2_000, 3_000])
iex> Explorer.Series.cast(s, :time)
#Explorer.Series<
Polars[3]
time [00:00:00.000001, 00:00:00.000002, 00:00:00.000003]
>
iex> s = Explorer.Series.from_list([86399 * 1_000 * 1_000])
iex> s = Explorer.Series.from_list([86399 * 1_000 * 1_000 * 1_000])
iex> Explorer.Series.cast(s, :time)
#Explorer.Series<
Polars[1]
Expand Down Expand Up @@ -1610,9 +1613,9 @@ defmodule Explorer.Series do
iex> Explorer.Series.min(s)
~N[1999-12-31 00:00:00.000000]
iex> s = Explorer.Series.from_list([~T[00:02:03.000000], ~T[00:05:04.000000]])
iex> s = Explorer.Series.from_list([~T[00:02:03.000451], ~T[00:05:04.000134]])
iex> Explorer.Series.min(s)
~T[00:02:03.000000]
~T[00:02:03.000451]
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.min(s)
Expand Down Expand Up @@ -1656,9 +1659,9 @@ defmodule Explorer.Series do
iex> Explorer.Series.max(s)
~N[2021-01-01 00:00:00.000000]
iex> s = Explorer.Series.from_list([~T[00:02:03.000000], ~T[00:05:04.000000]])
iex> s = Explorer.Series.from_list([~T[00:02:03.000212], ~T[00:05:04.000456]])
iex> Explorer.Series.max(s)
~T[00:05:04.000000]
~T[00:05:04.000456]
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.max(s)
Expand Down Expand Up @@ -1819,9 +1822,9 @@ defmodule Explorer.Series do
iex> Explorer.Series.quantile(s, 0.5)
~N[2021-01-01 00:00:00.000000]
iex> s = Explorer.Series.from_list([~T[01:55:00.000000], ~T[15:35:00.000000], ~T[23:00:00.000000]])
iex> s = Explorer.Series.from_list([~T[01:55:00], ~T[15:35:00], ~T[23:00:00]])
iex> Explorer.Series.quantile(s, 0.5)
~T[15:35:00.000000]
~T[15:35:00]
iex> s = Explorer.Series.from_list([true, false, true])
iex> Explorer.Series.quantile(s, 0.5)
Expand Down
39 changes: 23 additions & 16 deletions native/explorer/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,24 +301,30 @@ pub struct ExTime {
pub microsecond: (u32, u32),
}

pub use polars::export::arrow::temporal_conversions::time64us_to_time as timestamp_to_time;
pub use polars::export::arrow::temporal_conversions::time64ns_to_time;

impl From<i64> for ExTime {
fn from(microseconds: i64) -> Self {
timestamp_to_time(microseconds).into()
fn from(nanoseconds: i64) -> Self {
time64ns_to_time(nanoseconds).into()
}
}

// In Polars, Time is represented as an i64 in nanoseconds.
// Since Elixir does not have nanosecond precision, we just ignore the extra
// precision when it is available.
impl From<ExTime> for i64 {
fn from(t: ExTime) -> i64 {
let midnight = NaiveTime::from_hms_opt(0, 0, 0).unwrap();
let duration = NaiveTime::from_hms_micro_opt(t.hour, t.minute, t.second, t.microsecond.0)
.unwrap()
.signed_duration_since(NaiveTime::from_hms_opt(0, 0, 0).unwrap());

match duration.num_microseconds() {
Some(us) => us,
None => duration.num_milliseconds() * 1_000,
}
.signed_duration_since(midnight);

duration.num_nanoseconds().unwrap_or_else(|| {
duration
.num_microseconds()
.unwrap_or_else(|| duration.num_milliseconds() * 1_000)
* 1_000
})
}
}

Expand All @@ -330,19 +336,20 @@ impl From<ExTime> for NaiveTime {

impl From<NaiveTime> for ExTime {
fn from(t: NaiveTime) -> Self {
let microseconds = t
.signed_duration_since(
NaiveTime::from_hms_opt(t.hour(), t.minute(), t.second()).unwrap(),
)
.num_microseconds()
.unwrap();
let microseconds = t.nanosecond() / 1_000;

let ex_microseconds = if microseconds > 0 {
(microseconds_six_digits(microseconds), 6)
} else {
(0, 0)
};

ExTime {
calendar: atoms::calendar_iso_module(),
hour: t.hour(),
minute: t.minute(),
second: t.second(),
microsecond: (microseconds_six_digits(microseconds.try_into().unwrap()), 6),
microsecond: ex_microseconds,
}
}
}
Expand Down
16 changes: 3 additions & 13 deletions native/explorer/src/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::atoms::{
year,
};
use crate::datatypes::{
days_to_date, timestamp_to_datetime, timestamp_to_time, ExSeries, ExSeriesRef,
days_to_date, time64ns_to_time, timestamp_to_datetime, ExSeries, ExSeriesRef,
};
use crate::ExplorerError;

Expand Down Expand Up @@ -213,18 +213,8 @@ fn datetime_series_to_list<'b>(

macro_rules! unsafe_encode_time {
($v: expr, $naive_time_struct_keys: ident, $calendar_iso_module: ident, $time_module: ident, $env: ident) => {{
let t = timestamp_to_time($v);
let duration =
NaiveTime::from_hms_micro_opt(t.hour(), t.minute(), t.second(), t.nanosecond() / 1_000)
.unwrap()
.signed_duration_since(
NaiveTime::from_hms_opt(t.hour(), t.minute(), t.second()).unwrap(),
);

let microseconds = match duration.num_microseconds() {
Some(us) => us,
None => duration.num_milliseconds() * 1_000,
};
let t = time64ns_to_time($v);
let microseconds = t.nanosecond() / 1_000;

// Limit the number of digits in the microsecond part of a timestamp to 6.
// This is necessary because the microsecond field of an Elixir time holds at most 6 digits.
Expand Down
27 changes: 9 additions & 18 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2514,23 +2514,14 @@ defmodule Explorer.SeriesTest do
end

test "with two time series" do
s1 = Series.from_list([~T[01:00:00.000000], ~T[02:00:00.000000]])
s2 = Series.from_list([~T[03:00:00.000000], ~T[04:00:00.000000]])
# Notice that Polars drops the microseconds part when converting
# a Time series to String series.
# See: https://github.com/pola-rs/polars/pull/8351
s1 = Series.from_list([~T[01:00:00.000543], ~T[02:00:00.000000]])
s2 = Series.from_list([~T[03:00:00.000000], ~T[04:00:00.000201]])

assert Series.format([s1, s2]) |> Series.to_list() ==
["360000000010800000000", "720000000014400000000"]
end

test "with many time series with separator" do
s1 = Series.from_list([~T[01:00:00.000000], ~T[02:00:00.000000]])
s2 = Series.from_list([~T[03:00:00.000000], ~T[04:00:00.000000]])
s3 = Series.from_list([~T[05:00:00.000000], ~T[06:00:00.000000]])
s4 = Series.from_list([~T[07:00:00.000000], ~T[08:00:00.000000]])

assert Series.format([s1, " / ", s2, " - ", s3, " / ", s4]) |> Series.to_list() == [
"3600000000 / 10800000000 - 18000000000 / 25200000000",
"7200000000 / 14400000000 - 21600000000 / 28800000000"
]
assert Series.format([s1, " <=> ", s2]) |> Series.to_list() ==
["3600000543000 <=> 10800000000000", "7200000000000 <=> 14400000201000"]
end

test "with two datetime series" do
Expand Down Expand Up @@ -3059,7 +3050,7 @@ defmodule Explorer.SeriesTest do
end

test "integer series to time" do
s = Series.from_list([1, 2, 3])
s = Series.from_list([1, 2, 3]) |> Series.multiply(1_000)
s1 = Series.cast(s, :time)

assert Series.to_list(s1) == [
Expand All @@ -3070,7 +3061,7 @@ defmodule Explorer.SeriesTest do

assert Series.dtype(s1) == :time

s2 = Series.from_list([86399 * 1_000 * 1_000])
s2 = Series.from_list([86399 * 1_000 * 1_000 * 1_000])
s3 = Series.cast(s2, :time)

assert Series.to_list(s3) == [~T[23:59:59.000000]]
Expand Down

0 comments on commit 25f8cd9

Please sign in to comment.