Skip to content

Commit

Permalink
Add Struct json_decode/3 for decoding json from string (#841)
Browse files Browse the repository at this point in the history
  • Loading branch information
lkarthee authored Feb 1, 2024
1 parent 8b03726 commit 42075a0
Show file tree
Hide file tree
Showing 13 changed files with 152 additions and 0 deletions.
8 changes: 8 additions & 0 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ defmodule Explorer.Backend.LazySeries do
downcase: 1,
substring: 3,
split: 2,
json_decode: 2,
# Float round
round: 2,
floor: 1,
Expand Down Expand Up @@ -1094,6 +1095,13 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, dtype)
end

@impl true
def json_decode(series, dtype) do
  # The lazy node records the operation name together with its arguments
  # (the lazy form of the input series and the target dtype); the result is
  # wrapped in a backend series carrying that same target dtype.
  args = [lazy_series!(series), dtype]

  :json_decode
  |> new(args, dtype)
  |> Backend.Series.new(dtype)
end

@remaining_non_lazy_operations [
at: 2,
at_every: 2,
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ defmodule Explorer.Backend.Series do
@callback rstrip(s, String.t() | nil) :: s
@callback substring(s, integer(), non_neg_integer() | nil) :: s
@callback split(s, String.t()) :: s
# Decodes a string series containing JSON into a series of the given dtype.
@callback json_decode(s, dtype()) :: s

# Date / DateTime

Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ defmodule Explorer.PolarsBackend.Expression do
upcase: 1,
substring: 3,
split: 2,
json_decode: 2,

# Lists
join: 2,
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_member(_s, _value, _inner_dtype), do: err()

def s_field(_s, _name), do: err()
# Elixir-side stub for the `s_json_decode` NIF registered by the native crate.
# As written it only calls `err/0`, which raises `:nif_not_loaded`.
def s_json_decode(_s, _dtype), do: err()

defp err, do: :erlang.nif_error(:nif_not_loaded)
end
4 changes: 4 additions & 0 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,10 @@ defmodule Explorer.PolarsBackend.Series do
def field(%Series{dtype: {:struct, _inner_dtype}} = series, name),
do: Shared.apply_series(series, :s_field, [name])

@impl true
def json_decode(series, dtype) do
  # Delegates to the native `s_json_decode` function, passing the target
  # dtype as its only extra argument.
  Shared.apply_series(series, :s_json_decode, [dtype])
end

# Polars specific functions

def name(series), do: Shared.apply_series(series, :s_name)
Expand Down
39 changes: 39 additions & 0 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6043,6 +6043,45 @@ defmodule Explorer.Series do
end
end

@doc """
Decodes a string series containing valid JSON according to `dtype`.

Only series of dtype `:string` are accepted. The given `dtype` is normalised
before being handed to the backend.

## Examples

    iex> s = Series.from_list(["1"])
    iex> Series.json_decode(s, {:s, 64})
    #Explorer.Series<
      Polars[1]
      s64 [1]
    >

    iex> s = Series.from_list(["{\\"a\\":1}"])
    iex> Series.json_decode(s, {:struct, %{"a" => {:s, 64}}})
    #Explorer.Series<
      Polars[1]
      struct[1] [%{"a" => 1}]
    >

If the decoded value does not match the given `dtype`,
nil is returned for the given entry:

    iex> s = Series.from_list(["\\"1\\""])
    iex> Series.json_decode(s, {:s, 64})
    #Explorer.Series<
      Polars[1]
      s64 [nil]
    >

It raises an exception if the string is invalid JSON.
"""
@doc type: :string_wise
@spec json_decode(Series.t(), dtype()) :: Series.t()
def json_decode(%Series{dtype: :string} = series, dtype) do
  dtype = Shared.normalise_dtype!(dtype)

  apply_series(series, :json_decode, [dtype])
end

# Helpers

defp apply_series(series, fun, args \\ []) do
Expand Down
14 changes: 14 additions & 0 deletions native/explorer/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ object_store = { version = "0.8", default-features = false, optional = true }
[target.'cfg(not(any(all(windows, target_env = "gnu"), all(target_os = "linux", target_env = "musl"))))'.dependencies]
mimalloc = { version = "*", default-features = false }

[patch.crates-io]
jsonpath_lib = { version = "0.3", git = "https://github.com/ritchie46/jsonpath", branch = "improve_compiled" }

[dependencies.polars]
version = "0.36"
default-features = false
Expand Down Expand Up @@ -79,6 +82,7 @@ features = [
"moment",
"rank",
"propagate_nans",
"extract_jsonpath"
]

[dependencies.polars-ops]
Expand Down
7 changes: 7 additions & 0 deletions native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1065,3 +1065,10 @@ pub fn expr_field(expr: ExExpr, name: &str) -> ExExpr {
let expr = expr.clone_inner().struct_().field_by_name(name);
ExExpr::new(expr)
}

/// Builds an expression that decodes the string values produced by `expr`
/// as JSON, using the Polars dtype converted from `ex_dtype`.
///
/// Panics (through `unwrap`) when `ex_dtype` cannot be converted into a
/// `DataType`.
#[rustler::nif]
pub fn expr_json_decode(expr: ExExpr, ex_dtype: ExSeriesDtype) -> ExExpr {
    let target = DataType::try_from(&ex_dtype).unwrap();
    // The second argument of `json_decode` is left as `None`, as before.
    ExExpr::new(expr.clone_inner().str().json_decode(Some(target), None))
}
2 changes: 2 additions & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ rustler::init!(
expr_member,
// struct expressions
expr_field,
expr_json_decode,
// lazyframe
lf_collect,
lf_describe_plan,
Expand Down Expand Up @@ -480,6 +481,7 @@ rustler::init!(
s_lengths,
s_member,
s_field,
s_json_decode,
],
load = on_load
);
17 changes: 17 additions & 0 deletions native/explorer/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1824,3 +1824,20 @@ pub fn s_field(s: ExSeries, name: &str) -> Result<ExSeries, ExplorerError> {
.clone();
Ok(ExSeries::new(s2))
}

/// Decodes the string values of `s` as JSON into the Polars dtype converted
/// from `ex_dtype`, returning a new series with the same name.
///
/// The series is turned into a single-column frame so the decode can be run
/// as a lazy expression; the resulting column is then taken back out by name.
///
/// Returns `Err` when the dtype conversion, the `collect`, or the column
/// lookup fails.
#[rustler::nif]
pub fn s_json_decode(s: ExSeries, ex_dtype: ExSeriesDtype) -> Result<ExSeries, ExplorerError> {
    // Propagate a failed dtype conversion as an error instead of panicking
    // inside the NIF.
    // NOTE(review): assumes the conversion's error type is (or converts into)
    // `ExplorerError` — confirm against the `TryFrom` impl.
    let dtype = DataType::try_from(&ex_dtype)?;
    let name = s.name();
    let decoded = s
        .clone_inner()
        .into_frame()
        .lazy()
        .select([col(name)
            .str()
            .json_decode(Some(dtype), None)
            .alias(name)])
        .collect()?
        .column(name)?
        .clone();
    Ok(ExSeries::new(decoded))
}
30 changes: 30 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4278,4 +4278,34 @@ defmodule Explorer.DataFrameTest do
}
end
end

describe "json_decode/2" do
  test "decodes primitives, lists, structs" do
    row = %{st: "{\"n\": 1}", f: "1.0", l: "[1]", dt: "1"}
    df = DF.new([row], lazy: true)

    decoded =
      DF.mutate(df,
        st: json_decode(st, {:struct, %{"n" => {:s, 64}}}),
        f: json_decode(f, {:f, 64}),
        l: json_decode(l, {:list, {:s, 64}}),
        dt: json_decode(dt, {:datetime, :microsecond})
      )

    # The source frame still reports every column as a string.
    source_dtypes = %{"dt" => :string, "f" => :string, "l" => :string, "st" => :string}
    assert df.dtypes == source_dtypes

    # The mutated frame reports the dtypes requested in each json_decode call.
    decoded_dtypes = %{
      "dt" => {:datetime, :microsecond},
      "f" => {:f, 64},
      "l" => {:list, {:s, 64}},
      "st" => {:struct, %{"n" => {:s, 64}}}
    }

    assert decoded.dtypes == decoded_dtypes

    decoded_columns = %{
      "dt" => [~N[1970-01-01 00:00:00.000001]],
      "f" => [1.0],
      "l" => [[1]],
      "st" => [%{"n" => 1}]
    }

    assert decoded |> DF.collect() |> DF.to_columns() == decoded_columns
  end
end
end
24 changes: 24 additions & 0 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -5866,4 +5866,28 @@ defmodule Explorer.SeriesTest do
end
end
end

describe "json_decode/2" do
  test "raises for invalid json" do
    expected_message =
      "Polars Error: error deserializing JSON: json parsing error: 'InternalError(TapeError) at character 1 ('a')'"

    assert_raise RuntimeError, expected_message, fn ->
      ["a"]
      |> Series.from_list()
      |> Series.json_decode(:string)
    end
  end

  test "extracts primitive from json and nil for mismatch" do
    # The second entry is a JSON string, which does not fit the integer dtype.
    decoded =
      ["1", "\"a\""]
      |> Series.from_list()
      |> Series.json_decode({:s, 64})

    assert decoded.dtype == {:s, 64}
    assert Series.to_list(decoded) == [1, nil]
  end

  test "extracts struct from json with dtype" do
    struct_dtype = {:struct, %{"n" => {:f, 64}}}

    decoded =
      ["{\"n\": 1}"]
      |> Series.from_list()
      |> Series.json_decode(struct_dtype)

    assert decoded.dtype == {:struct, %{"n" => {:f, 64}}}
    assert Series.to_list(decoded) == [%{"n" => 1.0}]
  end
end
end

0 comments on commit 42075a0

Please sign in to comment.