Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement relocate inspired by dplyr #619

Merged
merged 16 commits into from
Jun 16, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ defmodule Explorer.Backend.DataFrame do
@callback arrange_with(df, out_df :: df(), directions :: [{:asc | :desc, lazy_series()}]) :: df
@callback distinct(df, out_df :: df(), columns :: [column_name()]) :: df
@callback rename(df, out_df :: df(), [{old :: column_name(), new :: column_name()}]) :: df

# Moves `columns` to `position`. By the time the backend is called, `columns`
# has been resolved to a list of column names (kept in the order the caller
# asked for) and `position` is a zero-based index into the original column
# order. `out_df` carries the resulting column names.
@callback relocate(
            df,
            out_df :: df(),
            columns :: [column_name()],
            position :: non_neg_integer()
          ) :: df
@callback dummies(df, out_df :: df(), columns :: [column_name()]) :: df
@callback sample(
df,
Expand Down
120 changes: 120 additions & 0 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2973,6 +2973,126 @@ defmodule Explorer.DataFrame do
Shared.apply_impl(df, :drop_nil, [columns])
end

@doc """
Relocates columns.

Changes the column order within a DataFrame. The `:before` and `:after` options are
mutually exclusive. Providing no options relocates the columns to the beginning of
the DataFrame.

The columns are placed in the order they are given, so this function can also be
used to reorder them. A column that is given more than once is only relocated once.

## Options

  * `:before` - Relocates the columns before the given column. `:first` and `:last`
    can be passed to refer to the first and the last column of the DataFrame.

  * `:after` - Relocates the columns after the given column. `:first` and `:last`
    can be passed to refer to the first and the last column of the DataFrame.

## Examples

Relocate a single column

    iex> df = Explorer.DataFrame.new(a: ["a", "b", "a"], b: [1, 3, 1], c: [nil, 5, 6])
    iex> Explorer.DataFrame.relocate(df, "a", after: "c")
    #Explorer.DataFrame<
      Polars[3 x 3]
      b integer [1, 3, 1]
      c integer [nil, 5, 6]
      a string ["a", "b", "a"]
    >

Relocate (and reorder) multiple columns to the beginning

    iex> df = Explorer.DataFrame.new(a: [1, 2], b: [5.1, 5.2], c: [4, 5], d: ["yes", "no"])
    iex> Explorer.DataFrame.relocate(df, ["d", 1], before: :first)
    #Explorer.DataFrame<
      Polars[2 x 4]
      d string ["yes", "no"]
      b float [5.1, 5.2]
      a integer [1, 2]
      c integer [4, 5]
    >

Relocate before another column

    iex> df = Explorer.DataFrame.new(a: [1, 2], b: [5.1, 5.2], c: [4, 5], d: ["yes", "no"])
    iex> Explorer.DataFrame.relocate(df, ["a", "c"], before: "b")
    #Explorer.DataFrame<
      Polars[2 x 4]
      a integer [1, 2]
      c integer [4, 5]
      b float [5.1, 5.2]
      d string ["yes", "no"]
    >
"""
@doc type: :single
@spec relocate(
        df :: DataFrame.t(),
        columns :: [column()] | column(),
        opts :: Keyword.t()
      ) :: DataFrame.t()

def relocate(df, columns_or_column, opts)

def relocate(df, column, opts) when is_column(column),
  do: relocate(df, [column], opts)

def relocate(df, columns, opts) do
  opts = Keyword.validate!(opts, before: nil, after: nil)

  # A repeated column would otherwise show up twice in the computed names while
  # the backend relocates it only once, leaving `out_df` out of sync with the
  # actual frame. The first occurrence decides the column's place.
  columns = df |> to_existing_columns(columns) |> Enum.uniq()

  location =
    case {opts[:before], opts[:after]} do
      {nil, nil} ->
        # No option given: move the columns to the beginning.
        {:before, :first}

      {before_col, nil} ->
        {:before, before_col}

      {nil, after_col} ->
        {:after, after_col}

      {before_col, after_col} ->
        raise(
          ArgumentError,
          "only one location must be given. Got both " <>
            "before: #{inspect(before_col)} and after: #{inspect(after_col)}"
        )
    end

  {new_names, col_index} = relocate_columns(location, df, columns)

  out_df = %{df | names: new_names}

  Shared.apply_impl(df, :relocate, [out_df, columns, col_index])
end

# `:first` and `:last` are shorthands: they are swapped for the column indexes
# 0 and -1 and then handled by the general clause. Because these clauses match
# before any column lookup happens, the two atoms are never treated as column
# names here.
defp relocate_columns({side, :first}, df, columns) do
  relocate_columns({side, 0}, df, columns)
end

defp relocate_columns({side, :last}, df, columns) do
  relocate_columns({side, -1}, df, columns)
end
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this "special casing" is somewhat redundant when 0 and -1 can be provided as the index. Should I omit it in favor of consistency with other functions that don't use :first and :last in this way?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the readability of using :first | :last, so I would keep it. :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, there is an ambiguity if you have a column named `first` or `last`, so we should probably remove it. :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's true, although you'd still be able to specify the column with a string instead. I'm still leaning toward removing it, though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed in 09d2bf7, will look at the lazy data frames now!


# Computes the column order that results from moving `columns_to_relocate`
# next to `target_column`. Returns `{new_names, split_index}`, where
# `split_index` is the position in the original `df.names` at which the
# relocated columns are inserted.
defp relocate_columns({direction, target_column}, df, columns_to_relocate) do
  [target_name] = to_existing_columns(df, [target_column])

  # `:after` inserts one slot past the target column.
  shift =
    case direction do
      :before -> 0
      :after -> 1
    end

  split_index = Enum.find_index(df.names, &(&1 == target_name)) + shift

  {left, right} = Enum.split(df.names, split_index)

  # The relocated columns are dropped from both sides and re-inserted in
  # between, in the order the caller gave them.
  stays? = fn name -> name not in columns_to_relocate end

  new_names = Enum.filter(left, stays?) ++ columns_to_relocate ++ Enum.filter(right, stays?)

  {new_names, split_index}
end

@doc """
Renames columns.

Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,10 @@ defmodule Explorer.PolarsBackend.DataFrame do
def rename(%DataFrame{} = df, %DataFrame{} = out_df, pairs) do
  # Hands the pairs over to the native `df_rename_columns` function.
  Shared.apply_dataframe(df, out_df, :df_rename_columns, [pairs])
end

@impl true
def relocate(%DataFrame{} = df, %DataFrame{} = out_df, columns, position) do
  # `columns` and `position` are forwarded untouched to the native `df_relocate`.
  Shared.apply_dataframe(df, out_df, :df_relocate, [columns, position])
end

@impl true
def dummies(df, out_df, names) do
  # Hands the column names over to the native `df_to_dummies` function.
  Shared.apply_dataframe(df, out_df, :df_to_dummies, [names])
end
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
pivot_wider: 5,
pull: 2,
put: 4,
relocate: 4,
sample: 5,
slice: 2,
to_csv: 4,
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ defmodule Explorer.PolarsBackend.Native do
def df_pivot_wider(_df, _id_columns, _pivot_column, _values_column, _names_prefix), do: err()
def df_pull(_df, _name), do: err()
def df_put_column(_df, _series), do: err()
# `_columns` is the list of column names to move and `_position` the
# non-negative index they are moved to, matching `df_relocate` in
# native/explorer/src/dataframe.rs.
def df_relocate(_df, _columns, _position), do: err()
def df_rename_columns(_df, _old_new_pairs), do: err()
def df_sample_frac(_df, _frac, _with_replacement, _shuffle, _seed, _groups), do: err()
def df_sample_n(_df, _n, _with_replacement, _shuffle, _seed, _groups), do: err()
Expand Down
39 changes: 39 additions & 0 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,45 @@ pub fn df_from_series(columns: Vec<ExSeries>) -> Result<ExDataFrame, ExplorerErr
Ok(ExDataFrame::new(df))
}

// Moves the named `columns` so that they sit at `position`, keeping every
// other column in its current relative order.
//
// `position` is an index into the dataframe's current column list: columns
// before it stay on the left, columns from it onwards stay on the right, and
// the relocated columns go in between, ordered as they appear in `columns`.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_relocate(
    df: ExDataFrame,
    columns: Vec<&str>,
    position: u64,
) -> Result<ExDataFrame, ExplorerError> {
    // Rank of every requested column, i.e. its index in the caller's list.
    let mut rank: HashMap<&str, usize> = HashMap::new();
    for (index, name) in columns.into_iter().enumerate() {
        rank.insert(name, index);
    }

    let mut left = df.get_columns().to_owned();
    let right = left.split_off(position as usize);

    // A column stays where it is unless the caller asked to relocate it.
    let stays = |series: &Series| !rank.contains_key(series.name());

    let (kept_left, moved_left): (Vec<Series>, Vec<Series>) =
        left.into_iter().partition(stays);
    let (kept_right, moved_right): (Vec<Series>, Vec<Series>) =
        right.into_iter().partition(stays);

    // Relocated columns may come from either side of the split; put them in
    // the order the caller specified.
    let mut moved: Vec<Series> = moved_left.into_iter().chain(moved_right).collect();
    moved.sort_by_key(|series| *rank.get(series.name()).expect("column should exist"));

    let reordered: Vec<Series> = kept_left
        .into_iter()
        .chain(moved)
        .chain(kept_right)
        .collect();

    Ok(ExDataFrame::new(DataFrame::new(reordered)?))
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_rename_columns(
df: ExDataFrame,
Expand Down
1 change: 1 addition & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ rustler::init!(
df_pivot_wider,
df_pull,
df_put_column,
df_relocate,
df_rename_columns,
df_sample_frac,
df_sample_n,
Expand Down
101 changes: 101 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1951,6 +1951,107 @@ defmodule Explorer.DataFrameTest do
assert DF.to_columns(df4) == %{"a" => [1], "b" => [1]}
end

# `relocate` takes the dataframe, the column(s) and the options, hence arity 3.
describe "relocate/3" do
  test "with single column and relative" do
    df =
      DF.new(
        first: ["a", "b", "a"],
        second: ["x", "y", "z"],
        third: [2.2, 3.3, nil],
        last: [1, 3, 1]
      )

    df1 = DF.relocate(df, "first", after: "second")

    assert df1.names == ["second", "first", "third", "last"]
    assert Series.to_list(df1["first"]) == Series.to_list(df["first"])
    assert Series.to_list(df1["second"]) == Series.to_list(df["second"])
    assert Series.to_list(df1["third"]) == Series.to_list(df["third"])
    assert Series.to_list(df1["last"]) == Series.to_list(df["last"])

    df2 = DF.relocate(df, "second", before: "last")
    assert df2.names == ["first", "third", "second", "last"]

    df3 = DF.relocate(df, 0, after: 3)
    assert df3.names == ["second", "third", "last", "first"]
  end

  test "with multiple columns and relative" do
    df =
      DF.new(
        first: ["a", "b", "a"],
        second: ["x", "y", "z"],
        third: [2.2, 3.3, nil],
        last: [1, 3, 1]
      )

    df1 = DF.relocate(df, ["third", 1], before: -1)
    assert df1.names == ["first", "third", "second", "last"]

    df2 = DF.relocate(df, ["first", "last"], after: "third")
    assert df2.names == ["second", "third", "first", "last"]

    df3 = DF.relocate(df, ["second", "last"], before: 0)
    assert df3.names == ["second", "last", "first", "third"]

    # The target column may itself be one of the relocated columns.
    df4 = DF.relocate(df, ["third", "second"], after: "second")
    assert df4.names == ["first", "third", "second", "last"]
  end

  test "with the :last atom" do
    df =
      DF.new(
        a: ["a value", "some other value", "a third value!"],
        b: [0, 5, -2],
        c: [nil, nil, nil]
      )

    df1 = DF.relocate(df, "a", after: :last)
    assert df1.names == ["b", "c", "a"]

    df2 = DF.relocate(df, 0, before: :last)
    assert df2.names == ["b", "a", "c"]

    df3 = DF.relocate(df, [2, "a"], after: :last)
    assert df3.names == ["b", "c", "a"]
  end

  test "with the :first atom" do
    df =
      DF.new(
        a: ["a value", "some other value", "a third value!"],
        b: [0, 5, -2],
        c: [nil, nil, nil]
      )

    df1 = DF.relocate(df, "c", after: :first)
    assert df1.names == ["a", "c", "b"]

    df2 = DF.relocate(df, 2, before: :first)
    assert df2.names == ["c", "a", "b"]

    df3 = DF.relocate(df, ["b", "a"], after: :first)
    assert df3.names == ["b", "a", "c"]
  end

  test "with no options moves the columns to the beginning" do
    df = DF.new(a: [1, 2], b: [5.1, 5.2], c: ["yes", "no"])

    df1 = DF.relocate(df, ["c", "b"], [])
    assert df1.names == ["c", "b", "a"]
  end

  test "raises when both :before and :after are given" do
    df = DF.new(a: [1, 2], b: [5.1, 5.2], c: ["yes", "no"])

    assert_raise ArgumentError, ~r/only one location must be given/, fn ->
      DF.relocate(df, "a", before: "b", after: "c")
    end
  end

  test "ordered DataFrame output after relocation" do
    df1 =
      Explorer.DataFrame.new(
        a: [1, 2],
        b: [5.1, 5.2],
        c: [4, 5],
        d: ["yes", "no"],
        e: [4, 1]
      )

    df2 = DF.relocate(df1, [4, 0], before: 2)
    assert df2.names == ["b", "e", "a", "c", "d"]

    # The data must follow the names, not just the metadata.
    assert DF.dump_csv(df2) ==
             {:ok, "b,e,a,c,d\n5.1,4,1,4,yes\n5.2,1,2,5,no\n"}
  end
end

describe "rename/2" do
test "with lists" do
df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"])
Expand Down