elixir-explorer · philss · Jul 3, 2024 · Jul 1, 2024 · Jul 2, 2024
diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex
@@ -880,7 +880,8 @@ defmodule Explorer.PolarsBackend.DataFrame do
         df.data
       end)
 
-    Shared.apply_dataframe(head, out_df, :df_concat_columns, [tail])
+    out_data = Shared.apply(:df_concat_columns, [[head.data | tail]])
+    %{out_df | data: out_data}
   end
 
   # Groups

diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex
@@ -600,8 +600,12 @@ defmodule Explorer.PolarsBackend.LazyFrame do
   end
 
   @impl true
-  def concat_columns([%DF{} = head | tail], %DF{} = out_df) do
-    Shared.apply_dataframe(head, out_df, :lf_concat_columns, [Enum.map(tail, & &1.data)])
+  # Sends the backend data of every frame, head first, to the native
+  # `lf_concat_columns` call and stores the result as the data of `out_df`.
+  def concat_columns([%DF{} | _] = dfs, %DF{} = out_df) do
+    polars_ldfs = Enum.map(dfs, & &1.data)
+    concatenated = Shared.apply(:lf_concat_columns, [polars_ldfs])
+    %{out_df | data: concatenated}
   end
 
   @impl true

diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
@@ -72,7 +72,7 @@ defmodule Explorer.PolarsBackend.Native do
       ),
       do: err()
 
-  def df_concat_columns(_df, _others), do: err()
+  def df_concat_columns(_dfs), do: err()
   def df_drop(_df, _name), do: err()
   def df_dtypes(_df), do: err()
   def df_dump_csv(_df, _has_headers, _delimiter), do: err()
@@ -267,7 +267,7 @@ defmodule Explorer.PolarsBackend.Native do
   def lf_pivot_longer(_df, _id_vars, _value_vars, _names_to, _values_to), do: err()
   def lf_join(_df, _other, _left_on, _right_on, _how, _suffix), do: err()
   def lf_concat_rows(_dfs), do: err()
-  def lf_concat_columns(_df, _others), do: err()
+  def lf_concat_columns(_ldfs), do: err()
   def lf_to_parquet(_df, _filename, _compression, _streaming), do: err()
   def lf_to_parquet_cloud(_df, _filename, _compression), do: err()
   def lf_to_ipc(_df, _filename, _compression, _streaming), do: err()

diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs
@@ -65,33 +65,38 @@ pub fn df_width(df: ExDataFrame) -> Result<usize, ExplorerError> {
 }
 
+/// Concatenates the columns of `dfs`, in order, into a single dataframe.
+///
+/// A column whose name was already used by an earlier column is renamed to
+/// `{name}_{idx}`, where `idx` is the zero-based position of its dataframe in
+/// `dfs`. Any error from `DataFrame::new` is returned to the caller.
 #[rustler::nif(schedule = "DirtyCpu")]
-pub fn df_concat_columns(
-    data: ExDataFrame,
-    others: Vec<ExDataFrame>,
-) -> Result<ExDataFrame, ExplorerError> {
-    let id_column = "__row_count_id__";
-    let first = data.clone_inner().lazy().with_row_index(id_column, None);
-
-    // We need to be able to handle arbitrary column name overlap.
-    // This builds up a join and suffixes conflicting names with _N where
-    // N is the index of the df in the join array.
-    let (out_df, _) = others
-        .iter()
-        .map(|data| data.clone_inner().lazy().with_row_index(id_column, None))
-        .fold((first, 1), |(acc_df, count), lazy_df| {
-            let suffix = format!("_{count}");
-            let new_df = acc_df
-                .join_builder()
-                .with(lazy_df)
-                .how(JoinType::Inner)
-                .left_on([col(id_column)])
-                .right_on([col(id_column)])
-                .suffix(suffix)
-                .finish();
-            (new_df, count + 1)
-        });
-
-    Ok(ExDataFrame::new(out_df.drop([id_column]).collect()?))
+pub fn df_concat_columns(dfs: Vec<ExDataFrame>) -> Result<ExDataFrame, ExplorerError> {
+    // Every column name emitted so far, including suffixed ones.
+    let mut taken_names = PlHashSet::new();
+    let mut out_columns: Vec<Series> = Vec::new();
+
+    for (position, ex_df) in dfs.iter().enumerate() {
+        let frame = ex_df.clone_inner();
+
+        for column in frame.get_columns() {
+            let mut series = column.clone();
+            let original_name = column.name();
+
+            if taken_names.contains(original_name) {
+                let suffixed = format!("{original_name}_{position}");
+                series.rename(&suffixed);
+                taken_names.insert(suffixed);
+            } else {
+                taken_names.insert(original_name.to_string());
+            }
+
+            out_columns.push(series);
+        }
+    }
+
+    let out_df = DataFrame::new(out_columns)?;
+
+    Ok(ExDataFrame::new(out_df))
 }
 
 #[rustler::nif(schedule = "DirtyCpu")]

diff --git a/native/explorer/src/lazyframe.rs b/native/explorer/src/lazyframe.rs
@@ -314,34 +314,54 @@ pub fn lf_concat_rows(lazy_frames: Vec<ExLazyFrame>) -> Result<ExLazyFrame, Expl
     Ok(ExLazyFrame::new(out_df))
 }
 
-#[rustler::nif]
-pub fn lf_concat_columns(
-    data: ExLazyFrame,
-    others: Vec<ExLazyFrame>,
-) -> Result<ExLazyFrame, ExplorerError> {
-    let id_column = "__row_count_id__";
-    let first = data.clone_inner().with_row_index(id_column, None);
-
-    // We need to be able to handle arbitrary column name overlap.
-    // This builds up a join and suffixes conflicting names with _N where
-    // N is the index of the df in the join array.
-    let (out_df, _) = others
-        .iter()
-        .map(|data| data.clone_inner().with_row_index(id_column, None))
-        .fold((first, 1), |(acc_df, count), df| {
-            let suffix = format!("_{count}");
-            let new_df = acc_df
-                .join_builder()
-                .with(df)
-                .how(JoinType::Inner)
-                .left_on([col(id_column)])
-                .right_on([col(id_column)])
-                .suffix(suffix)
-                .finish();
-            (new_df, count + 1)
-        });
-
-    Ok(ExLazyFrame::new(out_df.drop([id_column])))
+/// Concatenates lazy frames column-wise, in the order given.
+///
+/// A column whose name was already used by an earlier column is renamed to
+/// `{name}_{idx}`, where `idx` is the zero-based position of its lazy frame in
+/// `ldfs`. A failure to resolve a frame's schema is returned as an error instead
+/// of panicking.
+#[rustler::nif(schedule = "DirtyCpu")]
+pub fn lf_concat_columns(ldfs: Vec<ExLazyFrame>) -> Result<ExLazyFrame, ExplorerError> {
+    // Every column name emitted so far, including suffixed ones.
+    let mut previous_names = PlHashSet::new();
+    let mut renamed_ldfs: Vec<LazyFrame> = Vec::with_capacity(ldfs.len());
+
+    for (idx, ex_ldf) in ldfs.iter().enumerate() {
+        let ldf = ex_ldf.clone_inner();
+
+        // `schema()` is fallible; hand its error back to the caller with `?`
+        // instead of unwinding inside the NIF.
+        let names: Vec<String> = ldf
+            .schema()?
+            .iter_names()
+            .map(|smart_string| smart_string.to_string())
+            .collect();
+
+        let mut old_names = vec![];
+        let mut new_names = vec![];
+
+        for name in names {
+            if previous_names.contains(&name) {
+                let new_name = format!("{name}_{idx}");
+                previous_names.insert(new_name.clone());
+                old_names.push(name);
+                new_names.push(new_name);
+            } else {
+                previous_names.insert(name);
+            }
+        }
+
+        let renamed = if old_names.is_empty() {
+            ldf
+        } else {
+            ldf.rename(old_names, new_names)
+        };
+        renamed_ldfs.push(renamed);
+    }
+
+    let out_ldf = concat_lf_horizontal(renamed_ldfs, UnionArgs::default())?;
+
+    Ok(ExLazyFrame::new(out_ldf))
 }
 
 #[rustler::nif]

diff --git a/test/explorer/data_frame/lazy_test.exs b/test/explorer/data_frame/lazy_test.exs
@@ -1524,7 +1524,7 @@ defmodule Explorer.DataFrame.LazyTest do
              }
     end
 
-    test "with a bigger df in the right side removes the last row" do
+    test "with a bigger df in the right side add nils for smaller columns" do
       ldf1 = DF.new([x: [1, 2, 3], y: ["a", "b", "c"]], lazy: true)
       ldf2 = DF.new([z: [4, 5, 6, 7], a: ["d", "e", "f", "g"]], lazy: true)
 
@@ -1535,10 +1535,10 @@ defmodule Explorer.DataFrame.LazyTest do
       df = DF.compute(ldf)
 
       assert DF.to_columns(df, atom_keys: true) == %{
-               x: [1, 2, 3],
-               y: ["a", "b", "c"],
-               z: [4, 5, 6],
-               a: ["d", "e", "f"]
+               x: [1, 2, 3, nil],
+               y: ["a", "b", "c", nil],
+               z: [4, 5, 6, 7],
+               a: ["d", "e", "f", "g"]
              }
     end
   end