Update Polars to v0.43.1 (#985)

The most relevant change is internal. Now most of the strings are represented by the PlSmallStr type. Changes upstream: - https://github.com/pola-rs/polars/releases/tag/rs-0.43.0 - Version 0.43.0 - https://github.com/pola-rs/polars/releases/tag/rs-0.43.1 - Version 0.43.1
elixir-explorer · Sep 13, 2024 · 56026d5 · 56026d5
1 parent 0d27533
commit 56026d5
Show file tree

Hide file tree

Showing 11 changed files with 230 additions and 195 deletions.
diff --git a/native/explorer/Cargo.lock b/native/explorer/Cargo.lock
diff --git a/native/explorer/Cargo.toml b/native/explorer/Cargo.toml
@@ -33,7 +33,7 @@ object_store = { version = "0.10", default-features = false, optional = true }
 mimalloc = { version = "*", default-features = false }
 
 [dependencies.polars]
-version = "0.42"
+version = "0.43"
 default-features = false
 features = [
   "abs",
@@ -81,15 +81,9 @@ features = [
 ]
 
 [dependencies.polars-ops]
-version = "0.42"
+version = "0.43"
 features = ["abs", "ewma", "cum_agg", "cov"]
 
-# This dep is only needed to activate "timezones" feature
-# for the polars-json crate. We should remove when Polars fixes it.
-[dependencies.polars-json]
-version = "*"
-features = ["timezones", "chrono-tz"]
-
 [features]
 default = ["ndjson", "cloud", "nif_version_2_15"]
 

diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs
@@ -29,7 +29,11 @@ pub fn df_transpose(
 
 #[rustler::nif]
 pub fn df_names(df: ExDataFrame) -> Result<Vec<String>, ExplorerError> {
-    let names = to_string_names(df.get_column_names());
+    let names = df
+        .get_column_names()
+        .iter()
+        .map(|name| name.to_string())
+        .collect();
     Ok(names)
 }
 
@@ -73,10 +77,10 @@ pub fn df_concat_columns(dfs: Vec<ExDataFrame>) -> Result<ExDataFrame, ExplorerE
                 .iter()
                 .map(|col| {
                     let name = col.name();
-                    if previous_names.contains(name) {
+                    if previous_names.contains(&name.clone().to_string()) {
                         let new_name = format!("{name}_{idx}");
                         previous_names.insert(new_name.clone());
-                        col.clone().rename(&new_name).to_owned()
+                        col.clone().rename(new_name.into()).to_owned()
                     } else {
                         previous_names.insert(name.to_string());
                         col.clone().to_owned()
@@ -125,7 +129,7 @@ pub fn df_slice_by_indices(
     indices: Vec<u32>,
     groups: Vec<&str>,
 ) -> Result<ExDataFrame, ExplorerError> {
-    let idx = UInt32Chunked::from_vec("idx", indices);
+    let idx = UInt32Chunked::from_vec("idx".into(), indices);
     let new_df = if groups.is_empty() {
         df.take(&idx)?
     } else {
@@ -167,7 +171,7 @@ pub fn df_sample_n(
     seed: Option<u64>,
     groups: Vec<String>,
 ) -> Result<ExDataFrame, ExplorerError> {
-    let n_s = Series::new("n", &[n]);
+    let n_s = Series::new("n".into(), &[n]);
     let new_df = if groups.is_empty() {
         df.sample_n(&n_s, replace, shuffle, seed)?
     } else {
@@ -187,7 +191,7 @@ pub fn df_sample_frac(
     seed: Option<u64>,
     groups: Vec<String>,
 ) -> Result<ExDataFrame, ExplorerError> {
-    let frac_s = Series::new("frac", &[frac]);
+    let frac_s = Series::new("frac".into(), &[frac]);
     let new_df = if groups.is_empty() {
         df.sample_frac(&frac_s, replace, shuffle, seed)?
     } else {
@@ -396,7 +400,7 @@ pub fn df_pivot_wider(
         .collect();
 
     for (id_name, new_name) in id_columns.iter().zip(&temp_id_names) {
-        df.rename(id_name, new_name)?;
+        df.rename(id_name, new_name.into())?;
     }
 
     let mut new_df = pivot_stable(
@@ -465,7 +469,7 @@ pub fn df_lazy(df: ExDataFrame) -> Result<ExLazyFrame, ExplorerError> {
 
 #[rustler::nif(schedule = "DirtyCpu")]
 pub fn df_re_dtype(pattern: &str) -> Result<ExSeriesDtype, ExplorerError> {
-    let s = Series::new("dummy", [""])
+    let s = Series::new("dummy".into(), [""])
         .into_frame()
         .lazy()
         .with_column(col("dummy").str().extract_groups(pattern)?.alias("dummy"))

diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs
@@ -56,15 +56,22 @@ pub fn df_from_csv(
         .with_skip_rows_after_header(skip_rows_after_header)
         .with_projection(projection.map(Arc::new))
         .with_rechunk(do_rechunk)
-        .with_columns(column_names.map(Arc::from))
+        .with_columns(column_names.map(|names| {
+            names
+                .iter()
+                .map(|name| PlSmallStr::from_string(name.clone()))
+                .collect()
+        }))
         .with_parse_options(
             CsvParseOptions::default()
                 .with_encoding(encoding)
                 .with_truncate_ragged_lines(true)
                 .with_try_parse_dates(parse_dates)
                 .with_separator(delimiter_as_byte)
                 .with_eol_char(eol_delimiter.unwrap_or(b'\n'))
-                .with_null_values(Some(NullValues::AllColumns(null_vals))),
+                .with_null_values(Some(NullValues::AllColumns(
+                    null_vals.iter().map(|val| val.into()).collect(),
+                ))),
         )
         .try_into_reader_with_file_path(Some(filename.into()))?
         .finish();
@@ -79,7 +86,7 @@ pub fn schema_from_dtypes_pairs(
         return Ok(None);
     }
 
-    let mut schema = Schema::new();
+    let mut schema = Schema::with_capacity(dtypes.len());
     for (name, ex_dtype) in dtypes {
         let dtype = DataType::try_from(&ex_dtype)?;
         schema.with_column(name.into(), dtype);
@@ -174,7 +181,12 @@ pub fn df_load_csv(
         .with_has_header(has_header)
         .with_infer_schema_length(infer_schema_length)
         .with_n_rows(stop_after_n_rows)
-        .with_columns(column_names.map(Arc::from))
+        .with_columns(column_names.map(|names| {
+            names
+                .iter()
+                .map(|name| PlSmallStr::from_string(name.clone()))
+                .collect()
+        }))
         .with_skip_rows(skip_rows)
         .with_skip_rows_after_header(skip_rows_after_header)
         .with_projection(projection.map(Arc::new))
@@ -183,7 +195,9 @@ pub fn df_load_csv(
             CsvParseOptions::default()
                 .with_separator(delimiter_as_byte)
                 .with_encoding(encoding)
-                .with_null_values(Some(NullValues::AllColumns(null_vals)))
+                .with_null_values(Some(NullValues::AllColumns(
+                    null_vals.iter().map(|x| x.into()).collect(),
+                )))
                 .with_try_parse_dates(parse_dates)
                 .with_eol_char(eol_delimiter.unwrap_or(b'\n')),
         )

diff --git a/native/explorer/src/datatypes.rs b/native/explorer/src/datatypes.rs
@@ -493,7 +493,7 @@ impl<'tz> Literal for ExDateTime<'tz> {
         Expr::Literal(LiteralValue::DateTime(
             ndt.and_utc().timestamp_micros(),
             TimeUnit::Microseconds,
-            Some(time_zone),
+            Some(time_zone.into()),
         ))
     }
 }

diff --git a/native/explorer/src/datatypes/ex_dtypes.rs b/native/explorer/src/datatypes/ex_dtypes.rs
@@ -107,8 +107,7 @@ impl TryFrom<&DataType> for ExSeriesDtype {
                 let mut struct_fields = Vec::new();
 
                 for field in fields {
-                    struct_fields
-                        .push((field.name().to_string(), Self::try_from(field.data_type())?));
+                    struct_fields.push((field.name().to_string(), Self::try_from(field.dtype())?));
                 }
 
                 Ok(ExSeriesDtype::Struct(struct_fields))
@@ -160,7 +159,7 @@ impl TryFrom<&ExSeriesDtype> for DataType {
             }
             ExSeriesDtype::Datetime(ex_timeunit, tz_option) => Ok(DataType::Datetime(
                 ex_timeunit.try_into()?,
-                Some(tz_option.clone()),
+                Some(tz_option.into()),
             )),
             ExSeriesDtype::Duration(ex_timeunit) => Ok(DataType::Duration(ex_timeunit.try_into()?)),
             ExSeriesDtype::List(inner) => {
@@ -169,7 +168,7 @@ impl TryFrom<&ExSeriesDtype> for DataType {
             ExSeriesDtype::Struct(fields) => Ok(DataType::Struct(
                 fields
                     .iter()
-                    .map(|(k, v)| Ok(Field::new(k.as_str(), v.try_into()?)))
+                    .map(|(k, v)| Ok(Field::new(k.into(), v.try_into()?)))
                     .collect::<Result<Vec<Field>, Self::Error>>()?,
             )),
         }

diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs
@@ -980,7 +980,7 @@ pub fn expr_atan(expr: ExExpr) -> ExExpr {
 #[rustler::nif]
 pub fn expr_strptime(expr: ExExpr, format_string: &str) -> ExExpr {
     let options = StrptimeOptions {
-        format: Some(format_string.to_string()),
+        format: Some(format_string.into()),
         strict: false,
         exact: true,
         cache: true,

diff --git a/native/explorer/src/lazyframe.rs b/native/explorer/src/lazyframe.rs
@@ -65,7 +65,7 @@ pub fn lf_tail(
 pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
     let mut lf = data.clone_inner();
     let names = lf
-        .schema()?
+        .collect_schema()?
         .iter_names()
         .map(|smart_string| smart_string.to_string())
         .collect();
@@ -76,9 +76,9 @@ pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
 #[rustler::nif]
 pub fn lf_dtypes(data: ExLazyFrame) -> Result<Vec<ExSeriesDtype>, ExplorerError> {
     let mut dtypes: Vec<ExSeriesDtype> = vec![];
-    let schema = data.clone_inner().schema()?;
+    let schema = data.clone_inner().collect_schema()?;
 
-    for dtype in schema.iter_dtypes() {
+    for (_name, dtype) in schema.iter_names_and_dtypes() {
         dtypes.push(ExSeriesDtype::try_from(dtype)?)
     }
 
@@ -108,7 +108,7 @@ pub fn lf_slice(
     let result_lf = if groups.is_empty() {
         lf.slice(offset, length)
     } else {
-        let groups_exprs: Vec<Expr> = groups.iter().map(|group| col(group)).collect();
+        let groups_exprs: Vec<Expr> = groups.iter().map(col).collect();
         lf.group_by_stable(groups_exprs)
             .agg([col("*").slice(offset, length)])
             .explode([col("*").exclude(groups)])
@@ -181,6 +181,7 @@ pub fn lf_distinct(
     columns_to_keep: Option<Vec<ExExpr>>,
 ) -> Result<ExLazyFrame, ExplorerError> {
     let df = data.clone_inner();
+    let subset = subset.iter().map(|x| x.into()).collect::<Vec<PlSmallStr>>();
     let new_df = df.unique_stable(Some(subset), UniqueKeepStrategy::First);
 
     match columns_to_keep {
@@ -219,11 +220,9 @@ pub fn lf_summarise_with(
         // We do add a "shadow" column to be able to group by it.
         // This is going to force some aggregations like "mode" to be always inside
         // a "list".
-        let s = Series::new_null("__explorer_null_for_group__", 1);
-        ldf.with_column(s.lit())
-            .group_by_stable(["__explorer_null_for_group__"])
+        ldf.group_by_stable([1.lit().alias("__explorer_literal_for_group__")])
             .agg(aggs)
-            .select(&[col("*").exclude(["__explorer_null_for_group__"])])
+            .select(&[col("*").exclude(["__explorer_literal_for_group__"])])
     } else {
         ldf.group_by_stable(groups).agg(aggs)
     };
@@ -344,7 +343,7 @@ pub fn lf_concat_columns(ldfs: Vec<ExLazyFrame>) -> Result<ExLazyFrame, Explorer
         .map(|(idx, ex_ldf)| {
             let mut ldf = ex_ldf.clone_inner();
             let names: Vec<String> = ldf
-                .schema()
+                .collect_schema()
                 .expect("should be able to get schema")
                 .iter_names()
                 .map(|smart_string| smart_string.to_string())

diff --git a/native/explorer/src/lazyframe/io.rs b/native/explorer/src/lazyframe/io.rs
@@ -19,7 +19,7 @@ pub fn lf_from_parquet(
     };
 
     let cols: Vec<Expr> = if let Some(cols) = columns {
-        cols.iter().map(|column| col(column)).collect()
+        cols.iter().map(col).collect()
     } else {
         vec![all()]
     };
@@ -43,7 +43,7 @@ pub fn lf_from_parquet_cloud(
         ..Default::default()
     };
     let cols: Vec<Expr> = if let Some(cols) = columns {
-        cols.iter().map(|column| col(column)).collect()
+        cols.iter().map(col).collect()
     } else {
         vec![all()]
     };
@@ -247,7 +247,9 @@ pub fn lf_from_csv(
         .with_rechunk(do_rechunk)
         .with_encoding(encoding)
         .with_dtype_overwrite(schema_from_dtypes_pairs(dtypes)?)
-        .with_null_values(Some(NullValues::AllColumns(null_vals)))
+        .with_null_values(Some(NullValues::AllColumns(
+            null_vals.iter().map(|x| x.into()).collect(),
+        )))
         .with_eol_char(eol_delimiter.unwrap_or(b'\n'))
         .finish()?;