Skip to content

Commit

Permalink
Update Polars to v0.43.1 (#985)
Browse files Browse the repository at this point in the history
The most relevant change is internal. Now most of the strings are represented by the PlSmallStr type.

Changes upstream:

- https://github.com/pola-rs/polars/releases/tag/rs-0.43.0 - Version 0.43.0
- https://github.com/pola-rs/polars/releases/tag/rs-0.43.1 - Version 0.43.1
  • Loading branch information
philss authored Sep 13, 2024
1 parent 0d27533 commit 56026d5
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 195 deletions.
226 changes: 119 additions & 107 deletions native/explorer/Cargo.lock

Large diffs are not rendered by default.

10 changes: 2 additions & 8 deletions native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ object_store = { version = "0.10", default-features = false, optional = true }
mimalloc = { version = "*", default-features = false }

[dependencies.polars]
version = "0.42"
version = "0.43"
default-features = false
features = [
"abs",
Expand Down Expand Up @@ -81,15 +81,9 @@ features = [
]

[dependencies.polars-ops]
version = "0.42"
version = "0.43"
features = ["abs", "ewma", "cum_agg", "cov"]

# This dep is only needed to activate "timezones" feature
# for the polars-json crate. We should remove when Polars fixes it.
[dependencies.polars-json]
version = "*"
features = ["timezones", "chrono-tz"]

[features]
default = ["ndjson", "cloud", "nif_version_2_15"]

Expand Down
20 changes: 12 additions & 8 deletions native/explorer/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ pub fn df_transpose(

#[rustler::nif]
pub fn df_names(df: ExDataFrame) -> Result<Vec<String>, ExplorerError> {
let names = to_string_names(df.get_column_names());
let names = df
.get_column_names()
.iter()
.map(|name| name.to_string())
.collect();
Ok(names)
}

Expand Down Expand Up @@ -73,10 +77,10 @@ pub fn df_concat_columns(dfs: Vec<ExDataFrame>) -> Result<ExDataFrame, ExplorerE
.iter()
.map(|col| {
let name = col.name();
if previous_names.contains(name) {
if previous_names.contains(&name.clone().to_string()) {
let new_name = format!("{name}_{idx}");
previous_names.insert(new_name.clone());
col.clone().rename(&new_name).to_owned()
col.clone().rename(new_name.into()).to_owned()
} else {
previous_names.insert(name.to_string());
col.clone().to_owned()
Expand Down Expand Up @@ -125,7 +129,7 @@ pub fn df_slice_by_indices(
indices: Vec<u32>,
groups: Vec<&str>,
) -> Result<ExDataFrame, ExplorerError> {
let idx = UInt32Chunked::from_vec("idx", indices);
let idx = UInt32Chunked::from_vec("idx".into(), indices);
let new_df = if groups.is_empty() {
df.take(&idx)?
} else {
Expand Down Expand Up @@ -167,7 +171,7 @@ pub fn df_sample_n(
seed: Option<u64>,
groups: Vec<String>,
) -> Result<ExDataFrame, ExplorerError> {
let n_s = Series::new("n", &[n]);
let n_s = Series::new("n".into(), &[n]);
let new_df = if groups.is_empty() {
df.sample_n(&n_s, replace, shuffle, seed)?
} else {
Expand All @@ -187,7 +191,7 @@ pub fn df_sample_frac(
seed: Option<u64>,
groups: Vec<String>,
) -> Result<ExDataFrame, ExplorerError> {
let frac_s = Series::new("frac", &[frac]);
let frac_s = Series::new("frac".into(), &[frac]);
let new_df = if groups.is_empty() {
df.sample_frac(&frac_s, replace, shuffle, seed)?
} else {
Expand Down Expand Up @@ -396,7 +400,7 @@ pub fn df_pivot_wider(
.collect();

for (id_name, new_name) in id_columns.iter().zip(&temp_id_names) {
df.rename(id_name, new_name)?;
df.rename(id_name, new_name.into())?;
}

let mut new_df = pivot_stable(
Expand Down Expand Up @@ -465,7 +469,7 @@ pub fn df_lazy(df: ExDataFrame) -> Result<ExLazyFrame, ExplorerError> {

#[rustler::nif(schedule = "DirtyCpu")]
pub fn df_re_dtype(pattern: &str) -> Result<ExSeriesDtype, ExplorerError> {
let s = Series::new("dummy", [""])
let s = Series::new("dummy".into(), [""])
.into_frame()
.lazy()
.with_column(col("dummy").str().extract_groups(pattern)?.alias("dummy"))
Expand Down
24 changes: 19 additions & 5 deletions native/explorer/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,22 @@ pub fn df_from_csv(
.with_skip_rows_after_header(skip_rows_after_header)
.with_projection(projection.map(Arc::new))
.with_rechunk(do_rechunk)
.with_columns(column_names.map(Arc::from))
.with_columns(column_names.map(|names| {
names
.iter()
.map(|name| PlSmallStr::from_string(name.clone()))
.collect()
}))
.with_parse_options(
CsvParseOptions::default()
.with_encoding(encoding)
.with_truncate_ragged_lines(true)
.with_try_parse_dates(parse_dates)
.with_separator(delimiter_as_byte)
.with_eol_char(eol_delimiter.unwrap_or(b'\n'))
.with_null_values(Some(NullValues::AllColumns(null_vals))),
.with_null_values(Some(NullValues::AllColumns(
null_vals.iter().map(|val| val.into()).collect(),
))),
)
.try_into_reader_with_file_path(Some(filename.into()))?
.finish();
Expand All @@ -79,7 +86,7 @@ pub fn schema_from_dtypes_pairs(
return Ok(None);
}

let mut schema = Schema::new();
let mut schema = Schema::with_capacity(dtypes.len());
for (name, ex_dtype) in dtypes {
let dtype = DataType::try_from(&ex_dtype)?;
schema.with_column(name.into(), dtype);
Expand Down Expand Up @@ -174,7 +181,12 @@ pub fn df_load_csv(
.with_has_header(has_header)
.with_infer_schema_length(infer_schema_length)
.with_n_rows(stop_after_n_rows)
.with_columns(column_names.map(Arc::from))
.with_columns(column_names.map(|names| {
names
.iter()
.map(|name| PlSmallStr::from_string(name.clone()))
.collect()
}))
.with_skip_rows(skip_rows)
.with_skip_rows_after_header(skip_rows_after_header)
.with_projection(projection.map(Arc::new))
Expand All @@ -183,7 +195,9 @@ pub fn df_load_csv(
CsvParseOptions::default()
.with_separator(delimiter_as_byte)
.with_encoding(encoding)
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_null_values(Some(NullValues::AllColumns(
null_vals.iter().map(|x| x.into()).collect(),
)))
.with_try_parse_dates(parse_dates)
.with_eol_char(eol_delimiter.unwrap_or(b'\n')),
)
Expand Down
2 changes: 1 addition & 1 deletion native/explorer/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ impl<'tz> Literal for ExDateTime<'tz> {
Expr::Literal(LiteralValue::DateTime(
ndt.and_utc().timestamp_micros(),
TimeUnit::Microseconds,
Some(time_zone),
Some(time_zone.into()),
))
}
}
Expand Down
7 changes: 3 additions & 4 deletions native/explorer/src/datatypes/ex_dtypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,7 @@ impl TryFrom<&DataType> for ExSeriesDtype {
let mut struct_fields = Vec::new();

for field in fields {
struct_fields
.push((field.name().to_string(), Self::try_from(field.data_type())?));
struct_fields.push((field.name().to_string(), Self::try_from(field.dtype())?));
}

Ok(ExSeriesDtype::Struct(struct_fields))
Expand Down Expand Up @@ -160,7 +159,7 @@ impl TryFrom<&ExSeriesDtype> for DataType {
}
ExSeriesDtype::Datetime(ex_timeunit, tz_option) => Ok(DataType::Datetime(
ex_timeunit.try_into()?,
Some(tz_option.clone()),
Some(tz_option.into()),
)),
ExSeriesDtype::Duration(ex_timeunit) => Ok(DataType::Duration(ex_timeunit.try_into()?)),
ExSeriesDtype::List(inner) => {
Expand All @@ -169,7 +168,7 @@ impl TryFrom<&ExSeriesDtype> for DataType {
ExSeriesDtype::Struct(fields) => Ok(DataType::Struct(
fields
.iter()
.map(|(k, v)| Ok(Field::new(k.as_str(), v.try_into()?)))
.map(|(k, v)| Ok(Field::new(k.into(), v.try_into()?)))
.collect::<Result<Vec<Field>, Self::Error>>()?,
)),
}
Expand Down
2 changes: 1 addition & 1 deletion native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,7 @@ pub fn expr_atan(expr: ExExpr) -> ExExpr {
#[rustler::nif]
pub fn expr_strptime(expr: ExExpr, format_string: &str) -> ExExpr {
let options = StrptimeOptions {
format: Some(format_string.to_string()),
format: Some(format_string.into()),
strict: false,
exact: true,
cache: true,
Expand Down
17 changes: 8 additions & 9 deletions native/explorer/src/lazyframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ pub fn lf_tail(
pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
let mut lf = data.clone_inner();
let names = lf
.schema()?
.collect_schema()?
.iter_names()
.map(|smart_string| smart_string.to_string())
.collect();
Expand All @@ -76,9 +76,9 @@ pub fn lf_names(data: ExLazyFrame) -> Result<Vec<String>, ExplorerError> {
#[rustler::nif]
pub fn lf_dtypes(data: ExLazyFrame) -> Result<Vec<ExSeriesDtype>, ExplorerError> {
let mut dtypes: Vec<ExSeriesDtype> = vec![];
let schema = data.clone_inner().schema()?;
let schema = data.clone_inner().collect_schema()?;

for dtype in schema.iter_dtypes() {
for (_name, dtype) in schema.iter_names_and_dtypes() {
dtypes.push(ExSeriesDtype::try_from(dtype)?)
}

Expand Down Expand Up @@ -108,7 +108,7 @@ pub fn lf_slice(
let result_lf = if groups.is_empty() {
lf.slice(offset, length)
} else {
let groups_exprs: Vec<Expr> = groups.iter().map(|group| col(group)).collect();
let groups_exprs: Vec<Expr> = groups.iter().map(col).collect();
lf.group_by_stable(groups_exprs)
.agg([col("*").slice(offset, length)])
.explode([col("*").exclude(groups)])
Expand Down Expand Up @@ -181,6 +181,7 @@ pub fn lf_distinct(
columns_to_keep: Option<Vec<ExExpr>>,
) -> Result<ExLazyFrame, ExplorerError> {
let df = data.clone_inner();
let subset = subset.iter().map(|x| x.into()).collect::<Vec<PlSmallStr>>();
let new_df = df.unique_stable(Some(subset), UniqueKeepStrategy::First);

match columns_to_keep {
Expand Down Expand Up @@ -219,11 +220,9 @@ pub fn lf_summarise_with(
// We do add a "shadow" column to be able to group by it.
// This is going to force some aggregations like "mode" to be always inside
// a "list".
let s = Series::new_null("__explorer_null_for_group__", 1);
ldf.with_column(s.lit())
.group_by_stable(["__explorer_null_for_group__"])
ldf.group_by_stable([1.lit().alias("__explorer_literal_for_group__")])
.agg(aggs)
.select(&[col("*").exclude(["__explorer_null_for_group__"])])
.select(&[col("*").exclude(["__explorer_literal_for_group__"])])
} else {
ldf.group_by_stable(groups).agg(aggs)
};
Expand Down Expand Up @@ -344,7 +343,7 @@ pub fn lf_concat_columns(ldfs: Vec<ExLazyFrame>) -> Result<ExLazyFrame, Explorer
.map(|(idx, ex_ldf)| {
let mut ldf = ex_ldf.clone_inner();
let names: Vec<String> = ldf
.schema()
.collect_schema()
.expect("should be able to get schema")
.iter_names()
.map(|smart_string| smart_string.to_string())
Expand Down
8 changes: 5 additions & 3 deletions native/explorer/src/lazyframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub fn lf_from_parquet(
};

let cols: Vec<Expr> = if let Some(cols) = columns {
cols.iter().map(|column| col(column)).collect()
cols.iter().map(col).collect()
} else {
vec![all()]
};
Expand All @@ -43,7 +43,7 @@ pub fn lf_from_parquet_cloud(
..Default::default()
};
let cols: Vec<Expr> = if let Some(cols) = columns {
cols.iter().map(|column| col(column)).collect()
cols.iter().map(col).collect()
} else {
vec![all()]
};
Expand Down Expand Up @@ -247,7 +247,9 @@ pub fn lf_from_csv(
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(schema_from_dtypes_pairs(dtypes)?)
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_null_values(Some(NullValues::AllColumns(
null_vals.iter().map(|x| x.into()).collect(),
)))
.with_eol_char(eol_delimiter.unwrap_or(b'\n'))
.finish()?;

Expand Down
Loading

0 comments on commit 56026d5

Please sign in to comment.