Skip to content

Commit

Permalink
fix: UTF-8 validation of nested string slice in Parquet
Browse files Browse the repository at this point in the history
Fixes #21202.
  • Loading branch information
coastalwhite committed Feb 14, 2025
1 parent d8aad1c commit ca05c76
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ pub fn decode_plain_generic(
target.finish_in_progress();
unsafe { target.views_mut() }.reserve(num_rows);

let start_target_length = target.len();

let buffer_idx = target.completed_buffers().len() as u32;
let mut buffer = Vec::with_capacity(values.len() + 1);
let mut none_starting_with_continuation_byte = true; // Whether the transition from between strings is valid
Expand Down Expand Up @@ -346,7 +348,7 @@ pub fn decode_plain_generic(

// @NOTE: This is only valid because we initialize our inline Views to be zeroes on
// non-included bytes.
for view in &target.views()[target.len() - num_seen..] {
for view in &target.views()[start_target_length..] {
all_inlined_are_ascii &= (view.length > View::MAX_INLINE_SIZE)
| (view.as_u128() & 0x0000_0000_8080_8080_8080_8080_8080_8080 == 0);
}
Expand Down
20 changes: 20 additions & 0 deletions py-polars/tests/unit/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2849,3 +2849,23 @@ def test_equality_filter(
raise

pl.read_parquet(f)


def test_nested_string_slice_utf8_21202() -> None:
    # Regression test for issue 21202: write a list-of-strings column to
    # Parquet, then read back only the second row through a lazy slice and
    # compare it with the same slice taken from the in-memory series.
    long_row = ["A" * 128]
    short_row = ["A"]
    original = pl.Series("a", [long_row, short_row], pl.List(pl.String()))

    buffer = io.BytesIO()
    original.to_frame().write_parquet(buffer)
    buffer.seek(0)

    roundtripped = pl.scan_parquet(buffer).slice(1, 1).collect().to_series()
    expected = original.slice(1, 1)

    assert_series_equal(roundtripped, expected)

0 comments on commit ca05c76

Please sign in to comment.