From 90d77bd600e5db7430b32cea5405d98203cc00d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 14 Jan 2025 15:29:10 +0100 Subject: [PATCH] Faster parquet utf8 validation using `simdutf8` (#6668) * Faster utf8 validation * Move dependency --------- Co-authored-by: Andrew Lamb --- parquet/Cargo.toml | 1 + parquet/src/arrow/array_reader/byte_view_array.rs | 7 +++++-- parquet/src/arrow/buffer/offset_buffer.rs | 8 ++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index e4085472ea20..c14c0e1d34c4 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -69,6 +69,7 @@ paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } +simdutf8 = { version = "0.1.5"} [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 0e16642940d2..00627ad612ea 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -683,9 +683,12 @@ impl ByteViewArrayDecoderDelta { /// Check that `val` is a valid UTF-8 sequence pub fn check_valid_utf8(val: &[u8]) -> Result<()> { - match std::str::from_utf8(val) { + match simdutf8::basic::from_utf8(val) { Ok(_) => Ok(()), - Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + Err(_) => { + let e = simdutf8::compat::from_utf8(val).unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } } } diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index ce9eb1142a5b..8dfb859612cb 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ 
b/parquet/src/arrow/buffer/offset_buffer.rs @@ -117,9 +117,13 @@ impl OffsetBuffer { /// /// [`Self::try_push`] can perform this validation check on insertion pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> { - match std::str::from_utf8(&self.values.as_slice()[start_offset..]) { + match simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) { Ok(_) => Ok(()), - Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + Err(_) => { + let e = simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..]) + .unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } } }