diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index e4085472ea20..c14c0e1d34c4 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -69,6 +69,7 @@ paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } +simdutf8 = { version = "0.1.5"} [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 0e16642940d2..00627ad612ea 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ -683,9 +683,12 @@ impl ByteViewArrayDecoderDelta { /// Check that `val` is a valid UTF-8 sequence pub fn check_valid_utf8(val: &[u8]) -> Result<()> { - match std::str::from_utf8(val) { + match simdutf8::basic::from_utf8(val) { Ok(_) => Ok(()), - Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + Err(_) => { + let e = simdutf8::compat::from_utf8(val).unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } } } diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index ce9eb1142a5b..8dfb859612cb 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -117,9 +117,13 @@ impl OffsetBuffer { /// /// [`Self::try_push`] can perform this validation check on insertion pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> { - match std::str::from_utf8(&self.values.as_slice()[start_offset..]) { + match simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) { Ok(_) => Ok(()), - Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + Err(_) => { + let e = simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..]) + .unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } } }