From 11e3de39d9f3faa7bf119023be39c0afd580a9c4 Mon Sep 17 00:00:00 2001
From: Ulrik Sverdrup
Date: Wed, 6 Jan 2016 15:43:33 +0100
Subject: [PATCH 1/2] Add fast path for ASCII in UTF-8 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This speeds up the ascii case (and long stretches of ascii in otherwise
mixed UTF-8 data) when checking UTF-8 validity.

Benchmark results suggest that on purely ASCII input, we can improve
throughput (megabytes verified / second) by a factor of 13 to 14!
On xml and mostly english language input (en.wikipedia xml dump),
throughput increases by a factor of 7. On mostly non-ASCII input,
performance increases slightly or is the same.

The UTF-8 validation is rewritten to use indexed access; since every
access is preceded by a (mandatory for validation) length check, the
bounds checks are statically elided by llvm, and this formulation is in
fact the best for performance. A previous version had losses due to
slice-to-iterator conversions.

A large credit to Björn Steinbrink, who improved this patch immensely,
writing this second version.

Benchmark results on x86-64 (Sandy Bridge), compiled with -C opt-level=3.
Old code is `regular`, this PR is called `fast`. Datasets:

- `ascii` is just ascii (2.5 kB)
- `cyr` is cyrillic script with ascii spaces (5 kB)
- `dewik10` is 10 MB of a de.wikipedia xml dump
- `enwik8` is 100 MB of an en.wikipedia xml dump
- `jawik10` is 10 MB of a ja.wikipedia xml dump

```
test from_utf8_ascii_fast       ... bench:        140 ns/iter (+/- 4) = 18221 MB/s
test from_utf8_ascii_regular    ... bench:      1,932 ns/iter (+/- 19) = 1320 MB/s
test from_utf8_cyr_fast         ... bench:     10,025 ns/iter (+/- 245) = 511 MB/s
test from_utf8_cyr_regular      ... bench:     12,250 ns/iter (+/- 437) = 418 MB/s
test from_utf8_dewik10_fast     ... bench:  6,017,909 ns/iter (+/- 105,755) = 1740 MB/s
test from_utf8_dewik10_regular  ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s
test from_utf8_enwik8_fast      ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s
test from_utf8_enwik8_regular   ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s
test from_utf8_jawik10_fast     ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s
test from_utf8_jawik10_regular  ... bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s
```

Co-authored-by: Björn Steinbrink
---
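Note (illustration, not part of the commit): the core trick described
above is that a byte is non-ascii exactly when its high bit is set, so
a single mask test checks a whole word at a time. A minimal standalone
sketch, reusing the constant and helper this patch introduces (the
`main` harness is only for demonstration):

```
// 0x80 repeated in every byte position; the u64 constant truncates to
// 0x80808080 on 32-bit targets, which is the intended mask there.
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

/// Return `true` if any byte in the word `x` is nonascii (>= 128).
fn contains_nonascii(x: usize) -> bool {
    (x & NONASCII_MASK) != 0
}

fn main() {
    assert!(!contains_nonascii(0x2041_4243)); // bytes 0x20,0x41,0x42,0x43: all ascii
    assert!(contains_nonascii(0x20C2_4243)); // 0xC2 has its high bit set
}
```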
 src/libcollectionstest/str.rs | 12 +++++
 src/libcore/str/mod.rs        | 84 ++++++++++++++++++++++++-----------
 2 files changed, 69 insertions(+), 27 deletions(-)

diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs
index e22ff7ca54061..ab831de65167a 100644
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@@ -470,6 +470,18 @@ fn test_is_utf8() {
     assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
 }
 
+#[test]
+fn from_utf8_mostly_ascii() {
+    // deny invalid bytes embedded in long stretches of ascii
+    for i in 32..64 {
+        let mut data = [0; 128];
+        data[i] = 0xC0;
+        assert!(from_utf8(&data).is_err());
+        data[i] = 0xC2;
+        assert!(from_utf8(&data).is_err());
+    }
+}
+
 #[test]
 fn test_is_utf16() {
     use rustc_unicode::str::is_utf16;
diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs
index 40bd108a91860..64c21836b0003 100644
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -32,6 +32,7 @@ use option::Option::{self, None, Some};
 use raw::{Repr, Slice};
 use result::Result::{self, Ok, Err};
 use slice::{self, SliceExt};
+use usize;
 
 pub mod pattern;
 
@@ -240,7 +241,7 @@ impl Utf8Error {
 /// ```
 #[stable(feature = "rust1", since = "1.0.0")]
 pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
-    try!(run_utf8_validation_iterator(&mut v.iter()));
+    try!(run_utf8_validation(v));
     Ok(unsafe { from_utf8_unchecked(v) })
 }
 
@@ -1074,46 +1075,44 @@ unsafe fn cmp_slice(a: &str, b: &str, len: usize) -> i32 {
 }
 
 /*
-Section: Misc
+Section: UTF-8 validation
 */
 
+// use truncation to fit u64 into usize
+const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
+
+/// Return `true` if any byte in the word `x` is nonascii (>= 128).
+#[inline]
+fn contains_nonascii(x: usize) -> bool {
+    (x & NONASCII_MASK) != 0
+}
+
 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
 /// returning `true` in that case, or, if it is invalid, `false` with
 /// `iter` reset such that it is pointing at the first byte in the
 /// invalid sequence.
 #[inline(always)]
-fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
-                                -> Result<(), Utf8Error> {
-    let whole = iter.as_slice();
-    loop {
-        // save the current thing we're pointing at.
-        let old = iter.clone();
-
-        // restore the iterator we had at the start of this codepoint.
+fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
+    let mut offset = 0;
+    let len = v.len();
+    while offset < len {
+        let old_offset = offset;
         macro_rules! err { () => {{
-            *iter = old.clone();
             return Err(Utf8Error {
-                valid_up_to: whole.len() - iter.as_slice().len()
+                valid_up_to: old_offset
             })
         }}}
 
-        macro_rules! next { () => {
-            match iter.next() {
-                Some(a) => *a,
-                // we needed data, but there was none: error!
-                None => err!(),
+        macro_rules! next { () => {{
+            offset += 1;
+            // we needed data, but there was none: error!
+            if offset >= len {
+                err!()
             }
-        }}
-
-        let first = match iter.next() {
-            Some(&b) => b,
-            // we're at the end of the iterator and a codepoint
-            // boundary at the same time, so this string is valid.
-            None => return Ok(())
-        };
+            v[offset]
+        }}}
 
-        // ASCII characters are always valid, so only large
-        // bytes need more examination.
+        let first = v[offset];
         if first >= 128 {
             let w = UTF8_CHAR_WIDTH[first as usize];
             let second = next!();
@@ -1156,8 +1155,39 @@ fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
                 }
                 _ => err!()
             }
+            offset += 1;
+        } else {
+            // Ascii case, try to skip forward quickly.
+            let ptr = v.as_ptr();
+            let align = (ptr as usize + offset) & (usize::BYTES - 1);
+            if align == 0 {
+                // When the pointer is aligned, read 2 words of data per iteration
+                // until we find a word containing a non-ascii byte.
+                while offset <= len - 2 * usize::BYTES {
+                    unsafe {
+                        let u = *(ptr.offset(offset as isize) as *const usize);
+                        let v = *(ptr.offset((offset + usize::BYTES) as isize) as *const usize);
+
+                        // break if there is a nonascii byte
+                        let zu = contains_nonascii(u);
+                        let zv = contains_nonascii(v);
+                        if zu || zv {
+                            break;
+                        }
+                    }
+                    offset += usize::BYTES * 2;
+                }
+                // step from the point where the wordwise loop stopped
+                while offset < len && v[offset] < 128 {
+                    offset += 1;
+                }
+            } else {
+                offset += 1;
+            }
         }
     }
+
+    Ok(())
 }
 
 // https://tools.ietf.org/html/rfc3629
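Note (usage sketch, not part of the series): the new test exercises the
fast path through `std::str::from_utf8`, which forwards to this
validator. The observable contract is that an invalid byte embedded in
a long ascii stretch is still rejected, with `valid_up_to` pointing at
the end of the valid prefix, even though the surrounding ascii is
skipped word by word:

```
fn main() {
    let mut data = [b'a'; 128]; // long ascii run, taken by the fast path
    assert!(std::str::from_utf8(&data).is_ok());

    data[50] = 0xC0; // 0xC0 can never appear in well-formed UTF-8
    let err = std::str::from_utf8(&data).unwrap_err();
    assert_eq!(err.valid_up_to(), 50); // the ascii prefix before the bad byte
}
```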
From cadcd70775cf42b2add2526026a0a06c1ced411c Mon Sep 17 00:00:00 2001
From: Ulrik Sverdrup
Date: Tue, 12 Jan 2016 23:04:46 +0100
Subject: [PATCH 2/2] UTF-8 validation: Add missing if conditional for short input

We need to guard that `len` is large enough for the fast skip loop.
---
 src/libcore/str/mod.rs | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs
index 64c21836b0003..d85212d25e792 100644
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -1158,24 +1158,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
             offset += 1;
         } else {
             // Ascii case, try to skip forward quickly.
+            // When the pointer is aligned, read 2 words of data per iteration
+            // until we find a word containing a non-ascii byte.
+            const BYTES_PER_ITERATION: usize = 2 * usize::BYTES;
             let ptr = v.as_ptr();
             let align = (ptr as usize + offset) & (usize::BYTES - 1);
             if align == 0 {
-                // When the pointer is aligned, read 2 words of data per iteration
-                // until we find a word containing a non-ascii byte.
-                while offset <= len - 2 * usize::BYTES {
-                    unsafe {
-                        let u = *(ptr.offset(offset as isize) as *const usize);
-                        let v = *(ptr.offset((offset + usize::BYTES) as isize) as *const usize);
-
-                        // break if there is a nonascii byte
-                        let zu = contains_nonascii(u);
-                        let zv = contains_nonascii(v);
-                        if zu || zv {
-                            break;
+                if len >= BYTES_PER_ITERATION {
+                    while offset <= len - BYTES_PER_ITERATION {
+                        unsafe {
+                            let u = *(ptr.offset(offset as isize) as *const usize);
+                            let v = *(ptr.offset((offset + usize::BYTES) as isize) as *const usize);
+
+                            // break if there is a nonascii byte
+                            let zu = contains_nonascii(u);
+                            let zv = contains_nonascii(v);
+                            if zu || zv {
+                                break;
+                            }
                         }
+                        offset += BYTES_PER_ITERATION;
                     }
-                    offset += usize::BYTES * 2;
                 }
                 // step from the point where the wordwise loop stopped
                 while offset < len && v[offset] < 128 {
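Note (illustration, not part of the patch): the guard matters because
`offset <= len - BYTES_PER_ITERATION` is unsigned arithmetic. Without
`len >= BYTES_PER_ITERATION`, a short input makes the subtraction wrap
around, the loop bound becomes enormous, every offset passes the check,
and the word-sized reads would run past the end of the slice. A minimal
demonstration of the wraparound:

```
fn main() {
    let len: usize = 3; // e.g. a 3-byte input, shorter than one 2-word chunk
    let bytes_per_iteration = 2 * std::mem::size_of::<usize>(); // 16 on 64-bit
    // Plain `len - bytes_per_iteration` panics in debug builds; wrapping_sub
    // shows the bound an unchecked release build would compute.
    let bound = len.wrapping_sub(bytes_per_iteration);
    assert!(bound > len); // wrapped: every offset would satisfy `offset <= bound`
    println!("loop bound for len = {}: {}", len, bound);
}
```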