From fa5af2377381145c8949c2de7c2bfceaf772d83e Mon Sep 17 00:00:00 2001 From: Milan Bouchet-Valat Date: Wed, 6 Jul 2016 02:27:08 +0200 Subject: [PATCH] Make endof() robust to invalid UTF-8 (#17276) When an invalid string contains only continuation bytes, endof() tried to index the underlying array at position 0. Instead of relying on bounds checking, explicitly check for > 0. Returning 0 when only continuation bytes were encountered is consistent with the definition of endof(), which gives the last valid index. This also allows removing the i == 0 check. The new code appears to be slightly faster than the old one. --- base/strings/string.jl | 3 +-- test/strings/basic.jl | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/base/strings/string.jl b/base/strings/string.jl index 57dabebe6925a..47eb243b21647 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -36,8 +36,7 @@ const utf8_trailing = [ function endof(s::String) d = s.data i = length(d) - i == 0 && return i - while is_valid_continuation(d[i]) + @inbounds while i > 0 && is_valid_continuation(d[i]) i -= 1 end i diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 940eae91afc7a..f8a63850a0253 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -477,3 +477,7 @@ foobaz(ch) = reinterpret(Char, typemax(UInt32)) @test typeof(ascii(GenericString("Hello, world"))) == String @test_throws ArgumentError ascii("Hello, ∀") @test_throws ArgumentError ascii(GenericString("Hello, ∀")) + +# issue #17271: endof() doesn't throw an error even with invalid strings +@test endof(String(b"\x90")) == 0 +@test endof(String(b"\xce")) == 1