diff --git a/jv_unicode.c b/jv_unicode.c
index c3f9f11137..fbf7454be3 100644
--- a/jv_unicode.c
+++ b/jv_unicode.c
@@ -59,11 +59,12 @@ int jvp_utf8_is_valid(const char* in, const char* end) {
   return 1;
 }
 
+/* Assumes startchar is the first byte of a valid character sequence */
 int jvp_utf8_decode_length(char startchar) {
-  if ((startchar & 0x80) == 0) return 1;
-  else if ((startchar & 0xC0) == 0xC0) return 2;
-  else if ((startchar & 0xE0) == 0xE0) return 3;
-  else return 4;
+  if ((startchar & 0x80) == 0) return 1;          // 0___ ____
+  else if ((startchar & 0xE0) == 0xC0) return 2;  // 110_ ____
+  else if ((startchar & 0xF0) == 0xE0) return 3;  // 1110 ____
+  else return 4;                                  // 1111 ____
 }
 
 int jvp_utf8_encode_length(int codepoint) {
diff --git a/tests/onig.test b/tests/onig.test
index ed6cd3b06d..9dd0b834ff 100644
--- a/tests/onig.test
+++ b/tests/onig.test
@@ -83,3 +83,8 @@ gsub("(?<d>\\d)"; ":\(.d);")
 gsub("(?<x>.)[^a]*"; "+\(.x)-")
 "Abcabc"
 "+A-+a-"
+
+# utf-8
+sub( "(?<x>.)"; "\(.x)!")
+"’"
+"’!"