Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PCRE crashing on invalid UTF-8 #13240

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 55 additions & 8 deletions spec/std/regex_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ describe "Regex" do
{% end %}
end
end

# Compiling a pattern whose source is the lone byte "\x96" is expected to be
# rejected with an ArgumentError. The message regex accepts two wordings
# (presumably one per regex engine — confirm).
it "raises on invalid UTF-8" do
expect_raises(ArgumentError, /invalid UTF-8 string|UTF-8 error/) do
Regex.new("\x96")
end
# With the NO_UTF8_CHECK option the same source is expected to compile.
Regex.new("\x96", :NO_UTF8_CHECK).should be_a(Regex)
end
end

it "#options" do
Expand Down Expand Up @@ -94,6 +101,20 @@ describe "Regex" do
/foo/.match(".foo", options: Regex::Options::ANCHORED).should be_nil
/foo/.match("foo", options: Regex::Options::ANCHORED).should_not be_nil
end

# Matches against the subject "\xFF\xFE"; the expected outcome depends on
# which engine `Regex::Engine` resolves to at compile time.
it "with invalid UTF-8" do
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
# Regex::PCRE engine: the match is expected to raise.
expect_raises(ArgumentError, "UTF-8 error") do
/([\w_\.@#\/\*])+/.match("\xFF\xFE")
end
{% else %}
# Any other engine (Regex::PCRE2 is assumed here): the example is marked
# pending for libpcre2 older than 10.35, otherwise the match is expected
# to return nil instead of raising.
if Regex::PCRE2.version_number < {10, 35}
pending! "Error in libpcre2 < 10.35"
else
/([\w_\.@#\/\*])+/.match("\xFF\xFE").should be_nil
end
{% end %}
end
end

describe "#match_at_byte_index" do
Expand Down Expand Up @@ -126,9 +147,15 @@ describe "Regex" do
end

it "multibyte index" do
md = /foo/.match_at_byte_index("öfoo", 1).should_not be_nil
md.begin.should eq 1
md.byte_begin.should eq 2
if Regex::Engine.version_number < {10, 34}
expect_raises(ArgumentError, "bad offset into UTF string") do
/foo/.match_at_byte_index("öfoo", 1)
end
else
md = /foo/.match_at_byte_index("öfoo", 1).should_not be_nil
md.begin.should eq 1
md.byte_begin.should eq 2
end

md = /foo/.match_at_byte_index("öfoo", 2).should_not be_nil
md.begin.should eq 1
Expand Down Expand Up @@ -205,9 +232,17 @@ describe "Regex" do
end

it "invalid codepoint" do
/foo/.matches?("f\x96o").should be_false
/f\x96o/.matches?("f\x96o").should be_false
/f.o/.matches?("f\x96o").should be_true
if Regex::Engine.version_number < {10, 34}
expect_raises(ArgumentError, "UTF-8 error") do
/foo/.matches?("f\x96o")
end
else
/foo/.matches?("f\x96o").should be_false
/f\x96o/.matches?("f\x96o").should be_false
/f.o/.matches?("f\x96o").should be_false
/\bf\b/.matches?("f\x96o").should be_true
/\bo\b/.matches?("f\x96o").should be_true
end
end
end

Expand All @@ -223,7 +258,12 @@ describe "Regex" do
LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
pending! "PCRE JIT mode not available." unless 1 == jit_enabled

str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
# This match may raise on JIT stack limit or not. If it raises, the error message should be the expected one.
begin
str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
rescue exc : Exception
exc.to_s.should eq("Regex match error: JIT_STACKLIMIT")
end
{% else %}
# Can't use regex literal because the *LIMIT_DEPTH verb is not supported in libpcre (only libpcre2)
# and thus the compiler doesn't recognize it.
Expand All @@ -249,7 +289,14 @@ describe "Regex" do
end

it "multibyte index" do
/foo/.matches_at_byte_index?("öfoo", 1).should be_true
if Regex::Engine.version_number < {10, 34}
expect_raises(ArgumentError, "bad offset into UTF string") do
/foo/.matches_at_byte_index?("öfoo", 1)
end
else
/foo/.matches_at_byte_index?("öfoo", 1).should be_true
end
/foo/.matches_at_byte_index?("öfoo", 2).should be_true
end

pending "negative" do
Expand Down
3 changes: 3 additions & 0 deletions src/regex.cr
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ require "./regex/match_data"
class Regex
include Regex::Engine

# Exception raised by the regex engines when a match fails with an engine
# error code that is not reported as an `ArgumentError` (for example
# "Regex match error: JIT_STACKLIMIT").
class Error < Exception
end

# List of metacharacters that need to be escaped.
#
# See `Regex.needs_escape?` and `Regex.escape`.
Expand Down
43 changes: 43 additions & 0 deletions src/regex/lib_pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ lib LibPCRE
fun full_info = pcre_fullinfo(code : Pcre, extra : PcreExtra, what : Int, where : Int*) : Int
fun get_stringnumber = pcre_get_stringnumber(code : Pcre, string_name : UInt8*) : Int
fun get_stringtable_entries = pcre_get_stringtable_entries(code : Pcre, name : UInt8*, first : UInt8**, last : UInt8**) : Int
fun version = pcre_version : LibC::Char*

CONFIG_JIT = 9

Expand All @@ -102,4 +103,46 @@ lib LibPCRE
INFO_NAMETABLE = 9

$free = pcre_free : Void* ->

# Exec-time and get/set-time error codes.
#
# `Regex::PCRE#internal_matches?` builds a member of this enum from a negative
# return value of `LibPCRE.exec`. Several members share one numeric value
# (see the inline notes); they are aliases of the same code.
enum Error
NOMATCH = -1
NULL = -2
BADOPTION = -3
BADMAGIC = -4
UNKNOWN_OPCODE = -5
UNKNOWN_NODE = -5 # For backward compatibility
NOMEMORY = -6
NOSUBSTRING = -7
MATCHLIMIT = -8
CALLOUT = -9 # Never used by PCRE itself
BADUTF8 = -10 # Same for 8/16/32
BADUTF16 = -10 # Same for 8/16/32
BADUTF32 = -10 # Same for 8/16/32
BADUTF8_OFFSET = -11 # Same for 8/16
BADUTF16_OFFSET = -11 # Same for 8/16
PARTIAL = -12
BADPARTIAL = -13
INTERNAL = -14
BADCOUNT = -15
DFA_UITEM = -16
DFA_UCOND = -17
DFA_UMLIMIT = -18
DFA_WSSIZE = -19
DFA_RECURSE = -20
RECURSIONLIMIT = -21
NULLWSLIMIT = -22 # No longer actually used
BADNEWLINE = -23
BADOFFSET = -24
SHORTUTF8 = -25
SHORTUTF16 = -25 # Same for 8/16
RECURSELOOP = -26
JIT_STACKLIMIT = -27
BADMODE = -28
BADENDIANNESS = -29
DFA_BADRESTART = -30
JIT_BADOPTION = -31
BADLENGTH = -32
UNSET = -33
end
end
4 changes: 4 additions & 0 deletions src/regex/lib_pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ lib LibPCRE2
CONVERT_SYNTAX = -64
INTERNAL_DUPMATCH = -65
DFA_UINVALID_UTF = -66

# Returns `true` when this error code lies in the inclusive range from
# `UTF8_ERR21` to `UTF8_ERR1`, i.e. it is one of the UTF-8 validity errors.
def utf8_validity?
  (UTF8_ERR21..UTF8_ERR1).includes?(self)
end
end

INFO_ALLOPTIONS = 0
Expand Down
32 changes: 27 additions & 5 deletions src/regex/pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@ require "./lib_pcre"

# :nodoc:
module Regex::PCRE
# Returns the version string reported by the linked library through
# `LibPCRE.version` (bound to `pcre_version`).
def self.version : String
  raw = LibPCRE.version
  String.new(raw)
end

# The `{major, minor}` version of the linked libpcre, parsed from `version`.
#
# The major number is the text before the first `.`; the minor number is the
# text between that `.` and the following space.
#
# Raises `RuntimeError` if the version string has no `.` or no space after it.
class_getter version_number : {Int32, Int32} = begin
  version = self.version
  # This module binds the legacy libpcre, so the message names libpcre
  # (it previously said "libpcre2", copied from the PCRE2 engine).
  dot = version.index('.') || raise RuntimeError.new("Invalid libpcre version")
  space = version.index(' ', dot) || raise RuntimeError.new("Invalid libpcre version")
  {version.byte_slice(0, dot).to_i, version.byte_slice(dot + 1, space - dot - 1).to_i}
end

private def initialize(*, _source source, _options @options)
# PCRE's pattern must have their null characters escaped
source = source.gsub('\u{0}', "\\0")
@source = source

@re = LibPCRE.compile(@source, pcre_compile_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
@re = LibPCRE.compile(@source, pcre_compile_options(options) | LibPCRE::UTF8 | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
raise ArgumentError.new("#{String.new(errptr)} at #{erroffset}") if @re.null?
@extra = LibPCRE.study(@re, LibPCRE::STUDY_JIT_COMPILE, out studyerrptr)
if @extra.null? && studyerrptr
Expand Down Expand Up @@ -89,7 +100,7 @@ module Regex::PCRE
end

protected def self.error_impl(source)
re = LibPCRE.compile(source, LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES, out errptr, out erroffset, nil)
re = LibPCRE.compile(source, LibPCRE::UTF8 | LibPCRE::DUPNAMES, out errptr, out erroffset, nil)
if re
{% unless flag?(:interpreted) %}
LibPCRE.free.call re.as(Void*)
Expand Down Expand Up @@ -142,9 +153,20 @@ module Regex::PCRE

# Calls `pcre_exec` C function, and handles returning value.
private def internal_matches?(str, byte_index, options, ovector, ovector_size)
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_match_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size)
# TODO: when `ret < -1`, it means PCRE error. It should handle correctly.
ret >= 0
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_match_options(options), ovector, ovector_size)

return true if ret >= 0

case error = LibPCRE::Error.new(ret)
when .nomatch?
return false
when .badutf8_offset?
raise ArgumentError.new("Regex match error: bad offset into UTF string")
when .badutf8?
raise ArgumentError.new("Regex match error: UTF-8 error")
else
raise Regex::Error.new("Regex match error: #{error}")
end
end

module MatchData
Expand Down
42 changes: 34 additions & 8 deletions src/regex/pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,27 @@ module Regex::PCRE2
@re : LibPCRE2::Code*
@jit : Bool

# Returns the version string obtained from
# `LibPCRE2.config(LibPCRE2::CONFIG_VERSION, ...)`.
def self.version : String
  String.new(24) do |buffer|
    reported = LibPCRE2.config(LibPCRE2::CONFIG_VERSION, buffer)
    # One less than the reported size is used as both the byte size and the
    # character count (presumably the reported size includes a trailing
    # NUL — confirm).
    length = reported - 1
    {length, length}
  end
end

# The `{major, minor}` version of the linked libpcre2, parsed from `version`.
#
# The major number is the text before the first `.`; the minor number is the
# text between that `.` and the following space.
#
# Raises `RuntimeError` if the version string has no `.` or no space after it.
class_getter version_number : {Int32, Int32} = begin
  raw = self.version

  dot = raw.index('.')
  raise RuntimeError.new("Invalid libpcre2 version") unless dot

  space = raw.index(' ', dot)
  raise RuntimeError.new("Invalid libpcre2 version") unless space

  major = raw.byte_slice(0, dot).to_i
  minor = raw.byte_slice(dot + 1, space - dot - 1).to_i
  {major, minor}
end

# :nodoc:
def initialize(*, _source @source : String, _options @options)
@re = PCRE2.compile(source, pcre2_compile_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
options = pcre2_compile_options(options) | LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
if PCRE2.version_number >= {10, 34}
options |= LibPCRE2::MATCH_INVALID_UTF
end
@re = PCRE2.compile(source, options) do |error_message|
raise ArgumentError.new(error_message)
end

Expand All @@ -33,14 +51,18 @@ module Regex::PCRE2
if res = LibPCRE2.compile(source, source.bytesize, options, out errorcode, out erroroffset, nil)
res
else
message = String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
{bytesize, 0}
end
message = get_error_message(errorcode)
yield "#{message} at #{erroroffset}"
end
end

# Returns the message text that `LibPCRE2.get_error_message` writes for
# *errorcode*, using a 256-byte buffer.
protected def self.get_error_message(errorcode)
  String.new(256) do |message_buffer|
    written = LibPCRE2.get_error_message(errorcode, message_buffer, 256)
    # The second tuple element is the character count; 0 is passed because
    # it is not known here.
    {written, 0}
  end
end

private def pcre2_compile_options(options)
flag = 0
Regex::Options.each do |option|
Expand Down Expand Up @@ -108,7 +130,7 @@ module Regex::PCRE2
end

protected def self.error_impl(source)
code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
return error_message
end

Expand Down Expand Up @@ -221,14 +243,18 @@ module Regex::PCRE2

private def match_data(str, byte_index, options)
match_data = self.match_data
match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_match_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, PCRE2.match_context)
match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_match_options(options), match_data, PCRE2.match_context)

if match_count < 0
case error = LibPCRE2::Error.new(match_count)
when .nomatch?
return
when .badutfoffset?, .utf8_validity?
error_message = PCRE2.get_error_message(error)
raise ArgumentError.new("Regex match error: #{error_message}")
else
raise Exception.new("Regex match error: #{error}")
error_message = PCRE2.get_error_message(error)
raise Regex::Error.new("Regex match error: #{error_message}")
end
end

Expand Down