Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix PCRE crashing on invalid UTF-8 #13240

Merged
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 50 additions & 8 deletions spec/std/regex_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ describe "Regex" do
{% end %}
end
end

# Compiling a pattern that contains an invalid UTF-8 byte must raise
# `ArgumentError`. The expected-message regex accepts two wordings
# (presumably one per regex engine backend — confirm).
it "raises on invalid UTF-8" do
expect_raises(ArgumentError, /invalid UTF-8 string|UTF-8 error: illegal byte/) do
Regex.new("\xFF")
end
# With the `NO_UTF8_CHECK` option an invalid byte must still compile to a `Regex`.
Regex.new("\xFE", :NO_UTF8_CHECK).should be_a(Regex)
end
end

it "#options" do
Expand Down Expand Up @@ -94,6 +101,16 @@ describe "Regex" do
/foo/.match(".foo", options: Regex::Options::ANCHORED).should be_nil
/foo/.match("foo", options: Regex::Options::ANCHORED).should_not be_nil
end

# Matching against a subject string that is not valid UTF-8 is
# engine-dependent: the branch is selected at compile time from the
# resolved `Regex::Engine`.
it "with invalid UTF-8" do
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
# Under `Regex::PCRE` the match raises instead of returning a result.
expect_raises(ArgumentError, "UTF-8 error") do
/([\w_\.@#\/\*])+/.match("\xFF\xFE")
end
{% else %}
# Any other engine reports "no match" (`nil`) for the same input.
/([\w_\.@#\/\*])+/.match("\xFF\xFE").should be_nil
{% end %}
end
end

describe "#match_at_byte_index" do
Expand Down Expand Up @@ -126,9 +143,15 @@ describe "Regex" do
end

it "multibyte index" do
md = /foo/.match_at_byte_index("öfoo", 1).should_not be_nil
md.begin.should eq 1
md.byte_begin.should eq 2
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
expect_raises(ArgumentError, "bad offset into UTF string") do
/foo/.match_at_byte_index("öfoo", 1)
end
{% else %}
md = /foo/.match_at_byte_index("öfoo", 1).should_not be_nil
md.begin.should eq 1
md.byte_begin.should eq 2
{% end %}

md = /foo/.match_at_byte_index("öfoo", 2).should_not be_nil
md.begin.should eq 1
Expand Down Expand Up @@ -205,9 +228,17 @@ describe "Regex" do
end

it "invalid codepoint" do
/foo/.matches?("f\x96o").should be_false
/f\x96o/.matches?("f\x96o").should be_false
/f.o/.matches?("f\x96o").should be_true
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
expect_raises(ArgumentError, "UTF-8 error") do
/foo/.matches?("f\x96o")
end
{% else %}
/foo/.matches?("f\x96o").should be_false
/f\x96o/.matches?("f\x96o").should be_false
/f.o/.matches?("f\x96o").should be_false
/\bf\b/.matches?("f\x96o").should be_true
/\bo\b/.matches?("f\x96o").should be_true
{% end %}
end
end

Expand All @@ -223,7 +254,11 @@ describe "Regex" do
LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
pending! "PCRE JIT mode not available." unless 1 == jit_enabled

str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
begin
str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
rescue exc : Exception
exc.to_s.should eq("Regex match error: JIT_STACKLIMIT")
end
straight-shoota marked this conversation as resolved.
Show resolved Hide resolved
{% else %}
# Can't use regex literal because the *LIMIT_DEPTH verb is not supported in libpcre (only libpcre2)
# and thus the compiler doesn't recognize it.
Expand All @@ -249,7 +284,14 @@ describe "Regex" do
end

it "multibyte index" do
/foo/.matches_at_byte_index?("öfoo", 1).should be_true
{% if Regex::Engine.resolve.name == "Regex::PCRE" %}
expect_raises(ArgumentError, "bad offset into UTF string") do
/foo/.matches_at_byte_index?("öfoo", 1)
end
{% else %}
/foo/.matches_at_byte_index?("öfoo", 1).should be_true
{% end %}
/foo/.matches_at_byte_index?("öfoo", 2).should be_true
end

pending "negative" do
Expand Down
3 changes: 3 additions & 0 deletions src/regex.cr
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ require "./regex/match_data"
class Regex
include Regex::Engine

# Exception raised for regex engine failures during matching that are
# not attributable to the caller's input. Invalid UTF-8 subjects and bad
# byte offsets are reported as `ArgumentError` instead; every other
# engine error code is wrapped in this class with a
# "Regex match error: ..." message.
class Error < Exception
end

# List of metacharacters that need to be escaped.
#
# See `Regex.needs_escape?` and `Regex.escape`.
Expand Down
42 changes: 42 additions & 0 deletions src/regex/lib_pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,46 @@ lib LibPCRE
INFO_NAMETABLE = 9

$free = pcre_free : Void* ->

# Exec-time and get/set-time error codes.
#
# Negative return values of `LibPCRE.exec` are wrapped in this enum
# (`LibPCRE::Error.new(ret)`) so callers can branch on the generated
# predicates such as `nomatch?`, `badutf8?` and `badutf8_offset?`.
#
# Several members intentionally share a numeric value (e.g.
# `UNKNOWN_OPCODE`/`UNKNOWN_NODE`, the `BADUTF*` family); they are aliases
# for the same code.
#
# NOTE(review): names and values presumably mirror the `PCRE_ERROR_*`
# constants from `pcre.h` — verify against the header when updating.
enum Error
NOMATCH = -1
NULL = -2
BADOPTION = -3
BADMAGIC = -4
UNKNOWN_OPCODE = -5
UNKNOWN_NODE = -5 # For backward compatibility
NOMEMORY = -6
NOSUBSTRING = -7
MATCHLIMIT = -8
CALLOUT = -9 # Never used by PCRE itself
BADUTF8 = -10 # Same for 8/16/32
BADUTF16 = -10 # Same for 8/16/32
BADUTF32 = -10 # Same for 8/16/32
BADUTF8_OFFSET = -11 # Same for 8/16
BADUTF16_OFFSET = -11 # Same for 8/16
PARTIAL = -12
BADPARTIAL = -13
INTERNAL = -14
BADCOUNT = -15
DFA_UITEM = -16
DFA_UCOND = -17
DFA_UMLIMIT = -18
DFA_WSSIZE = -19
DFA_RECURSE = -20
RECURSIONLIMIT = -21
NULLWSLIMIT = -22 # No longer actually used
BADNEWLINE = -23
BADOFFSET = -24
SHORTUTF8 = -25
SHORTUTF16 = -25 # Same for 8/16
RECURSELOOP = -26
JIT_STACKLIMIT = -27
BADMODE = -28
BADENDIANNESS = -29
DFA_BADRESTART = -30
JIT_BADOPTION = -31
BADLENGTH = -32
UNSET = -33
end
end
4 changes: 4 additions & 0 deletions src/regex/lib_pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ lib LibPCRE2
CONVERT_SYNTAX = -64
INTERNAL_DUPMATCH = -65
DFA_UINVALID_UTF = -66

# Returns `true` if this error code lies within the inclusive range
# `UTF8_ERR21..UTF8_ERR1`.
#
# The matcher uses this to report such codes as `ArgumentError`
# rather than `Regex::Error`.
#
# NOTE(review): the range is written from `UTF8_ERR21` to `UTF8_ERR1`,
# which presumably reflects that the codes are negative and `UTF8_ERR21`
# is the numerically smallest — confirm against the member definitions.
def utf8_validity?
in?(UTF8_ERR21..UTF8_ERR1)
end
end

INFO_ALLOPTIONS = 0
Expand Down
21 changes: 16 additions & 5 deletions src/regex/pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module Regex::PCRE
source = source.gsub('\u{0}', "\\0")
@source = source

@re = LibPCRE.compile(@source, pcre_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
@re = LibPCRE.compile(@source, pcre_options(options) | LibPCRE::UTF8 | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
raise ArgumentError.new("#{String.new(errptr)} at #{erroffset}") if @re.null?
@extra = LibPCRE.study(@re, LibPCRE::STUDY_JIT_COMPILE, out studyerrptr)
if @extra.null? && studyerrptr
Expand Down Expand Up @@ -53,7 +53,7 @@ module Regex::PCRE
end

protected def self.error_impl(source)
re = LibPCRE.compile(source, LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES, out errptr, out erroffset, nil)
re = LibPCRE.compile(source, LibPCRE::UTF8 | LibPCRE::DUPNAMES, out errptr, out erroffset, nil)
if re
{% unless flag?(:interpreted) %}
LibPCRE.free.call re.as(Void*)
Expand Down Expand Up @@ -106,9 +106,20 @@ module Regex::PCRE

# Calls `pcre_exec` C function, and handles returning value.
private def internal_matches?(str, byte_index, options, ovector, ovector_size)
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size)
# TODO: when `ret < -1`, it means PCRE error. It should handle correctly.
ret >= 0
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_options(options), ovector, ovector_size)

return true if ret >= 0

case error = LibPCRE::Error.new(ret)
when .nomatch?
return false
when .badutf8_offset?
raise ArgumentError.new("Regex match error: bad offset into UTF string")
when .badutf8?
raise ArgumentError.new("Regex match error: UTF-8 error")
else
raise Regex::Error.new("Regex match error: #{error}")
end
end

module MatchData
Expand Down
24 changes: 16 additions & 8 deletions src/regex/pcre2.cr
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ module Regex::PCRE2

# :nodoc:
def initialize(*, _source @source : String, _options @options)
@re = PCRE2.compile(source, pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
@re = PCRE2.compile(source, pcre2_options(options) | LibPCRE2::UTF | LibPCRE2::MATCH_INVALID_UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
raise ArgumentError.new(error_message)
end

Expand All @@ -33,14 +33,18 @@ module Regex::PCRE2
if res = LibPCRE2.compile(source, source.bytesize, options, out errorcode, out erroroffset, nil)
res
else
message = String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
{bytesize, 0}
end
message = get_error_message(errorcode)
yield "#{message} at #{erroroffset}"
end
end

# Returns the message text for *errorcode* as a `String`.
#
# The text is written by `LibPCRE2.get_error_message` directly into the
# buffer of a new string with a fixed capacity of 256 bytes.
protected def self.get_error_message(errorcode)
String.new(256) do |buffer|
bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
# Block result is {bytes written, character count}; per the
# `String.new(capacity, &)` contract a count of 0 means "unknown",
# so the size is computed lazily.
{bytesize, 0}
end
end

private def pcre2_options(options)
flag = 0
Regex::Options.each do |option|
Expand Down Expand Up @@ -73,7 +77,7 @@ module Regex::PCRE2
end

protected def self.error_impl(source)
code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
return error_message
end

Expand Down Expand Up @@ -186,14 +190,18 @@ module Regex::PCRE2

private def match_data(str, byte_index, options)
match_data = self.match_data
match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, PCRE2.match_context)
match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_options(options), match_data, PCRE2.match_context)

if match_count < 0
case error = LibPCRE2::Error.new(match_count)
when .nomatch?
return
when .badutfoffset?, .utf8_validity?
error_message = PCRE2.get_error_message(error)
raise ArgumentError.new("Regex match error: #{error_message}")
else
raise Exception.new("Regex match error: #{error}")
error_message = PCRE2.get_error_message(error)
raise Regex::Error.new("Regex match error: #{error_message}")
end
end

Expand Down