crystal-lang · straight-shoota · Apr 3, 2023 · Mar 29, 2023 · Mar 29, 2023 · Mar 29, 2023
diff --git a/spec/std/regex_spec.cr b/spec/std/regex_spec.cr
@@ -25,6 +25,13 @@ describe "Regex" do
         {% end %}
       end
     end
+
+    it "raises on invalid UTF-8" do
+      expect_raises(ArgumentError, /invalid UTF-8 string|UTF-8 error/) do
+        Regex.new("\x96")
+      end
+      Regex.new("\x96", :NO_UTF8_CHECK).should be_a(Regex)
+    end
   end
 
   it "#options" do
@@ -94,6 +101,20 @@ describe "Regex" do
       /foo/.match(".foo", options: Regex::Options::ANCHORED).should be_nil
       /foo/.match("foo", options: Regex::Options::ANCHORED).should_not be_nil
     end
+
+    it "with invalid UTF-8" do
+      {% if Regex::Engine.resolve.name == "Regex::PCRE" %}
+        expect_raises(ArgumentError, "UTF-8 error") do
+          /([\w_\.@#\/\*])+/.match("\xFF\xFE")
+        end
+      {% else %}
+        if Regex::PCRE2.version_number < {10, 35}
+          pending! "Error in libpcre2 < 10.35"
+        else
+          /([\w_\.@#\/\*])+/.match("\xFF\xFE").should be_nil
+        end
+      {% end %}
+    end
   end
 
   describe "#match_at_byte_index" do
@@ -126,9 +147,15 @@ describe "Regex" do
     end
 
     it "multibyte index" do
-      md = /foo/.match_at_byte_index("öfoo", 1).should_not be_nil
-      md.begin.should eq 1
-      md.byte_begin.should eq 2
+      if Regex::Engine.version_number < {10, 34}
+        expect_raises(ArgumentError, "bad offset into UTF string") do
+          /foo/.match_at_byte_index("öfoo", 1)
+        end
+      else
+        md = /foo/.match_at_byte_index("öfoo", 1).should_not be_nil
+        md.begin.should eq 1
+        md.byte_begin.should eq 2
+      end
 
       md = /foo/.match_at_byte_index("öfoo", 2).should_not be_nil
       md.begin.should eq 1
@@ -205,9 +232,17 @@ describe "Regex" do
       end
 
       it "invalid codepoint" do
-        /foo/.matches?("f\x96o").should be_false
-        /f\x96o/.matches?("f\x96o").should be_false
-        /f.o/.matches?("f\x96o").should be_true
+        if Regex::Engine.version_number < {10, 34}
+          expect_raises(ArgumentError, "UTF-8 error") do
+            /foo/.matches?("f\x96o")
+          end
+        else
+          /foo/.matches?("f\x96o").should be_false
+          /f\x96o/.matches?("f\x96o").should be_false
+          /f.o/.matches?("f\x96o").should be_false
+          /\bf\b/.matches?("f\x96o").should be_true
+          /\bo\b/.matches?("f\x96o").should be_true
+        end
       end
     end
 
@@ -223,7 +258,12 @@ describe "Regex" do
         LibPCRE.config LibPCRE::CONFIG_JIT, out jit_enabled
         pending! "PCRE JIT mode not available." unless 1 == jit_enabled
 
-        str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
+        # This match may or may not hit the JIT stack limit. If it raises, the error message must be the expected one.
+        begin
+          str.matches?(/^(?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)?$/)
+        rescue exc : Exception
+          exc.to_s.should eq("Regex match error: JIT_STACKLIMIT")
+        end
       {% else %}
         # Can't use regex literal because the *LIMIT_DEPTH verb is not supported in libpcre (only libpcre2)
         # and thus the compiler doesn't recognize it.
@@ -249,7 +289,14 @@ describe "Regex" do
     end
 
     it "multibyte index" do
-      /foo/.matches_at_byte_index?("öfoo", 1).should be_true
+      if Regex::Engine.version_number < {10, 34}
+        expect_raises(ArgumentError, "bad offset into UTF string") do
+          /foo/.matches_at_byte_index?("öfoo", 1)
+        end
+      else
+        /foo/.matches_at_byte_index?("öfoo", 1).should be_true
+      end
+      /foo/.matches_at_byte_index?("öfoo", 2).should be_true
     end
 
     pending "negative" do

diff --git a/src/regex.cr b/src/regex.cr
@@ -198,6 +198,9 @@ require "./regex/match_data"
 class Regex
   include Regex::Engine
 
+  class Error < Exception
+  end
+
   # List of metacharacters that need to be escaped.
   #
   # See `Regex.needs_escape?` and `Regex.escape`.

diff --git a/src/regex/lib_pcre.cr b/src/regex/lib_pcre.cr
@@ -91,6 +91,7 @@ lib LibPCRE
   fun full_info = pcre_fullinfo(code : Pcre, extra : PcreExtra, what : Int, where : Int*) : Int
   fun get_stringnumber = pcre_get_stringnumber(code : Pcre, string_name : UInt8*) : Int
   fun get_stringtable_entries = pcre_get_stringtable_entries(code : Pcre, name : UInt8*, first : UInt8**, last : UInt8**) : Int
+  fun version = pcre_version : LibC::Char*
 
   CONFIG_JIT = 9
 
@@ -102,4 +103,46 @@ lib LibPCRE
   INFO_NAMETABLE     = 9
 
   $free = pcre_free : Void* ->
+
+  # Exec-time and get/set-time error codes
+  enum Error
+    NOMATCH         =  -1
+    NULL            =  -2
+    BADOPTION       =  -3
+    BADMAGIC        =  -4
+    UNKNOWN_OPCODE  =  -5
+    UNKNOWN_NODE    =  -5 # For backward compatibility
+    NOMEMORY        =  -6
+    NOSUBSTRING     =  -7
+    MATCHLIMIT      =  -8
+    CALLOUT         =  -9 # Never used by PCRE itself
+    BADUTF8         = -10 # Same for 8/16/32
+    BADUTF16        = -10 # Same for 8/16/32
+    BADUTF32        = -10 # Same for 8/16/32
+    BADUTF8_OFFSET  = -11 # Same for 8/16
+    BADUTF16_OFFSET = -11 # Same for 8/16
+    PARTIAL         = -12
+    BADPARTIAL      = -13
+    INTERNAL        = -14
+    BADCOUNT        = -15
+    DFA_UITEM       = -16
+    DFA_UCOND       = -17
+    DFA_UMLIMIT     = -18
+    DFA_WSSIZE      = -19
+    DFA_RECURSE     = -20
+    RECURSIONLIMIT  = -21
+    NULLWSLIMIT     = -22 # No longer actually used
+    BADNEWLINE      = -23
+    BADOFFSET       = -24
+    SHORTUTF8       = -25
+    SHORTUTF16      = -25 # Same for 8/16
+    RECURSELOOP     = -26
+    JIT_STACKLIMIT  = -27
+    BADMODE         = -28
+    BADENDIANNESS   = -29
+    DFA_BADRESTART  = -30
+    JIT_BADOPTION   = -31
+    BADLENGTH       = -32
+    UNSET           = -33
+  end
 end
diff --git a/src/regex/lib_pcre2.cr b/src/regex/lib_pcre2.cr
@@ -182,6 +182,10 @@ lib LibPCRE2
     CONVERT_SYNTAX    = -64
     INTERNAL_DUPMATCH = -65
     DFA_UINVALID_UTF  = -66
+
+    def utf8_validity?
+      in?(UTF8_ERR21..UTF8_ERR1)
+    end
   end
 
   INFO_ALLOPTIONS     =  0

diff --git a/src/regex/pcre.cr b/src/regex/pcre.cr
@@ -2,12 +2,23 @@ require "./lib_pcre"
 
 # :nodoc:
 module Regex::PCRE
+  def self.version : String
+    String.new(LibPCRE.version)
+  end
+
+  class_getter version_number : {Int32, Int32} = begin # {major, minor} parsed from `version`
+    version = self.version # parsed as "<major>.<minor> <rest>"
+    dot = version.index('.') || raise RuntimeError.new("Invalid libpcre version")
+    space = version.index(' ', dot) || raise RuntimeError.new("Invalid libpcre version")
+    {version.byte_slice(0, dot).to_i, version.byte_slice(dot + 1, space - dot - 1).to_i} # minor is the text between '.' and the first following ' '
+  end
+
   private def initialize(*, _source source, _options @options)
     # PCRE's pattern must have their null characters escaped
     source = source.gsub('\u{0}', "\\0")
     @source = source
 
-    @re = LibPCRE.compile(@source, pcre_compile_options(options) | LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
+    @re = LibPCRE.compile(@source, pcre_compile_options(options) | LibPCRE::UTF8 | LibPCRE::DUPNAMES | LibPCRE::UCP, out errptr, out erroffset, nil)
     raise ArgumentError.new("#{String.new(errptr)} at #{erroffset}") if @re.null?
     @extra = LibPCRE.study(@re, LibPCRE::STUDY_JIT_COMPILE, out studyerrptr)
     if @extra.null? && studyerrptr
@@ -89,7 +100,7 @@ module Regex::PCRE
   end
 
   protected def self.error_impl(source)
-    re = LibPCRE.compile(source, LibPCRE::UTF8 | LibPCRE::NO_UTF8_CHECK | LibPCRE::DUPNAMES, out errptr, out erroffset, nil)
+    re = LibPCRE.compile(source, LibPCRE::UTF8 | LibPCRE::DUPNAMES, out errptr, out erroffset, nil)
     if re
       {% unless flag?(:interpreted) %}
         LibPCRE.free.call re.as(Void*)
@@ -142,9 +153,20 @@ module Regex::PCRE
 
   # Calls `pcre_exec` C function, and handles returning value.
   private def internal_matches?(str, byte_index, options, ovector, ovector_size)
-    ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_match_options(options) | LibPCRE::NO_UTF8_CHECK, ovector, ovector_size)
-    # TODO: when `ret < -1`, it means PCRE error. It should handle correctly.
-    ret >= 0
+    ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, pcre_match_options(options), ovector, ovector_size)
+
+    return true if ret >= 0
+
+    case error = LibPCRE::Error.new(ret)
+    when .nomatch?
+      return false
+    when .badutf8_offset?
+      raise ArgumentError.new("Regex match error: bad offset into UTF string")
+    when .badutf8?
+      raise ArgumentError.new("Regex match error: UTF-8 error")
+    else
+      raise Regex::Error.new("Regex match error: #{error}")
+    end
   end
 
   module MatchData

diff --git a/src/regex/pcre2.cr b/src/regex/pcre2.cr
@@ -6,9 +6,27 @@ module Regex::PCRE2
   @re : LibPCRE2::Code*
   @jit : Bool
 
+  def self.version : String
+    String.new(24) do |pointer| # buffer of capacity 24 handed to LibPCRE2.config to fill in
+      size = LibPCRE2.config(LibPCRE2::CONFIG_VERSION, pointer)
+      {size - 1, size - 1} # NOTE(review): presumably `size` counts a trailing NUL, hence - 1 — confirm against pcre2_config docs
+    end
+  end
+
+  class_getter version_number : {Int32, Int32} = begin # {major, minor} parsed from `version`
+    version = self.version # parsed as "<major>.<minor> <rest>"
+    dot = version.index('.') || raise RuntimeError.new("Invalid libpcre2 version")
+    space = version.index(' ', dot) || raise RuntimeError.new("Invalid libpcre2 version")
+    {version.byte_slice(0, dot).to_i, version.byte_slice(dot + 1, space - dot - 1).to_i} # minor is the text between '.' and the first following ' '
+  end
+
   # :nodoc:
   def initialize(*, _source @source : String, _options @options)
-    @re = PCRE2.compile(source, pcre2_compile_options(options) | LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
+    options = pcre2_compile_options(options) | LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP
+    if PCRE2.version_number >= {10, 34}
+      options |= LibPCRE2::MATCH_INVALID_UTF
+    end
+    @re = PCRE2.compile(source, options) do |error_message|
       raise ArgumentError.new(error_message)
     end
 
@@ -33,14 +51,18 @@ module Regex::PCRE2
     if res = LibPCRE2.compile(source, source.bytesize, options, out errorcode, out erroroffset, nil)
       res
     else
-      message = String.new(256) do |buffer|
-        bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
-        {bytesize, 0}
-      end
+      message = get_error_message(errorcode)
       yield "#{message} at #{erroroffset}"
     end
   end
 
+  protected def self.get_error_message(errorcode) # returns the library's message text for *errorcode* as a String
+    String.new(256) do |buffer| # 256 is passed both as the String capacity and as the buffer length below
+      bytesize = LibPCRE2.get_error_message(errorcode, buffer, 256)
+      {bytesize, 0} # NOTE(review): presumably 0 leaves the character count to be computed later — confirm against String.new docs
+    end
+  end
+
   private def pcre2_compile_options(options)
     flag = 0
     Regex::Options.each do |option|
@@ -108,7 +130,7 @@ module Regex::PCRE2
   end
 
   protected def self.error_impl(source)
-    code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::NO_UTF_CHECK | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
+    code = PCRE2.compile(source, LibPCRE2::UTF | LibPCRE2::DUPNAMES | LibPCRE2::UCP) do |error_message|
       return error_message
     end
 
@@ -221,14 +243,18 @@ module Regex::PCRE2
 
   private def match_data(str, byte_index, options)
     match_data = self.match_data
-    match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_match_options(options) | LibPCRE2::NO_UTF_CHECK, match_data, PCRE2.match_context)
+    match_count = LibPCRE2.match(@re, str, str.bytesize, byte_index, pcre2_match_options(options), match_data, PCRE2.match_context)
 
     if match_count < 0
       case error = LibPCRE2::Error.new(match_count)
       when .nomatch?
         return
+      when .badutfoffset?, .utf8_validity?
+        error_message = PCRE2.get_error_message(error)
+        raise ArgumentError.new("Regex match error: #{error_message}")
       else
-        raise Exception.new("Regex match error: #{error}")
+        error_message = PCRE2.get_error_message(error)
+        raise Regex::Error.new("Regex match error: #{error_message}")
       end
     end