Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix an issue where Regexp.union was improperly negotiating the result encoding #3296

Merged
merged 3 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Bug fixes:

* Fix `rb_enc_left_char_head()` so it is not always `ArgumentError` (#3267, @eregon).
* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).

Compatibility:

Expand Down
51 changes: 37 additions & 14 deletions spec/ruby/core/regexp/union_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,27 @@
Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
end

it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
us_ascii_implicit, us_ascii_explicit, binary = /abc/, /[\x00-\x7f]/n, /[\x80-\xBF]/n
us_ascii_implicit.encoding.should == Encoding::US_ASCII
us_ascii_explicit.encoding.should == Encoding::US_ASCII
binary.encoding.should == Encoding::BINARY

Regexp.union(us_ascii_implicit, us_ascii_explicit, binary).encoding.should == Encoding::BINARY
Regexp.union(us_ascii_implicit, binary, us_ascii_explicit).encoding.should == Encoding::BINARY
Regexp.union(us_ascii_explicit, us_ascii_implicit, binary).encoding.should == Encoding::BINARY
Regexp.union(us_ascii_explicit, binary, us_ascii_implicit).encoding.should == Encoding::BINARY
Regexp.union(binary, us_ascii_implicit, us_ascii_explicit).encoding.should == Encoding::BINARY
Regexp.union(binary, us_ascii_explicit, us_ascii_implicit).encoding.should == Encoding::BINARY
end

it "return US-ASCII if all patterns are ASCII-only" do
Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
end

it "returns a Regexp with UTF-8 if one part is UTF-8" do
Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
end
Expand All @@ -54,83 +75,83 @@
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do
-> {
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
end

it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")),
Regexp.new("b".encode("UTF-16BE")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
end

it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII')
end

it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
"\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1')
end

it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do
-> {
Regexp.union("\u00A9".encode("ISO-8859-1"),
Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do
-> {
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do
-> {
Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "uses to_str to convert arguments (if not Regexp)" do
Expand All @@ -154,6 +175,8 @@
not_supported_on :opal do
Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/
end
->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError)
-> {
Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])
}.should raise_error(TypeError, 'no implicit conversion of Array into String')
end
end
65 changes: 34 additions & 31 deletions src/main/ruby/truffleruby/core/regexp.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,27 @@ def self.try_convert(obj)
Truffle::Type.try_convert obj, Regexp, :to_regexp
end

def self.convert(pattern)
return pattern if Primitive.is_a?(pattern, Regexp)
if Primitive.is_a?(pattern, Array)
union(*pattern)
else
Regexp.quote(pattern.to_s)
end
end
def self.negotiate_union_encoding(*patterns)
compatible_enc = nil

patterns.each do |pattern|
converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)

enc = converted.encoding

if Primitive.nil?(compatible_enc)
compatible_enc = enc
else
if test = Primitive.encoding_compatible?(enc, compatible_enc)
compatible_enc = test
else
raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}"
end

def self.compatible?(*patterns)
encodings = patterns.map { |r| convert(r).encoding }
last_enc = encodings.pop
encodings.each do |encoding|
raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
last_enc = encoding
end
end

compatible_enc
end

def self.last_match(index = nil)
Expand All @@ -96,37 +101,35 @@ def self.last_match(index = nil)
def self.union(*patterns)
case patterns.size
when 0
return %r/(?!)/
%r/(?!)/
when 1
pattern = patterns.first
case pattern
when Array
return union(*pattern)
union(*pattern)
else
converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
if Primitive.nil? converted
return Regexp.new(Regexp.quote(pattern))
Regexp.new(Regexp.quote(pattern))
else
return converted
converted
end
end
else
compatible?(*patterns)
enc = convert(patterns.first).encoding
end
patterns = patterns.map do |pat|
if Primitive.is_a?(pat, Regexp)
pat
else
StringValue(pat)
end
end

sep = '|'.encode(enc)
str = ''.encode(enc)
enc = negotiate_union_encoding(*patterns)
sep = '|'.encode(enc)
str = ''.encode(enc)

patterns = patterns.map do |pat|
if Primitive.is_a?(pat, Regexp)
pat
else
StringValue(pat)
end
Truffle::RegexpOperations.union(str, sep, *patterns)
end

Truffle::RegexpOperations.union(str, sep, *patterns)
end
Truffle::Graal.always_split(method(:union))

Expand Down