diff --git a/.travis.yml b/.travis.yml index 94ebd55c4..5dc616276 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,19 +22,24 @@ rvm: - 2.2 - ruby-head - ree - - jruby-18mode - - jruby - - jruby-head - rbx matrix: include: - rvm: jruby - env: JRUBY_OPTS='--2.0' - allow_failures: + env: JRUBY_OPTS='--server -Xcompile.invokedynamic=false -Xcompat.version=2.0' - rvm: jruby-head + env: JRUBY_OPTS='--server -Xcompile.invokedynamic=false' + # These two are temporary until https://github.com/travis-ci/travis-ci/issues/3067 is solved. + - rvm: jruby-18mode + env: JRUBY_OPTS='--server -Xcompile.invokedynamic=false' + - rvm: jruby + env: JRUBY_OPTS='--server -Xcompile.invokedynamic=false' + allow_failures: - rvm: ruby-head - rvm: rbx # These two are temporary until https://github.com/travis-ci/travis-ci/issues/3067 is solved. - rvm: jruby-18mode + env: JRUBY_OPTS='--server -Xcompile.invokedynamic=false' - rvm: jruby + env: JRUBY_OPTS='--server -Xcompile.invokedynamic=false' fast_finish: true diff --git a/Gemfile b/Gemfile index 4cae93c45..2c9a5ef3e 100644 --- a/Gemfile +++ b/Gemfile @@ -14,7 +14,8 @@ branch = File.read(File.expand_path("../maintenance-branch", __FILE__)).chomp end ### dep for ci/coverage -gem 'simplecov', '~> 0.8' +gem 'simplecov', '~> 0.9' +gem 'simplecov-html', :github => 'colszowka/simplecov-html' gem 'rubocop', "~> 0.23.0", :platform => [:ruby_19, :ruby_20, :ruby_21] diff --git a/lib/rspec/support/differ.rb b/lib/rspec/support/differ.rb index d59890a15..56bbd0a35 100644 --- a/lib/rspec/support/differ.rb +++ b/lib/rspec/support/differ.rb @@ -7,8 +7,14 @@ module RSpec module Support # rubocop:disable ClassLength class Differ + if String.method_defined?(:encoding) + EMPTY_DIFF = EncodedString.new("", Encoding.default_external) + else + EMPTY_DIFF = EncodedString.new("") + end + def diff(actual, expected) - diff = "" + diff = EMPTY_DIFF.dup if actual && expected if all_strings?(actual, expected) @@ -25,12 +31,10 @@ def diff(actual, expected) # rubocop:disable MethodLength def diff_as_string(actual, expected) - @encoding = pick_encoding actual, expected - + @encoding = EncodedString.pick_encoding(actual, expected) @actual = EncodedString.new(actual, @encoding) @expected = EncodedString.new(expected, @encoding) - - output = EncodedString.new("\n", @encoding) + output = EncodedString.new("\n", @encoding) hunks.each_cons(2) do |prev_hunk, current_hunk| begin @@ -47,8 +51,6 @@ def diff_as_string(actual, expected) finalize_output(output, hunks.last.diff(format_type).to_s) if hunks.last color_diff output - rescue Encoding::CompatibilityError - handle_encoding_errors end # rubocop:enable MethodLength @@ -188,26 +190,6 @@ def object_to_string(object) PP.pp(object, "") end end - - if String.method_defined?(:encoding) - def pick_encoding(source_a, source_b) - Encoding.compatible?(source_a, source_b) || Encoding.default_external - end - else - def pick_encoding(_source_a, _source_b) - end - end - - def handle_encoding_errors - if @actual.source_encoding != @expected.source_encoding - "Could not produce a diff because the encoding of the actual string " \ - "(#{@actual.source_encoding}) differs from the encoding of the expected " \ - "string (#{@expected.source_encoding})" - else - "Could not produce a diff because of the encoding of the string " \ - "(#{@expected.source_encoding})" - end - end end # rubocop:enable ClassLength end diff --git a/lib/rspec/support/encoded_string.rb b/lib/rspec/support/encoded_string.rb index 4fa90fdd4..8976131e5 100644 --- a/lib/rspec/support/encoded_string.rb +++ b/lib/rspec/support/encoded_string.rb @@ -2,7 +2,19 @@ module RSpec module Support # @private class EncodedString - MRI_UNICODE_UNKOWN_CHARACTER = "\xEF\xBF\xBD" + if String.method_defined?(:encoding) + # see https://github.com/ruby/ruby/blob/ca24e581ba/encoding.c#L1191 + def self.pick_encoding(source_a, source_b) + Encoding.compatible?(source_a, source_b) || Encoding.default_external + end + else + def self.pick_encoding(_source_a, _source_b) + end + end + + # Ruby's default replacement string for is U+FFFD ("\xEF\xBF\xBD") for Unicode encoding forms + # else is '?' ("\x3F") + REPLACE = "\x3F" def initialize(string, encoding=nil) @encoding = encoding @@ -33,21 +45,52 @@ def to_s private + ENCODING_STRATEGY = { + :bad_bytes => { + :invalid => :replace, + # :undef => :nil, + :replace => REPLACE + }, + :cannot_convert => { + # :invalid => :nil, + :undef => :replace, + :replace => REPLACE + }, + :no_converter => { + :invalid => :replace, + # :undef => :nil, + :replace => REPLACE + } + } + + # Raised by Encoding and String methods: + # Encoding::UndefinedConversionError: + # when a transcoding operation fails + # e.g. "\x80".encode('utf-8','ASCII-8BIT') + # Encoding::InvalidByteSequenceError: + # when the string being transcoded contains a byte invalid for the either + # the source or target encoding + # e.g. "\x80".encode('utf-8','US-ASCII') + # Raised by transcoding methods: + # Encoding::ConverterNotFoundError: + # when a named encoding does not correspond with a known converter + # e.g. 'abc'.force_encoding('utf-8').encode('foo') + # Encoding::CompatibilityError + # def matching_encoding(string) - string.encode(@encoding) - rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError - normalize_missing(string.encode(@encoding, :invalid => :replace, :undef => :replace)) + encoding = EncodedString.pick_encoding(source_encoding, @encoding) + # Converting it to a higher character set (UTF-16) and then back (to UTF-8) + # ensures that we strip away invalid or undefined byte sequences + # => no need to rescue Encoding::InvalidByteSequenceError, ArgumentError + string.encode(::Encoding::UTF_16LE, ENCODING_STRATEGY[:bad_bytes]). + encode(encoding) + rescue Encoding::UndefinedConversionError, Encoding::CompatibilityError + string.encode(encoding, ENCODING_STRATEGY[:cannot_convert]) + # Begin: Needed for 1.9.2 rescue Encoding::ConverterNotFoundError - normalize_missing(string.force_encoding(@encoding).encode(:invalid => :replace)) - end - - def normalize_missing(string) - if @encoding.to_s == "UTF-8" - string.gsub(MRI_UNICODE_UNKOWN_CHARACTER.force_encoding(@encoding), "?") - else - string - end + string.force_encoding(encoding).encode(ENCODING_STRATEGY[:no_converter]) end + # End: Needed for 1.9.2 def detect_source_encoding(string) string.encoding diff --git a/lib/rspec/support/spec.rb b/lib/rspec/support/spec.rb index 3f90aa5fe..c5fde9780 100644 --- a/lib/rspec/support/spec.rb +++ b/lib/rspec/support/spec.rb @@ -1,5 +1,6 @@ require 'rspec/support' RSpec::Support.require_rspec_support "spec/deprecation_helpers" +RSpec::Support.require_rspec_support "spec/encoding_helpers" RSpec::Support.require_rspec_support "spec/with_isolated_stderr" RSpec::Support.require_rspec_support "spec/stderr_splitter" RSpec::Support.require_rspec_support "spec/formatting_support" @@ -12,6 +13,7 @@ c.include RSpecHelpers c.include RSpec::Support::WithIsolatedStdErr c.include RSpec::Support::FormattingSupport + c.include RSpec::Support::EncodingHelpers unless defined?(Debugger) # debugger causes warnings when used c.before do diff --git a/lib/rspec/support/spec/encoding_helpers.rb b/lib/rspec/support/spec/encoding_helpers.rb new file mode 100644 index 000000000..8bb70e79e --- /dev/null +++ b/lib/rspec/support/spec/encoding_helpers.rb @@ -0,0 +1,61 @@ +module RSpec + module Support + module EncodingHelpers + module_function + + # For undefined conversions, replace as "U+" + # e.g. '\xa0' becomes 'U+00A0' + # see https://github.com/ruby/ruby/blob/34fbf57aaa/test/ruby/test_transcode.rb#L2050 + def safe_chr + # rubocop:disable Style/RescueModifier + @safe_chr ||= Hash.new { |h, x| h[x] = x.chr rescue ("U+%.4X" % [x]) } + # rubocop:enable Style/RescueModifier + end + + if String.method_defined?(:encoding) + + def safe_codepoints(str) + str.each_codepoint.map { |codepoint| safe_chr[codepoint] } + rescue ArgumentError + str.each_byte.map { |byte| safe_chr[byte] } + end + + # rubocop:disable MethodLength + def expect_identical_string(str1, str2, expected_encoding=str1.encoding) + expect(str1.encoding).to eq(expected_encoding) + str1_bytes = safe_codepoints(str1) + str2_bytes = safe_codepoints(str2) + return unless str1_bytes != str2_bytes + str1_differences = [] + str2_differences = [] + # rubocop:disable Style/Next + str2_bytes.each_with_index do |str2_byte, index| + str1_byte = str1_bytes.fetch(index) do + str2_differences.concat str2_bytes[index..-1] + return + end + if str1_byte != str2_byte + str1_differences << str1_byte + str2_differences << str2_byte + end + end + # rubocop:enable Style/Next + expect(str1_differences.join).to eq(str2_differences.join) + end + # rubocop:enable Style/MethodLength + + else + + def safe_codepoints(str) + str.split(//) + end + + def expect_identical_string(str1, str2) + str1_bytes = safe_codepoints(str1) + str2_bytes = safe_codepoints(str2) + expect(str1_bytes).to eq(str2_bytes) + end + end + end + end +end diff --git a/lib/rspec/support/spec/in_sub_process.rb b/lib/rspec/support/spec/in_sub_process.rb index b641e3935..638797e6b 100644 --- a/lib/rspec/support/spec/in_sub_process.rb +++ b/lib/rspec/support/spec/in_sub_process.rb @@ -1,7 +1,7 @@ module RSpec module Support module InSubProcess - if Process.respond_to?(:fork) && !(RUBY_PLATFORM == 'java' && RUBY_VERSION == '1.8.7') + if Process.respond_to?(:fork) && !(Ruby.jruby? && RUBY_VERSION == '1.8.7') # Useful as a way to isolate a global change to a subprocess. # rubocop:disable MethodLength @@ -35,7 +35,7 @@ def in_sub_process(prevent_warnings=true) raise exception if exception end else - def in_sub_process + def in_sub_process(*) skip "This spec requires forking to work properly, " \ "and your platform does not support forking" end diff --git a/script/functions.sh b/script/functions.sh index 75fe0465d..83b4362d8 100644 --- a/script/functions.sh +++ b/script/functions.sh @@ -7,6 +7,10 @@ source $SCRIPT_DIR/predicate_functions.sh # idea taken from: http://blog.headius.com/2010/03/jruby-startup-time-tips.html export JRUBY_OPTS="${JRUBY_OPTS} -X-C" # disable JIT since these processes are so short lived +# Set the external encoding to UTF-8 in a 1.8.7-compatible way +export LANG=en_US.UTF-8 +export LC_ALL=en_US.UTF-8 + SPECS_HAVE_RUN_FILE=specs.out MAINTENANCE_BRANCH=`cat maintenance-branch` @@ -112,7 +116,7 @@ function check_documentation_coverage { } function check_style_and_lint { - echo "bin/rubucop lib" + echo "bin/rubocop lib" bin/rubocop lib } diff --git a/spec/rspec/support/differ_spec.rb b/spec/rspec/support/differ_spec.rb index abd927a3c..6a99e20c7 100644 --- a/spec/rspec/support/differ_spec.rb +++ b/spec/rspec/support/differ_spec.rb @@ -7,6 +7,7 @@ module RSpec module Support describe Differ do + describe '#diff' do let(:differ) { RSpec::Support::Differ.new } @@ -34,50 +35,87 @@ module Support EOD diff = differ.diff(actual, expected) - expect(diff).to eql(expected_diff) + expect_identical_string(diff, expected_diff) end if String.method_defined?(:encoding) it "returns an empty string if strings are not multiline" do - expected = "Tu avec carte {count} item has".encode('UTF-16LE') - actual = "Tu avec carté {count} itém has".encode('UTF-16LE') + expected = "It has trouble when {string} has an e".encode('UTF-16LE') + actual = "It has trouble when {string} has an é".encode('UTF-16LE') expect(differ.diff(actual, expected)).to be_empty end it 'copes with encoded strings' do - expected = "Tu avec carte {count} item has\n".encode('UTF-16LE') - actual = "Tu avec carté {count} itém has\n".encode('UTF-16LE') - expect(differ.diff(actual, expected)).to eql(<<-EOD.encode('UTF-16LE')) + expected = "It has trouble when {string} has an e".encode('UTF-16LE') + actual = "It has trouble when {string} has an é".encode('UTF-16LE') + + diff = differ.diff(actual, expected) + expected_diff = <<-EOD.encode('UTF-16LE') @@ -1,2 +1,2 @@ --Tu avec carte {count} item has -+Tu avec carté {count} itém has -EOD +-It has trouble when {string} has an e ++It has trouble when {string} has an é + EOD + expect_identical_string(diff, expected_diff) end it 'handles differently encoded strings that are compatible' do expected = "abc\n".encode('us-ascii') actual = "강인철\n".encode('UTF-8') - expect(differ.diff(actual, expected)).to eql "\n@@ -1,2 +1,2 @@\n-abc\n+강인철\n" + + diff = differ.diff(actual, expected) + expected_diff = "\n@@ -1,2 +1,2 @@\n-abc\n+강인철\n" + expect_identical_string(diff, expected_diff) + end + + it 'uses the default external encoding when the two strings have incompatible encodings' do + source_encoding = Encoding.find('iso-8859-1') + target_encoding = Encoding.find('euc-jp') + expected = "This is #{source_encoding.name}".force_encoding(source_encoding) + actual = "This is #{target_encoding.name}".force_encoding(target_encoding) + expected_diff = <<-EOD + +@@ -1,2 +1,2 @@ +-This is iso-8859-1 ++This is euc-jp + EOD + diff = differ.diff(actual, expected) + expect(Encoding.compatible?(actual.encoding, expected.encoding)).to be_nil + expect(diff.encoding).to eq(Encoding.default_external) + expect_identical_string(diff, expected_diff) end - it 'uses the default external encoding when the two strings have incompatible encodings', :failing_on_appveyor do - expected = "Tu avec carte {count} item has\n" - actual = "Tu avec carté {count} itém has\n".encode('UTF-16LE') - expect(differ.diff(actual, expected)).to eq("\n@@ -1,2 +1,2 @@\n-Tu avec carte {count} item has\n+Tu avec carté {count} itém has\n") - expect(differ.diff(actual, expected).encoding).to eq(Encoding.default_external) + it 'handles an Encoding::ConverterNotFoundError' do + expected = "Tu avec carte {count} item has\n".encode('UTF-16LE') + actual = "Tu avec carté {count} itém has\n".force_encoding('IBM737') + if RUBY_VERSION === '1.9.2' && Ruby.mri? + e = '\xC3\xA9\xC3\xA9' + else + e = "é".force_encoding('IBM737').encode('UTF-16LE').unpack('C*').pack('c*') + end + diff = differ.diff(actual, expected) + expected_diff = <<-EOD + +@@ -1,2 +1,2 @@ +-Tu avec carte {count} item has ++Tu avec cart#{e} {count} it#{e}m has + EOD + expect_identical_string(diff, expected_diff) end - it 'handles any encoding error that occurs with a helpful error message' do - expect(RSpec::Support::HunkGenerator).to receive(:new). - and_raise(Encoding::CompatibilityError) - expected = "Tu avec carte {count} item has\n".encode('us-ascii') - actual = "Tu avec carté {count} itém has\n" + it 'handles an Encoding::CompatibilityError' do + expected = "\xAETu avec carte {count} item has\n".force_encoding("ASCII-8BIT") + actual = "\xE2\x82\xACTu avec carte {count} item has\n".force_encoding('UTF-8') + diff = differ.diff(actual, expected) - expect(diff).to match(/Could not produce a diff/) - expect(diff).to match(/actual string \(UTF-8\)/) - expect(diff).to match(/expected string \(US-ASCII\)/) + expected_diff = <<-EOD + +@@ -1,2 +1,2 @@ +-?Tu avec carte {count} item has ++\xE2\x82\xACTu avec carte {count} item has + EOD + expect_identical_string(diff, expected_diff) end end @@ -111,7 +149,7 @@ def inspect EOD diff = differ.diff(expected,actual) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end it "outputs unified diff message of two arrays" do @@ -133,7 +171,7 @@ def inspect EOD diff = differ.diff(expected,actual) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end it 'outputs a unified diff message for an array which flatten recurses' do @@ -148,12 +186,13 @@ def inspect; ""; end diff = differ.diff [obj], [] end - expect(diff).to eq <<-EOD + expected_diff = <<-EOD @@ -1,2 +1,2 @@ -[] +[] -EOD + EOD + expect_identical_string(diff, expected_diff) end it "outputs unified diff message of two hashes" do @@ -172,29 +211,41 @@ def inspect; ""; end EOD diff = differ.diff(expected,actual) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end - it 'outputs unified diff message of two hashes with differing encoding', :failing_on_appveyor do + it 'outputs unified diff message of two hashes with differing encoding' do + replacement = if OS.windows? || RUBY_VERSION < '1.9.3' + '+"\303\266" => "\303\266"' + else + %{+"ö" => "ö"} + end expected_diff = %Q{ @@ -1,2 +1,2 @@ -"a" => "a", -#{ (RUBY_VERSION.to_f > 1.8) ? %Q{+"ö" => "ö"} : '+"\303\266" => "\303\266"' }, +#{ replacement }, } diff = differ.diff({'ö' => 'ö'}, {'a' => 'a'}) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end - it 'outputs unified diff message of two hashes with encoding different to key encoding', :failing_on_appveyor do + it 'outputs unified diff message of two hashes with encoding different to key encoding' do + actual = { "한글" => "한글2"} + expected = { :a => "a"} + replacement = if OS.windows? || RUBY_VERSION < '1.9.3' + '+"\355\225\234\352\270\200" => "\355\225\234\352\270\2002"' + else + %{+\"한글\" => \"한글2\"} + end expected_diff = %Q{ @@ -1,2 +1,2 @@ -:a => "a", -#{ (RUBY_VERSION.to_f > 1.8) ? %Q{+\"한글\" => \"한글2\"} : '+"\355\225\234\352\270\200" => "\355\225\234\352\270\2002"' }, +#{ replacement }, } - diff = differ.diff({ "한글" => "한글2"}, { :a => "a"}) - expect(diff).to eq expected_diff + diff = differ.diff(actual, expected) + expect_identical_string(diff, expected_diff) end it "outputs unified diff message of two hashes with object keys" do @@ -205,7 +256,7 @@ def inspect; ""; end } diff = differ.diff({ ['d','c'] => 'b'}, { ['a','c'] => 'b' }) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end it "outputs unified diff of multi line strings" do @@ -221,7 +272,7 @@ def inspect; ""; end EOD diff = differ.diff(expected,actual) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end it "splits items with newlines" do @@ -233,7 +284,7 @@ def inspect; ""; end EOD diff = differ.diff [], ["a\nb", "c\nd"] - expect(diff).to eql expected_diff + expect_identical_string(diff, expected_diff) end it "shows inner arrays on a single line" do @@ -245,7 +296,7 @@ def inspect; ""; end EOD diff = differ.diff [], ["a\nb", ["c\nd"]] - expect(diff).to eql expected_diff + expect_identical_string(diff, expected_diff) end it "returns an empty string if no expected or actual" do @@ -300,7 +351,7 @@ def inspect; ""; end EOS diff = differ.diff(expected, actual) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end end @@ -313,7 +364,7 @@ def inspect; ""; end expected_diff = "\e[0m\n\e[0m\e[34m@@ -1,2 +1,2 @@\n\e[0m\e[31m-foo bang baz\n\e[0m\e[32m+foo bar baz\n\e[0m" diff = differ.diff(expected,actual) - expect(diff).to eq expected_diff + expect_identical_string(diff, expected_diff) end end end diff --git a/spec/rspec/support/encoded_string_spec.rb b/spec/rspec/support/encoded_string_spec.rb index 4df40a408..0bfc8178e 100644 --- a/spec/rspec/support/encoded_string_spec.rb +++ b/spec/rspec/support/encoded_string_spec.rb @@ -1,51 +1,193 @@ +# encoding: utf-8 require 'spec_helper' require 'rspec/support/encoded_string' module RSpec::Support describe EncodedString do - let(:target_encoding) { 'UTF-8' } + let(:utf8_encoding) { 'UTF-8' } delegated_methods = String.instance_methods.map(&:to_s) & %w[eql? lines == encoding empty?] delegated_methods.each do |delegated_method| it "responds to #{delegated_method}" do - encoded_string = EncodedString.new("abc", target_encoding) + encoded_string = EncodedString.new("abc", utf8_encoding) expect(encoded_string).to respond_to(delegated_method) end end if String.method_defined?(:encoding) + + # see https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/encoding/find_spec.rb#L57 + describe 'Ensure tests are running with utf-8 encoding' do + + it 'default_internal' do + if Encoding.default_external == Encoding.find('locale') + expected_encoding = '' + else + expected_encoding = utf8_encoding + end + expect(Encoding.default_internal.to_s).to eq(expected_encoding) + end + + it 'default_external' do + expect(Encoding.default_external.to_s).to eq(utf8_encoding) + end + + it 'locale' do + skip "Not sure how to determine locale (#{Encoding.find('locale')})"\ + "from LC_ALL or on windows" + end + + it 'filesystem' do + encoding = Encoding.find('filesystem').to_s + if OS.windows? + skip "Not sure how to tell filesystem encoding is #{encoding}" + expect(encoding).to eq(utf8_encoding) + end + end + + it 'current script (file)' do + expect(__ENCODING__.to_s).to eq(utf8_encoding) + end + end + + describe '#pick_encoding' do + if String.method_defined?(:encoding) + it "picks a compatible encoding, falling back to default_external" do + str1 = "\xa1".force_encoding("iso-8859-1") + str2 = "\xa1\xa1".force_encoding("euc-jp") + expect(Encoding.compatible?(str1, str2)).to be_nil + + expect(EncodedString.pick_encoding(str1, str2)).to eq(Encoding.default_external) + end + end + end + describe '#source_encoding' do it 'knows the original encoding of the string' do str = EncodedString.new("abc".encode('ASCII-8BIT'), "UTF-8") - expect( str.source_encoding.to_s ).to eq('ASCII-8BIT') + expect(str.source_encoding.to_s).to eq('ASCII-8BIT') end end - let(:ascii_arrow_symbol) { "\xAE" } + describe '#to_s' do + context 'when encoding a string with invalid bytes in the target encoding' do + # see https://github.com/jruby/jruby/blob/c1be61a501/test/mri/ruby/test_transcode.rb#L13 + let(:source_encoding) { Encoding.find('US-ASCII') } + let(:target_encoding) { Encoding.find('UTF-8') } + let(:string) { "I have a bad byté\x80".force_encoding(source_encoding) } + + it 'replaces invalid byte sequences with the REPLACE string' do + resulting_string = build_encoded_string(string, target_encoding).to_s + expected_string = "I have a bad byt\x3F\x3F\x3F" + expect_identical_string(resulting_string, expected_string) + end + it 'normally raises an EncodedString::InvalidByteSequenceError' do + expect { + string.encode(target_encoding) + }.to raise_error(Encoding::InvalidByteSequenceError) + end + end + + context 'when no converter is known for an encoding' do + # see https://github.com/rubyspec/rubyspec/blob/91ce9f6549/core/string/shared/encode.rb#L12 + let(:source_encoding) { Encoding.find('ASCII-8BIT') } + let(:no_converter_encoding) { Encoding::Emacs_Mule } + let(:string) { "\x80".force_encoding(source_encoding) } + + it 'normally raises an Encoding::ConverterNotFoundError' do + expect { + string.encode(no_converter_encoding) + }.to raise_error(Encoding::ConverterNotFoundError) + end + + it 'forces the encoding to Encoding.default_external' do + resulting_string = build_encoded_string(string, no_converter_encoding).to_s + expected_string = "I am not going to changé\xEF".force_encoding(Encoding.default_external) + expect_identical_string(resulting_string, expected_string, Encoding.default_external) + end + end + + # see https://github.com/ruby/ruby/blob/34fbf57aaa/transcode.c#L4289 + # ISO-8859-1 -> UTF-8 -> EUC-JP + # "\xa0" NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP + context 'when there is an undefined conversion to the target encoding' do + let(:source_encoding) { Encoding.find('ISO-8859-1') } + let(:incompatible_encoding) { Encoding.find('EUC-JP') } + let(:string) { "\xa0 hi I am not going to work".force_encoding(source_encoding) } + + it 'normally raises an Encoding::UndefinedConversionError' do + expect { + string.encode(incompatible_encoding) + }.to raise_error(Encoding::UndefinedConversionError) + end + + it 'replaces all undefines conversions with the REPLACE string' do + resulting_string = build_encoded_string(string, incompatible_encoding).to_s + if OS.windows? + replacement = "\xFF" + else + replacement = "\xA0" + end + expected_string = "#{replacement} hi I am not going to work" + expect_identical_string(resulting_string, expected_string) + end + end + end + + let(:ascii_arrow_symbol) { "\xAE" } let(:utf_8_euro_symbol) { "\xE2\x82\xAC" } describe '#<<' do context 'with strings that can be converted to the target encoding' do + let(:valid_ascii_string) { "abcdé".force_encoding("ASCII-8BIT") } + let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') } it 'encodes and appends the string' do - valid_ascii_string = "abc".force_encoding("ASCII-8BIT") - valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8') - resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << valid_ascii_string - expect(resulting_string).to eq "#{utf_8_euro_symbol}abc".force_encoding('UTF-8') + resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << valid_ascii_string + if OS.windows? + replacement = "\x82\x82" + else + replacement = "\xE9\xE9" + end + expected_string = "#{utf_8_euro_symbol}abcd#{replacement}".force_encoding('UTF-8') + expect_identical_string(resulting_string, expected_string) + end + + it 'copes with encoded strings' do + source_encoding = Encoding.find('UTF-16LE') + accentless = build_encoded_string("Tu avec carte {count} item has\n", source_encoding) + accented = "Tu avec carté {count} itém has\n".encode(source_encoding) + resulting_string = accentless << accented + if OS.windows? + replacement = "\x82\x82" + else + replacement = "\u00E9" + end + expected_string = <<-EOS.encode('UTF-16LE') +Tu avec carte {count} item has +Tu avec cart#{replacement} {count} it#{replacement}m has + EOS + expect_identical_string(resulting_string, expected_string) end end context 'with a string that cannot be converted to the target encoding' do - it 'replaces undefined characters with either a ? or a unicode ?' do - ascii_string = ascii_arrow_symbol.force_encoding("ASCII-8BIT") - valid_unicode_string = utf_8_euro_symbol.force_encoding('UTF-8') + context 'when appending a string with an incompatible character encoding' do + let(:ascii_string) { ascii_arrow_symbol.force_encoding("ASCII-8BIT") } + let(:valid_unicode_string) { utf_8_euro_symbol.force_encoding('UTF-8') } - resulting_string = build_encoded_string(valid_unicode_string, target_encoding) << ascii_string - expected_bytes = utf_8_euro_symbol.each_byte.to_a + ["?".unpack("c").first] - actual_bytes = resulting_string.each_byte.to_a + it "normally raises an Encoding::CompatibilityError" do + expect { + valid_unicode_string.encode(utf8_encoding) << ascii_string + }.to raise_error(Encoding::CompatibilityError) + end - expect(actual_bytes).to eq(expected_bytes) + it 'replaces unconvertable characters with a string representation of their hex value' do + resulting_string = build_encoded_string(valid_unicode_string, utf8_encoding) << ascii_string + expected_string = "#{utf_8_euro_symbol}?" + expect_identical_string(resulting_string, expected_string) + end end end @@ -54,23 +196,76 @@ module RSpec::Support ascii_string = 'abc'.force_encoding("ASCII-8BIT") other_ascii_string = '123'.force_encoding("ASCII-8BIT") - resulting_string = build_encoded_string(ascii_string, target_encoding) << other_ascii_string - expect(resulting_string.encoding.to_s).to eq 'UTF-8' + resulting_string = build_encoded_string(ascii_string, utf8_encoding) << other_ascii_string + expected_string = 'abc123'.force_encoding('ASCII-8BIT') + expect_identical_string(resulting_string, expected_string) end end end describe '#split' do - it 'splits the string based on the delimiter accounting for encoding' do - wrapped_string = "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT") + context 'when the string has an invalid byte sequence' do + let(:message_with_invalid_byte_sequence) { "\xEF \255 \xAD I have bad bytes".force_encoding(utf8_encoding) } - expect { - build_encoded_string(wrapped_string, target_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8")) - }.not_to raise_error + it 'normally raises an ArgumentError' do + expect { + message_with_invalid_byte_sequence.split("\n") + }.to raise_error(ArgumentError) + end + + it 'replaces invalid bytes with the REPLACE string' do + resulting_array = build_encoded_string(message_with_invalid_byte_sequence, utf8_encoding).split("\n") + expect(resulting_array.size).to eq(1) # sanity check + expected_string = "? ? ? I have bad bytes" + expect_identical_string(resulting_array.first, expected_string) + end + + end + + context 'when there is an undefined conversion to the target encoding' do + let(:wrapped_string) { "aaaaaaaaaaa#{ascii_arrow_symbol}aaaaa".force_encoding("ASCII-8BIT") } + + it 'normally raises an Encoding::UndefinedConversionError' do + expect { + wrapped_string.encode(utf8_encoding) + }.to raise_error(Encoding::UndefinedConversionError) + end + + it 'splits the string based on the delimiter accounting for encoding' do + expect { + build_encoded_string(wrapped_string, utf8_encoding).split(utf_8_euro_symbol.force_encoding("UTF-8")) + }.not_to raise_error + end + end + + # see https://github.com/rspec/rspec-expectations/blob/f8a1232/spec/rspec/expectations/fail_with_spec.rb#L50 + # https://github.com/rspec/rspec-expectations/issues/201 + # https://github.com/rspec/rspec-expectations/pull/220 + context 'with a string that cannot be converted to the target encoding' do + let(:binary_poop) {'💩' } # [128169] "\u{1F4A9}" + let(:non_ascii_compatible_string) { "This is a pile of poo: #{binary_poop}, yuck".encode("UTF-16LE") } + + it 'normally raises an Encoding::CompatibilityError' do + expect { + non_ascii_compatible_string.split("\n") + }.to raise_error(Encoding::CompatibilityError) + end + + it 'corrects for the encoding if possible, else replaces the incompatible character' do + resulting_array = build_encoded_string(non_ascii_compatible_string).split("\n") + expect(resulting_array.size).to eq(1) # sanity check + if OS.windows? + replacement = EncodedString::REPLACE + else + replacement = binary_poop + end + expected_string = "This is a pile of poo: #{replacement}, yuck" + expect_identical_string(resulting_array.first, expected_string) + end end end - def build_encoded_string(string, target_encoding) + def build_encoded_string(string, target_encoding = string.encoding) EncodedString.new(string, target_encoding) end else @@ -78,7 +273,7 @@ def build_encoded_string(string, target_encoding) describe '#source_encoding' do it 'defaults to US-ASCII' do str = EncodedString.new("abc", "UTF-8") - expect( str.source_encoding ).to eq('US-ASCII') + expect(str.source_encoding).to eq('US-ASCII') end end end