From 2ad27a7629b2b8c9459cd2856f2c315ded0aa6cd Mon Sep 17 00:00:00 2001
From: tompng <tomoyapenguin@gmail.com>
Date: Tue, 18 Jun 2024 02:19:50 +0900
Subject: [PATCH] Make Reline::Unicode's vi_, ed_ and em_ methods encoding-safe

---
 lib/reline/unicode.rb       | 92 ++++++++++++++++++++-----------------
 test/reline/test_unicode.rb | 45 ++++++++++++++----
 2 files changed, 87 insertions(+), 50 deletions(-)

diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index afcdaf1e43..ab7708a5fe 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -263,29 +263,29 @@ def self.get_prev_mbchar_size(line, byte_pointer)
 
   def self.em_forward_word(line, byte_pointer)
     gcs = line.byteslice(byte_pointer..).grapheme_clusters
-    nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
-    words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
+    nonwords = gcs.take_while { |c| !word_character?(c) } # leading clusters after byte_pointer that are not word characters
+    words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) } # the run of word characters that follows them
     nonwords.sum(&:bytesize) + words.sum(&:bytesize)
   end
 
   def self.em_forward_word_with_capitalization(line, byte_pointer)
     gcs = line.byteslice(byte_pointer..).grapheme_clusters
-    nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
-    words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
+    nonwords = gcs.take_while { |c| !word_character?(c) } # leading non-word clusters; joined unchanged into the returned text
+    words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) } # following word run; joined and capitalized in the returned text
     [nonwords.sum(&:bytesize) + words.sum(&:bytesize), nonwords.join + words.join.capitalize]
   end
 
   def self.em_backward_word(line, byte_pointer)
     gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
-    nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
-    words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
+    nonwords = gcs.take_while { |c| !word_character?(c) } # gcs is reversed: non-word clusters directly before byte_pointer
+    words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) } # the word-character run further back
     nonwords.sum(&:bytesize) + words.sum(&:bytesize)
   end
 
   def self.em_big_backward_word(line, byte_pointer)
     gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
-    spaces = gcs.take_while { |c| c.match?(/\s/) }
-    nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
+    spaces = gcs.take_while { |c| space_character?(c) } # gcs is reversed: whitespace directly before byte_pointer
+    nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) } # the non-whitespace run further back
     spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
   end
 
@@ -293,20 +293,19 @@ def self.ed_transpose_words(line, byte_pointer)
     gcs = line.byteslice(0, byte_pointer).grapheme_clusters
     pos = gcs.size
     gcs += line.byteslice(byte_pointer..).grapheme_clusters
-    gcs.map! { |c| c.encode(Encoding::UTF_8) }
-    pos += 1 while pos < gcs.size && gcs[pos].match?(/\P{Word}/)
+    pos += 1 while pos < gcs.size && !word_character?(gcs[pos])
     if pos == gcs.size # 'aaa  bbb [cursor] '
-      pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
+      pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
       second_word_end = gcs.size
     else # 'aaa  [cursor]bbb'
-      pos += 1 while pos < gcs.size && gcs[pos].match?(/\p{Word}/)
+      pos += 1 while pos < gcs.size && word_character?(gcs[pos])
       second_word_end = pos
     end
-    pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
+    pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
     second_word_start = pos
-    pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
+    pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
     first_word_end = pos
-    pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
+    pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
     first_word_start = pos
 
     [first_word_start, first_word_end, second_word_start, second_word_end].map do |idx|
@@ -316,16 +315,16 @@ def self.ed_transpose_words(line, byte_pointer)
 
   def self.vi_big_forward_word(line, byte_pointer)
     gcs = line.byteslice(byte_pointer..).grapheme_clusters
-    nonspaces = gcs.take_while { |c| c.match?(/\S/) }
-    spaces = gcs.drop(nonspaces.size).take_while { |c| c.match?(/\s/) }
+    nonspaces = gcs.take_while { |c| !space_character?(c) } # non-whitespace run starting at byte_pointer
+    spaces = gcs.drop(nonspaces.size).take_while { |c| space_character?(c) } # whitespace run right after it
     nonspaces.sum(&:bytesize) + spaces.sum(&:bytesize)
   end
 
   def self.vi_big_forward_end_word(line, byte_pointer)
     gcs = line.byteslice(byte_pointer..).grapheme_clusters
     first = gcs.shift(1)
-    spaces = gcs.take_while { |c| c.match?(/\s/) }
-    nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
+    spaces = gcs.take_while { |c| space_character?(c) }
+    nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
     matched = spaces + nonspaces
     matched.pop
     first.sum(&:bytesize) + matched.sum(&:bytesize)
@@ -333,55 +332,56 @@ def self.vi_big_forward_end_word(line, byte_pointer)
 
   def self.vi_big_backward_word(line, byte_pointer)
     gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
-    spaces = gcs.take_while { |c| c.match?(/\s/) }
-    nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
+    spaces = gcs.take_while { |c| space_character?(c) } # gcs is reversed: whitespace directly before byte_pointer
+    nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) } # the non-whitespace run further back
     spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
   end
 
   def self.vi_forward_word(line, byte_pointer, drop_terminate_spaces = false)
-    gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }
+    gcs = line.byteslice(byte_pointer..).grapheme_clusters
     return 0 if gcs.empty?
 
-    regexp =
-      case gcs.first
-      when /\p{Word}/
-        /\p{Word}/
-      when /\s/
-        /\s/
+    c = gcs.first
+    matched =
+      if word_character?(c)
+        gcs.take_while { |c| word_character?(c) }
+      elsif space_character?(c)
+        gcs.take_while { |c| space_character?(c) }
       else
-        /[^\p{Word}\s]/
+        gcs.take_while { |c| !word_character?(c) && !space_character?(c) }
       end
-    matched = gcs.take_while { |c| c.match?(regexp) }
+
     return matched.sum(&:bytesize) if drop_terminate_spaces
 
-    spaces = gcs.drop(matched.size).take_while { |c| c.match?(/\s/) }
+    spaces = gcs.drop(matched.size).take_while { |c| space_character?(c) }
     matched.sum(&:bytesize) + spaces.sum(&:bytesize)
   end
 
   def self.vi_forward_end_word(line, byte_pointer)
-    gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }
+    gcs = line.byteslice(byte_pointer..).grapheme_clusters # no UTF-8 re-encoding: bytesizes are measured in line's own encoding
     return 0 if gcs.empty?
     return gcs.first.bytesize if gcs.size == 1
 
     start = gcs.shift
     skips = [start]
-    if start.match?(/\s/) || gcs.first.match?(/\s/)
-      spaces = gcs.take_while { |c| c.match?(/\s/) }
+    if space_character?(start) || space_character?(gcs.first) # cursor is on whitespace, or whitespace comes next
+      spaces = gcs.take_while { |c| space_character?(c) } # whitespace run that gets skipped
       skips += spaces
       gcs.shift(spaces.size)
     end
-    regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/
-    matched = gcs.take_while { |c| c.match?(regexp) }
+    start_with_word = word_character?(gcs.first) # gcs.first is nil when nothing is left; word_character?(nil) is falsy
+    matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) } # run of one class: word, or neither word nor space
     matched.pop
     skips.sum(&:bytesize) + matched.sum(&:bytesize)
   end
 
   def self.vi_backward_word(line, byte_pointer)
-    gcs = line.byteslice(0, byte_pointer).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }.reverse
-    spaces = gcs.take_while { |c| c.match?(/\s/) }
+    gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse # reversed, so the scan walks backward from byte_pointer
+    spaces = gcs.take_while { |c| space_character?(c) } # whitespace directly before byte_pointer
     gcs.shift(spaces.size)
-    regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/
-    spaces.sum(&:bytesize) + gcs.take_while { |c| c.match?(regexp) }.sum(&:bytesize)
+    start_with_word = word_character?(gcs.first) # gcs.first is nil when nothing is left; word_character?(nil) is falsy
+    matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) } # run of one class: word, or neither word nor space
+    spaces.sum(&:bytesize) + matched.sum(&:bytesize) # byte length of the skipped whitespace plus the matched run
   end
 
   def self.common_prefix(list, ignore_case: false)
@@ -399,7 +399,17 @@ def self.common_prefix(list, ignore_case: false)
 
   def self.vi_first_print(line)
     gcs = line.grapheme_clusters
-    spaces = gcs.take_while { |c| c.match?(/\s/) }
+    spaces = gcs.take_while { |c| space_character?(c) } # leading whitespace clusters; their byte length is the result
     spaces.sum(&:bytesize)
   end
+
+  def self.word_character?(s)
+    s.encode(Encoding::UTF_8).match?(/\p{Word}/) if s
+  rescue Encoding::UndefinedConversionError
+    false
+  end
+
+  def self.space_character?(s)
+    s.match?(/\s/) if s
+  end
 end
diff --git a/test/reline/test_unicode.rb b/test/reline/test_unicode.rb
index 35a9b99daa..0778306c32 100644
--- a/test/reline/test_unicode.rb
+++ b/test/reline/test_unicode.rb
@@ -147,6 +147,7 @@ def test_encoding_conversion
 
   def test_em_forward_word
     assert_equal(12, Reline::Unicode.em_forward_word('abc---fooあbar-baz', 3))
+    assert_equal(11, Reline::Unicode.em_forward_word('abc---fooあbar-baz'.encode('sjis'), 3))
     assert_equal(3, Reline::Unicode.em_forward_word('abcfoo', 3))
     assert_equal(3, Reline::Unicode.em_forward_word('abc---', 3))
     assert_equal(0, Reline::Unicode.em_forward_word('abc', 3))
@@ -154,6 +155,7 @@ def test_em_forward_word
 
   def test_em_forward_word_with_capitalization
     assert_equal([12, '---Fooあbar'], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz', 3))
+    assert_equal([11, '---Fooあbar'.encode('sjis')], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz'.encode('sjis'), 3))
     assert_equal([3, 'Foo'], Reline::Unicode.em_forward_word_with_capitalization('abcfOo', 3))
     assert_equal([3, '---'], Reline::Unicode.em_forward_word_with_capitalization('abc---', 3))
     assert_equal([0, ''], Reline::Unicode.em_forward_word_with_capitalization('abc', 3))
@@ -162,6 +164,7 @@ def test_em_forward_word_with_capitalization
 
   def test_em_backward_word
     assert_equal(12, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz', 20))
+    assert_equal(11, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
     assert_equal(2, Reline::Unicode.em_backward_word('  ', 2))
     assert_equal(2, Reline::Unicode.em_backward_word('ab', 2))
     assert_equal(0, Reline::Unicode.em_backward_word('ab', 0))
@@ -169,6 +172,7 @@ def test_em_backward_word
 
   def test_em_big_backward_word
     assert_equal(16, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz', 20))
+    assert_equal(15, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
     assert_equal(2, Reline::Unicode.em_big_backward_word('  ', 2))
     assert_equal(2, Reline::Unicode.em_big_backward_word('ab', 2))
     assert_equal(0, Reline::Unicode.em_big_backward_word('ab', 0))
@@ -184,20 +188,20 @@ def test_ed_transpose_words
     assert_equal([3, 5, 6, 8], Reline::Unicode.ed_transpose_words('aa bb cc  ', 7))
     assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc  ', 8))
     assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc  ', 9))
-    word1 = 'fooあ'
-    word2 = 'barあbaz'
-    left = 'aaa  -'
-    middle = '- -'
-    right = '-  bbb'
-    expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
-    assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
-    assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
-    assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
+    ['sjis', 'utf-8'].each do |encoding|
+      texts = ['fooあ', 'barあbaz', 'aaa  -', '- -', '-  bbb']
+      word1, word2, left, middle, right = texts.map { |text| text.encode(encoding) }
+      expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
+      assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
+      assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
+      assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
+    end
   end
 
   def test_vi_big_forward_word
     assert_equal(18, Reline::Unicode.vi_big_forward_word('abc---fooあbar-baz  xyz', 3))
     assert_equal(8, Reline::Unicode.vi_big_forward_word('abcfooあ  --', 3))
+    assert_equal(7, Reline::Unicode.vi_big_forward_word('abcfooあ  --'.encode('sjis'), 3))
     assert_equal(6, Reline::Unicode.vi_big_forward_word('abcfooあ', 3))
     assert_equal(3, Reline::Unicode.vi_big_forward_word('abc-  ', 3))
     assert_equal(0, Reline::Unicode.vi_big_forward_word('abc', 3))
@@ -211,6 +215,7 @@ def test_vi_big_forward_end_word
     assert_equal(1, Reline::Unicode.vi_big_forward_end_word('aa b', 0))
     assert_equal(3, Reline::Unicode.vi_big_forward_end_word('  aa b', 0))
     assert_equal(15, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz  xyz', 3))
+    assert_equal(14, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz  xyz'.encode('sjis'), 3))
     assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ  --', 3))
     assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ', 3))
     assert_equal(2, Reline::Unicode.vi_big_forward_end_word('abc-  ', 3))
@@ -219,6 +224,7 @@ def test_vi_big_forward_end_word
 
   def test_vi_big_backward_word
     assert_equal(16, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz', 20))
+    assert_equal(15, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
     assert_equal(2, Reline::Unicode.vi_big_backward_word('  ', 2))
     assert_equal(2, Reline::Unicode.vi_big_backward_word('ab', 2))
     assert_equal(0, Reline::Unicode.vi_big_backward_word('ab', 0))
@@ -227,6 +233,7 @@ def test_vi_big_backward_word
   def test_vi_forward_word
     assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 3))
     assert_equal(9, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6))
+    assert_equal(8, Reline::Unicode.vi_forward_word('abc---fooあbar-baz'.encode('sjis'), 6))
     assert_equal(6, Reline::Unicode.vi_forward_word('abcfooあ', 3))
     assert_equal(3, Reline::Unicode.vi_forward_word('abc---', 3))
     assert_equal(0, Reline::Unicode.vi_forward_word('abc', 3))
@@ -237,6 +244,7 @@ def test_vi_forward_word
   def test_vi_forward_end_word
     assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 3))
     assert_equal(8, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6))
+    assert_equal(7, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz'.encode('sjis'), 6))
     assert_equal(3, Reline::Unicode.vi_forward_end_word('abcfooあ', 3))
     assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---', 3))
     assert_equal(0, Reline::Unicode.vi_forward_end_word('abc', 3))
@@ -245,6 +253,7 @@ def test_vi_forward_end_word
   def test_vi_backward_word
     assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 20))
     assert_equal(9, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17))
+    assert_equal(8, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 16))
     assert_equal(2, Reline::Unicode.vi_backward_word('  ', 2))
     assert_equal(2, Reline::Unicode.vi_backward_word('ab', 2))
     assert_equal(0, Reline::Unicode.vi_backward_word('ab', 0))
@@ -254,6 +263,24 @@ def test_vi_first_print
     assert_equal(3, Reline::Unicode.vi_first_print('   abcdefg'))
     assert_equal(3, Reline::Unicode.vi_first_print('   '))
     assert_equal(0, Reline::Unicode.vi_first_print('abc'))
+    assert_equal(0, Reline::Unicode.vi_first_print('あ'))
+    assert_equal(0, Reline::Unicode.vi_first_print('あ'.encode('sjis')))
     assert_equal(0, Reline::Unicode.vi_first_print(''))
   end
+
+  def test_character_type # covers Reline::Unicode.word_character? and Reline::Unicode.space_character?
+    assert(Reline::Unicode.word_character?('a')) # ASCII letter
+    assert(Reline::Unicode.word_character?('あ')) # non-ASCII letter in the source file's encoding
+    assert(Reline::Unicode.word_character?('あ'.encode('sjis'))) # same letter, non-UTF-8 encoding
+    refute(Reline::Unicode.word_character?(33345.chr('sjis'))) # presumably a code point with no UTF-8 mapping (rescue path) -- TODO confirm
+    refute(Reline::Unicode.word_character?('-')) # punctuation
+    refute(Reline::Unicode.word_character?(nil)) # nil is accepted and falsy
+
+    assert(Reline::Unicode.space_character?(' ')) # plain space
+    refute(Reline::Unicode.space_character?('あ')) # non-ASCII letter
+    refute(Reline::Unicode.space_character?('あ'.encode('sjis'))) # same letter, non-UTF-8 encoding
+    refute(Reline::Unicode.space_character?(33345.chr('sjis'))) # same sjis code point as above
+    refute(Reline::Unicode.space_character?('-')) # punctuation
+    refute(Reline::Unicode.space_character?(nil)) # nil is accepted and falsy
+  end
 end