diff --git a/spec/std/string_spec.cr b/spec/std/string_spec.cr
index 905930463cfc..d079f7618948 100644
--- a/spec/std/string_spec.cr
+++ b/spec/std/string_spec.cr
@@ -1277,6 +1277,19 @@ describe "String" do
       "Dizzy Miss Lizzy".byte_index('z'.ord, -17).should be_nil
     }
 
+    it { "foo".byte_index('o').should eq(1) }
+    it { "foo bar booz".byte_index('o', 3).should eq(9) }
+    it { "foo".byte_index('a').should be_nil }
+    it { "Hi, 💣 💣".byte_index('💣', 5).should eq(9) }
+    it { "foo".byte_index('o', 3).should be_nil }
+    it { "Hi, 💣".byte_index('💣').should eq(4) }
+    it {
+      "Dizzy Miss Lizzy".byte_index('z').should eq(2)
+      "Dizzy Miss Lizzy".byte_index('z', 3).should eq(3)
+      "Dizzy Miss Lizzy".byte_index('z', -4).should eq(13)
+      "Dizzy Miss Lizzy".byte_index('z', -17).should be_nil
+    }
+
     it "gets byte index of string" do
       "hello world".byte_index("he").should eq(0)
       "hello world".byte_index("lo").should eq(3)
diff --git a/src/string.cr b/src/string.cr
index 7c9eed3dd186..051247386956 100644
--- a/src/string.cr
+++ b/src/string.cr
@@ -3670,6 +3670,47 @@ class String
     nil
   end
 
+  # Returns the byte index of the _first_ occurrence of *char* in the string, or `nil` if not present.
+  # If *offset* is present, it defines the byte position to start the search.
+  #
+  # Negative *offset* can be used to start the search from the end of the string.
+  #
+  # ```
+  # "Hello, World".byte_index('o')          # => 4
+  # "Hello, World".byte_index('Z')          # => nil
+  # "Hello, World".byte_index('o', 5)       # => 8
+  # "Hi, 💣".byte_index('💣')                # => 4
+  # "Dizzy Miss Lizzy".byte_index('z')      # => 2
+  # "Dizzy Miss Lizzy".byte_index('z', 3)   # => 3
+  # "Dizzy Miss Lizzy".byte_index('z', -4)  # => 13
+  # "Dizzy Miss Lizzy".byte_index('z', -17) # => nil
+  # ```
+  def byte_index(char : Char, offset = 0) : Int32?
+    offset += bytesize if offset < 0
+    return if offset < 0
+    return byte_index(char.ord, offset) if char.ascii?
+
+    # Simplified "Rabin-Karp": pack the bytes of *char* into a UInt32 and compare it with a sliding window of the last bytes read
+    search_hash = 0u32
+    search_mask = 0u32
+    char.each_byte do |byte|
+      search_hash <<= 8
+      search_hash |= byte
+      search_mask <<= 8
+      search_mask |= 0xff
+    end
+
+    hash = 0u32
+    offset.upto(bytesize - 1) do |i|
+      hash <<= 8
+      hash |= to_unsafe[i]
+      if (hash & search_mask) == search_hash
+        return i - (char.bytesize - 1)
+      end
+    end
+    nil
+  end
+
   # Returns the byte index of *search* in the string, or `nil` if the string is not present.
   # If *offset* is present, it defines the position to start the search.
   #
diff --git a/src/string_scanner.cr b/src/string_scanner.cr
index f08ad10e57b2..ff852670a626 100644
--- a/src/string_scanner.cr
+++ b/src/string_scanner.cr
@@ -172,15 +172,24 @@ class StringScanner
     if pattern.bytesize > @str.bytesize - @byte_offset
       nil
     elsif anchored
-      unsafe_str = String.new(@str.unsafe_byte_slice(@byte_offset))
-      if unsafe_str.starts_with?(pattern)
-        result = pattern.to_s
-        @last_match = StringMatchData.new(result)
-        @byte_offset += pattern.bytesize if advance
-        result
+      i = 0
+      # check string starts with char
+      unsafe_ptr = @str.to_unsafe + @byte_offset
+      pattern.each_byte do |byte|
+        return nil unless unsafe_ptr[i] == byte
+        i += 1
       end
-    else
-      match(pattern.to_s, advance: advance, anchored: false)
+      # ok, it starts
+      result = pattern.to_s
+      @last_match = StringMatchData.new(result)
+      @byte_offset += pattern.bytesize if advance
+      result
+    elsif (found = @str.byte_index(pattern, @byte_offset))
+      finish = found + pattern.bytesize
+      result = @str.byte_slice(@byte_offset, finish - @byte_offset)
+      @byte_offset = finish if advance
+      @last_match = StringMatchData.new(result)
+      result
     end
   end
 