Skip to content

Commit

Permalink
StringScanner: fix and improve Char searching + add String#byte_index…
Browse files Browse the repository at this point in the history
…(Char)
  • Loading branch information
funny-falcon committed Sep 19, 2023
1 parent 573521f commit 2a68f3a
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 8 deletions.
13 changes: 13 additions & 0 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -1277,6 +1277,19 @@ describe "String" do
"Dizzy Miss Lizzy".byte_index('z'.ord, -17).should be_nil
}

# Single-byte chars: first hit, hit after an offset, and misses.
it { "foo".byte_index('o').should eq(1) }
it { "foo bar booz".byte_index('o', 3).should eq(9) }
it { "foo".byte_index('a').should be_nil }
it { "foo".byte_index('o', 3).should be_nil }

# Multi-byte chars: the result is the byte position of the char's first byte.
it { "Hi, 💣".byte_index('💣').should eq(4) }
it { "Hi, 💣".byte_index('💣', 4).should eq(4) }
# Starting inside the char's byte sequence must not produce a match.
it { "Hi, 💣".byte_index('💣', 5).should be_nil }
it { "Hi, 💣".byte_index('é').should be_nil }

# Negative offsets count back from the end of the string; an offset that
# reaches before the first byte yields nil.
it {
  "Dizzy Miss Lizzy".byte_index('z').should eq(2)
  "Dizzy Miss Lizzy".byte_index('z', 3).should eq(3)
  "Dizzy Miss Lizzy".byte_index('z', -4).should eq(13)
  "Dizzy Miss Lizzy".byte_index('z', -17).should be_nil
}

it "gets byte index of string" do
"hello world".byte_index("he").should eq(0)
"hello world".byte_index("lo").should eq(3)
Expand Down
41 changes: 41 additions & 0 deletions src/string.cr
Original file line number Diff line number Diff line change
Expand Up @@ -3670,6 +3670,47 @@ class String
nil
end

# Returns the byte index of the _first_ occurrence of *char* in the string, or `nil` if not present.
# If *offset* is present, it defines the byte position to start the search.
#
# Negative *offset* can be used to start the search from the end of the string.
#
# ```
# "Hello, World".byte_index('o') # => 4
# "Hello, World".byte_index('Z') # => nil
# "Hello, World".byte_index('o', 5) # => 8
# "Hi, 💣".byte_index('💣') # => 4
# "Dizzy Miss Lizzy".byte_index('z') # => 2
# "Dizzy Miss Lizzy".byte_index('z', 3) # => 3
# "Dizzy Miss Lizzy".byte_index('z', -4) # => 13
# "Dizzy Miss Lizzy".byte_index('z', -17) # => nil
# ```
def byte_index(char : Char, offset = 0) : Int32?
  # A negative offset is taken relative to the end of the string; if it still
  # points before the first byte there is nothing to search.
  if offset < 0
    offset += bytesize
    return nil if offset < 0
  end

  # ASCII chars are a single byte, so the byte-value overload handles them.
  return byte_index(char.ord, offset) if char.ascii?

  # Simplified "Rabin-Karp": pack the char's bytes into one integer (`needle`)
  # and build a mask that covers exactly that many low-order bytes.
  width = char.bytesize
  needle = 0u32
  mask = 0u32
  char.each_byte do |b|
    needle = (needle << 8) | b
    mask = (mask << 8) | 0xff
  end

  # Slide a window over the string one byte at a time, keeping the most
  # recent bytes in `window`, and compare its masked value with the needle.
  bytes = to_unsafe
  window = 0u32
  offset.upto(bytesize - 1) do |pos|
    window = (window << 8) | bytes[pos]
    # `pos` is the char's last byte; step back to report its first byte.
    return pos - (width - 1) if (window & mask) == needle
  end

  nil
end

# Returns the byte index of *search* in the string, or `nil` if the string is not present.
# If *offset* is present, it defines the position to start the search.
#
Expand Down
25 changes: 17 additions & 8 deletions src/string_scanner.cr
Original file line number Diff line number Diff line change
Expand Up @@ -172,15 +172,24 @@ class StringScanner
if pattern.bytesize > @str.bytesize - @byte_offset
nil
elsif anchored
unsafe_str = String.new(@str.unsafe_byte_slice(@byte_offset))
if unsafe_str.starts_with?(pattern)
result = pattern.to_s
@last_match = StringMatchData.new(result)
@byte_offset += pattern.bytesize if advance
result
i = 0
# check string starts with char
unsafe_ptr = @str.to_unsafe + @byte_offset
pattern.each_byte do |byte|
return nil unless unsafe_ptr[i] == byte
i += 1
end
else
match(pattern.to_s, advance: advance, anchored: false)
# ok, it starts
result = pattern.to_s
@last_match = StringMatchData.new(result)
@byte_offset += pattern.bytesize if advance
result
elsif (found = @str.byte_index(pattern, @byte_offset))
finish = found + pattern.bytesize
result = @str.byte_slice(@byte_offset, finish - @byte_offset)
@byte_offset = finish if advance
@last_match = StringMatchData.new(result)
result
end
end

Expand Down

0 comments on commit 2a68f3a

Please sign in to comment.