From 94f7bc5636f483a6ddef6cfa4d8eb66b4ea4ab7e Mon Sep 17 00:00:00 2001 From: Yura Sokolov Date: Wed, 20 Sep 2023 03:06:14 +0300 Subject: [PATCH 1/6] Use simpler faster Rabin-Karp-like search for short needle --- src/string.cr | 110 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 27 deletions(-) diff --git a/src/string.cr b/src/string.cr index 551c2d2dd9a2..23c0a097385c 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3312,15 +3312,20 @@ class String # Update rolling hash for Rabin-Karp algorithm `String#index`. private macro update_hash(n) {% for i in 1..n %} - {% if i != 1 %} - byte = head_pointer.value - {% end %} + byte = head_pointer.value hash = hash &* PRIME_RK &+ pointer.value &- pow &* byte pointer += 1 head_pointer += 1 {% end %} end + private macro update_simplehash(n) + {% for i in 1..n %} + hash = (hash << 8) | pointer.value + pointer += 1 + {% end %} + end + # Returns the index of the _first_ occurrence of *search* in the string, or `nil` if not present. # If *offset* is present, it defines the position to start the search. # @@ -3360,13 +3365,6 @@ class String # Rabin-Karp algorithm # https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm - # calculate a rolling hash of search text (needle) - search_hash = 0u32 - search.each_byte do |b| - search_hash = search_hash &* PRIME_RK &+ b - end - pow = PRIME_RK &** search.bytesize - # Find start index with offset char_index = 0 pointer = to_unsafe @@ -3377,16 +3375,56 @@ class String char_index += 1 end - head_pointer = pointer + return if pointer + search.bytesize > end_pointer - # calculate a rolling hash of this text (haystack) + if search.bytesize == 1 + byte = search.to_unsafe[0] + while pointer < end_pointer + return char_index if pointer.value == byte + pointer += String.char_bytesize_at(pointer) + char_index += 1 + end + return nil + end + + head_pointer = pointer + search_hash = 0u32 hash = 0u32 - hash_end_pointer = pointer + search.bytesize - return if hash_end_pointer > end_pointer - while pointer < hash_end_pointer + + if search.bytesize <= 4 + # simplified version with multiplier == 256 + mask = 0u32 + search.each_byte do |b| + search_hash = (search_hash << 8) | b + hash = (hash << 8) | pointer.value + mask = (mask << 8) | 0xff + pointer += 1 + end + + while true + return char_index if (hash & mask) == search_hash + + char_bytesize = String.char_bytesize_at(head_pointer) + return if pointer + char_bytesize > end_pointer + case char_bytesize + when 1 then update_simplehash 1 + when 2 then update_simplehash 2 + when 3 then update_simplehash 3 + else update_simplehash 4 + end + + head_pointer += char_bytesize + char_index += 1 + end + end + + # calculate a rolling hash of search text (needle) and this text (haystack) + search.each_byte do |b| + search_hash = search_hash &* PRIME_RK &+ b hash = hash &* PRIME_RK &+ pointer.value pointer += 1 end + pow = PRIME_RK &** search.bytesize while true # check hash equality and real string equality @@ -3394,7 +3432,6 @@ class String return char_index end - byte = head_pointer.value char_bytesize = String.char_bytesize_at(head_pointer) return if pointer + char_bytesize > end_pointer case char_bytesize @@ -3731,27 +3768,46 @@ class String return if offset < 0 return bytesize < offset ? nil : offset if search.empty? + return byte_index(search.to_unsafe[0], offset) if search.bytesize == 1 # Rabin-Karp algorithm # https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm - # calculate a rolling hash of search text (needle) + pointer = to_unsafe + offset + end_pointer = to_unsafe + bytesize + return if pointer + search.bytesize > end_pointer + search_hash = 0u32 - search.each_byte do |b| - search_hash = search_hash &* PRIME_RK &+ b + hash = 0u32 + + if search.bytesize <= 4 + # simplified version with multiplier == 256 + mask = 0u32 + search.each_byte do |b| + search_hash = (search_hash << 8) | b + hash = (hash << 8) | pointer.value + mask = (mask << 8) | 0xff + pointer += 1 + end + + while true + return offset if (hash & mask) == search_hash + return if pointer >= end_pointer + hash = (hash << 8) | pointer.value + pointer += 1 + offset += 1 + end end - pow = PRIME_RK &** search.bytesize - # calculate a rolling hash of this text (haystack) - pointer = head_pointer = to_unsafe + offset - hash_end_pointer = pointer + search.bytesize - end_pointer = to_unsafe + bytesize - hash = 0u32 - return if hash_end_pointer > end_pointer - while pointer < hash_end_pointer + head_pointer = pointer + + # calculate a rolling hash of search text (needle) and this text (haystack) + search.each_byte do |b| + search_hash = search_hash &* PRIME_RK &+ b hash = hash &* PRIME_RK &+ pointer.value pointer += 1 end + pow = PRIME_RK &** search.bytesize while true # check hash equality and real string equality From a63580e4d4f33a6b1d356af6d8ee835a90fa4662 Mon Sep 17 00:00:00 2001 From: Yura Sokolov Date: Sun, 8 Oct 2023 15:31:14 +0300 Subject: [PATCH 2/6] refactor simple rabin-karp --- src/string.cr | 134 +++++++++++++++++++++----------------------------- 1 file changed, 55 insertions(+), 79 deletions(-) diff --git a/src/string.cr b/src/string.cr index 23c0a097385c..4c7fe06ec54d 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3355,12 +3355,53 @@ class String nil end + private macro gen_index_short(int_class, by_char) + # simplified Rabin-Karp version with multiplier == 256 + search_hash = {{int_class}}.new(0) + hash = {{int_class}}.new(0) + mask = {{int_class}}.new(0) + + search.each_byte do |b| + search_hash = (search_hash << 8) | b + hash = (hash << 8) | pointer.value + mask = (mask << 8) | 0xff + pointer += 1 + end + {% if by_char %} + search_bytesize = search.bytesize + {% end %} + + while true + return offset if (hash & mask) == search_hash + + {% if by_char %} + char_bytesize = String.char_bytesize_at(pointer - search_bytesize) + {% else %} + char_bytesize = 1 + {% end %} + return if pointer + char_bytesize > end_pointer + case char_bytesize + when 1 then update_simplehash 1 + when 2 then update_simplehash 2 + when 3 then update_simplehash 3 + else update_simplehash 4 + end + + offset &+= 1 + end + end + + private def index_2to8bytes(offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search : String) + gen_index_short(UInt64, true) + end + # :ditto: def index(search : String, offset = 0) offset += size if offset < 0 return if offset < 0 return size < offset ? nil : offset if search.empty? + return index(search[0], offset) if search.size == 1 && search.valid_encoding? # Rabin-Karp algorithm # https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm @@ -3377,47 +3418,14 @@ class String return if pointer + search.bytesize > end_pointer - if search.bytesize == 1 - byte = search.to_unsafe[0] - while pointer < end_pointer - return char_index if pointer.value == byte - pointer += String.char_bytesize_at(pointer) - char_index += 1 - end - return nil + if search.bytesize <= 8 + return index_2to8bytes(char_index, pointer, end_pointer, search) end head_pointer = pointer search_hash = 0u32 hash = 0u32 - if search.bytesize <= 4 - # simplified version with multiplier == 256 - mask = 0u32 - search.each_byte do |b| - search_hash = (search_hash << 8) | b - hash = (hash << 8) | pointer.value - mask = (mask << 8) | 0xff - pointer += 1 - end - - while true - return char_index if (hash & mask) == search_hash - - char_bytesize = String.char_bytesize_at(head_pointer) - return if pointer + char_bytesize > end_pointer - case char_bytesize - when 1 then update_simplehash 1 - when 2 then update_simplehash 2 - when 3 then update_simplehash 3 - else update_simplehash 4 - end - - head_pointer += char_bytesize - char_index += 1 - end - end - # calculate a rolling hash of search text (needle) and this text (haystack) search.each_byte do |b| search_hash = search_hash &* PRIME_RK &+ b @@ -3699,12 +3707,7 @@ class String offset += bytesize if offset < 0 return if offset < 0 - offset.upto(bytesize - 1) do |i| - if to_unsafe[i] == byte - return i - end - end - nil + to_slice.fast_index(byte.to_u8, offset) end # Returns the index of the _first_ occurrence of *char* in the string, or `nil` if not present. @@ -3722,32 +3725,20 @@ class String # "Dizzy Miss Lizzy".byte_index('z', -4) # => 13 # "Dizzy Miss Lizzy".byte_index('z', -17) # => nil # ``` - def byte_index(char : Char, offset = 0) : Int32? - return byte_index(char.ord, offset) if char.ascii? + def byte_index(search : Char, offset = 0) : Int32? + return byte_index(search.ord, offset) if search.ascii? offset += bytesize if offset < 0 return if offset < 0 - return if offset + char.bytesize > bytesize + return if offset + search.bytesize > bytesize - # Simplified "Rabin-Karp" algorithm - search_hash = 0u32 - search_mask = 0u32 - hash = 0u32 - char.each_byte do |byte| - search_hash = (search_hash << 8) | byte - search_mask = (search_mask << 8) | 0xff - hash = (hash << 8) | to_unsafe[offset] - offset += 1 - end + pointer = to_unsafe + offset + end_pointer = to_unsafe + bytesize + gen_index_short(UInt32, false) + end - offset.upto(bytesize) do |i| - if (hash & search_mask) == search_hash - return i - char.bytesize - end - # rely on zero terminating byte - hash = (hash << 8) | to_unsafe[i] - end - nil + private def byte_index_2to8bytes(offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search : String) + gen_index_short(UInt64, false) end # Returns the byte index of *search* in the string, or `nil` if the string is not present. @@ -3780,23 +3771,8 @@ class String search_hash = 0u32 hash = 0u32 - if search.bytesize <= 4 - # simplified version with multiplier == 256 - mask = 0u32 - search.each_byte do |b| - search_hash = (search_hash << 8) | b - hash = (hash << 8) | pointer.value - mask = (mask << 8) | 0xff - pointer += 1 - end - - while true - return offset if (hash & mask) == search_hash - return if pointer >= end_pointer - hash = (hash << 8) | pointer.value - pointer += 1 - offset += 1 - end + if search.bytesize <= 8 + return byte_index_2to8bytes(offset, pointer, end_pointer, search) end head_pointer = pointer From 86518084e57e5b58b70bbe421101b7b58132aba3 Mon Sep 17 00:00:00 2001 From: Sokolov Yura Date: Mon, 9 Oct 2023 13:05:14 +0300 Subject: [PATCH 3/6] To preserve API, byte_index(char: c) should be accepted. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Müller --- src/string.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string.cr b/src/string.cr index 4c7fe06ec54d..a2e9c8dbcf01 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3725,7 +3725,7 @@ class String # "Dizzy Miss Lizzy".byte_index('z', -4) # => 13 # "Dizzy Miss Lizzy".byte_index('z', -17) # => nil # ``` - def byte_index(search : Char, offset = 0) : Int32? + def byte_index(char search : Char, offset = 0) : Int32? return byte_index(search.ord, offset) if search.ascii? offset += bytesize if offset < 0 From f4b138e4a2f164a48f5147fc4e6072be25ad8f04 Mon Sep 17 00:00:00 2001 From: Yura Sokolov Date: Mon, 9 Oct 2023 14:50:30 +0300 Subject: [PATCH 4/6] use function with type parameter and block instead of macros --- src/string.cr | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/src/string.cr b/src/string.cr index a2e9c8dbcf01..83b4f3512256 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3355,11 +3355,10 @@ class String nil end - private macro gen_index_short(int_class, by_char) - # simplified Rabin-Karp version with multiplier == 256 - search_hash = {{int_class}}.new(0) - hash = {{int_class}}.new(0) - mask = {{int_class}}.new(0) + private def index_short(hash_type, offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search, &) + search_hash = hash_type.new(0) + hash = hash_type.new(0) + mask = hash_type.new(0) search.each_byte do |b| search_hash = (search_hash << 8) | b @@ -3367,18 +3366,11 @@ class String mask = (mask << 8) | 0xff pointer += 1 end - {% if by_char %} - search_bytesize = search.bytesize - {% end %} while true return offset if (hash & mask) == search_hash - {% if by_char %} - char_bytesize = String.char_bytesize_at(pointer - search_bytesize) - {% else %} - char_bytesize = 1 - {% end %} + char_bytesize = yield pointer return if pointer + char_bytesize > end_pointer case char_bytesize when 1 then update_simplehash 1 @@ -3391,10 +3383,6 @@ class String end end - private def index_2to8bytes(offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search : String) - gen_index_short(UInt64, true) - end - # :ditto: def index(search : String, offset = 0) offset += size if offset < 0 @@ -3419,7 +3407,10 @@ class String return if pointer + search.bytesize > end_pointer if search.bytesize <= 8 - return index_2to8bytes(char_index, pointer, end_pointer, search) + search_bytesize = search.bytesize + return index_short(UInt64, char_index, pointer, end_pointer, search) {|pointer| + String.char_bytesize_at(pointer - search_bytesize) + } end head_pointer = pointer @@ -3734,11 +3725,7 @@ class String pointer = to_unsafe + offset end_pointer = to_unsafe + bytesize - gen_index_short(UInt32, false) - end - - private def byte_index_2to8bytes(offset : Int32, pointer : UInt8*, end_pointer : UInt8*, search : String) - gen_index_short(UInt64, false) + index_short(UInt32, offset, pointer, end_pointer, search) { 1 } end # Returns the byte index of *search* in the string, or `nil` if the string is not present. @@ -3772,7 +3759,7 @@ class String hash = 0u32 if search.bytesize <= 8 - return byte_index_2to8bytes(offset, pointer, end_pointer, search) + return index_short(UInt64, offset, pointer, end_pointer, search) { 1 } end head_pointer = pointer From 0e098c65cb38ea1243c34d10e1d01be0c37055f2 Mon Sep 17 00:00:00 2001 From: Yura Sokolov Date: Mon, 9 Oct 2023 14:56:46 +0300 Subject: [PATCH 5/6] fix format --- src/string.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string.cr b/src/string.cr index 83b4f3512256..b09c332304a9 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3408,7 +3408,7 @@ class String if search.bytesize <= 8 search_bytesize = search.bytesize - return index_short(UInt64, char_index, pointer, end_pointer, search) {|pointer| + return index_short(UInt64, char_index, pointer, end_pointer, search) { |pointer| String.char_bytesize_at(pointer - search_bytesize) } end From c7faf09c4e09379a0da43868da523bda3dbaf161 Mon Sep 17 00:00:00 2001 From: Yura Sokolov Date: Thu, 18 Apr 2024 19:13:45 +0300 Subject: [PATCH 6/6] string.cr: couple of review fixes in rabin-karp optimizations --- src/string.cr | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/string.cr b/src/string.cr index 76226e5b9e90..60205fe547c6 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3412,9 +3412,9 @@ class String if search.bytesize <= 8 search_bytesize = search.bytesize - return index_short(UInt64, char_index, pointer, end_pointer, search) { |pointer| + return index_short(UInt64, char_index, pointer, end_pointer, search) do |pointer| String.char_bytesize_at(pointer - search_bytesize) - } + end end head_pointer = pointer @@ -3804,9 +3804,7 @@ class String return if pointer >= end_pointer # update a rolling hash of this text (haystack) - hash = hash &* PRIME_RK &+ pointer.value &- pow &* head_pointer.value - pointer += 1 - head_pointer += 1 + update_hash 1 offset += 1 end