Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Myers algorithm for Levenshtein distance calculation #11370

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions spec/std/levenshtein_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ describe "levenshtein" do
it { Levenshtein.distance("abc", "cba").should eq(2) }
it { Levenshtein.distance("かんじ", "じんか").should eq(2) }
it { Levenshtein.distance("", "かんじ").should eq(3) }
it { Levenshtein.distance("مِكرٍّ مِفَرٍّ مُقبِلٍ مُدْبِرٍ معًا كجُلمودِ صخرٍ حطّه السيلُ من علِ",
"مِكرٍّ مِفَرٍّ مُقبِلٍ مُدْبِرٍ معًا كجُلموادِ صخرٍ خطّه اسيلُ من علِ").should eq(3) }
it { Levenshtein.distance("I didn't find the shirt I wanted",
"I cidn't fined he shirt I wasted").should eq(4) }
it { Levenshtein.distance("I cannot see how this could be a problem. It compiles doesn't it",
"It cannot see how this could be a broblem. It compiles doesnt it").should eq(3) }
it { Levenshtein.distance("Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commo",
"Lorem ipsun dolor sit Smet, consectetueo adipiscing lit. BAenean commo").should eq(5) }

it "finds with finder" do
finder = Levenshtein::Finder.new "hallo"
Expand Down
270 changes: 226 additions & 44 deletions src/levenshtein.cr
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require "bit_array"

# Levenshtein distance methods.
module Levenshtein
# Computes the [levenshtein distance](http://en.wikipedia.org/wiki/Levenshtein_distance) of two strings.
Expand All @@ -10,60 +12,50 @@ module Levenshtein
# Levenshtein.distance("こんにちは", "こんちは") # => 1
# Levenshtein.distance("hey", "hey") # => 0
# ```
def self.distance(string1 : String, string2 : String) : Int32
# If *cutoff* is given then the method is allowed to end once the lowest
# possible bound is greater than *cutoff* and return that lower bound.
Comment on lines +15 to +16
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could a doc/code example for this be given?

Alternatively, I think it's totally fine to add the cutoff logic in a separate PR. Given that it's a new feature, it could sparkle more discussion, eventually leading to this PR being merged less likely or less faster to happen (just a guess! I'm not actually requesting this)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would a pull request onto my feature branch show up here? I haven't really made a pull request onto a pull request before.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I don't think so, but also, the other PR can come/appear after this one is merged.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could also make a new pull request and first squash all the features together. Whatever is easier for you guys.

# This can improve performance in cases when values over *cutoff*
# don't need to be exact.
#
# ```
# require "levenshtein"
#
# string1 = File.read("file_with_really_long_string")
# string2 = File.read("another_file_with_long_string")
#
# Levenshtein.distance(string1, string2, 1000) # => 1275
# Levenshtein.distance(string1, string2) # => 2543
# ```
# In this example the first call to *distance* will return
# a result faster than the second.
def self.distance(string1 : String, string2 : String, cutoff : Int? = nil) : Int32
return 0 if string1 == string2

s_size = string1.size
t_size = string2.size

return t_size if s_size == 0
return s_size if t_size == 0
l_size = string2.size

# This is to allocate less memory
if t_size > s_size
if l_size < s_size
string1, string2 = string2, string1
t_size, s_size = s_size, t_size
l_size, s_size = s_size, l_size
end

costs = Slice(Int32).new(t_size + 1) { |i| i }
last_cost = 0

if string1.single_byte_optimizable? && string2.single_byte_optimizable?
s = string1.to_unsafe
t = string2.to_unsafe

s_size.times do |i|
last_cost = i + 1
return l_size if s_size == 0
if cutoff && cutoff < l_size - s_size
return l_size - s_size
end

t_size.times do |j|
sub_cost = s[i] == t[j] ? 0 : 1
cost = Math.min(Math.min(last_cost + 1, costs[j + 1] + 1), costs[j] + sub_cost)
costs[j] = last_cost
last_cost = cost
end
costs[t_size] = last_cost
if string1.ascii_only? && string2.ascii_only?
if l_size < 32
myers32_ascii(string1, string2)
else
myers_ascii(string1, string2, cutoff)
end

last_cost
else
reader = Char::Reader.new(string1)

# Use an array instead of a reader to decode the second string only once
chars = string2.chars

reader.each_with_index do |char1, i|
last_cost = i + 1

chars.each_with_index do |char2, j|
sub_cost = char1 == char2 ? 0 : 1
cost = Math.min(Math.min(last_cost + 1, costs[j + 1] + 1), costs[j] + sub_cost)
costs[j] = last_cost
last_cost = cost
end
costs[t_size] = last_cost
if l_size < 64
dynamic_matrix(string1, string2)
else
myers_unicode(string1, string2, cutoff)
end

last_cost
end
end

Expand Down Expand Up @@ -92,7 +84,7 @@ module Levenshtein
end

def test(name : String, value : String = name)
distance = Levenshtein.distance(@target, name)
distance = Levenshtein.distance(@target, name, @tolerance)
if distance <= @tolerance
if best_entry = @best_entry
if distance < best_entry.distance
Expand Down Expand Up @@ -155,4 +147,194 @@ module Levenshtein
def self.find(name, all_names, tolerance = nil)
Finder.find(name, all_names, tolerance)
end

# Measures Levenshtein distance by filling Dynamic Programming Matrix
private def self.dynamic_matrix(string1 : String, string2 : String) : Int32
s_size = string1.size
l_size = string2.size

costs = Slice(Int32).new(s_size + 1) { |i| i }
last_cost = 0

if string1.single_byte_optimizable? && string2.single_byte_optimizable?
s = string1.to_unsafe
l = string2.to_unsafe

l_size.times do |i|
last_cost = i + 1

s_size.times do |j|
sub_cost = l[i] == s[j] ? 0 : 1
Copy link
Contributor

@HertzDevil HertzDevil Jul 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

single_byte_optimizable? means bytes higher than 0x7F should all compare equal, since they behave like the replacement character:

Levenshtein.distance("\x81", "\x82").should eq(0)
Levenshtein.distance("\x82", "\x81").should eq(0)
Levenshtein.distance("\x81" * 33, String.new(Bytes.new(33) { |i| 0x80_u8 + i })).should eq(0)

# okay, as these use the `Char::Reader` overload instead
Levenshtein.distance("\x81", "\uFFFD").should eq(0)
Levenshtein.distance("\uFFFD", "\x81").should eq(0)
Suggested change
sub_cost = l[i] == s[j] ? 0 : 1
l_i = {l[i], 0x80_u8}.min
s_j = {s[j], 0x80_u8}.min
sub_cost = l_i == s_j ? 0 : 1

This also means the ascii_only? branches could be relaxed to single_byte_optimizable?, operating over an alphabet of 129 "code points" where again bytes larger than 0x7F are mapped to 0x80.

Of course, most strings are already valid UTF-8, so this is a rather low priority pre-existing issue. Feel free to ignore in this PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You bring up an interesting point. I am not sure I ever really thought about what the Levenshtein distance of invalid strings would look like. But having said that, I don't even know what the point of this single_byte_optimizable? path is, because if I am not mistaken no valid string will ever end up there, since if it is ascii it will call a different function and if it isn't then it wouldn't be single_byte_optimizable.

I think past me in an effort to optimize for speed looked into using unsafe pointers in non-ascii strings, to bring a speed boost to that specific scenario. And correct me if I am wrong, isn't this segment of code just providing a faster way to measure the Levenshtein distance of invalid strings? If that is the case then why even have it?

Maybe getting rid of that 'if block' would be the cleanest solution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A string that consists only of ASCII characters plus invalid UTF-8 byte sequences, but not valid ones of 2 bytes or more (code point 0x80 or above), is single_byte_optimizable? but not ascii_only?.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess my question is how often these occur in the wild. Is speeding up calculation of Levenshtein distance of strings that are single_byte_optimizable? but not ascii_only? even of value to anyone? Especially since the Char::Reader path is plenty fast to begin with, and handles the invalid bytes properly.

cost = Math.min(Math.min(last_cost + 1, costs[j + 1] + 1), costs[j] + sub_cost)
costs[j] = last_cost
last_cost = cost
end
costs[s_size] = last_cost
end

last_cost
else
reader = Char::Reader.new(string2)

# Use an array instead of a reader to decode the second string only once
chars = string1.chars

reader.each_with_index do |char1, i|
last_cost = i + 1

chars.each_with_index do |char2, j|
sub_cost = char1 == char2 ? 0 : 1
cost = Math.min(Math.min(last_cost + 1, costs[j + 1] + 1), costs[j] + sub_cost)
costs[j] = last_cost
last_cost = cost
end
costs[s_size] = last_cost
end

last_cost
end
end

# Myers bit-parallel algorithm for ASCII strings shorter than 32 characters.
#
# This implementation runs much faster than the plain DP-matrix approach for
# strings whose pattern fits in a single 32-bit machine word: one round of
# word-wide bit operations advances an entire column of the edit-distance
# matrix at once.
private def self.myers32_ascii(string1 : String, string2 : String) : Int32
  w = 32
  one = 1_u32
  zero = 0_u32

  m = string1.size
  n = string2.size
  # Bit selecting the last row of the DP column; the running score is
  # updated only when the horizontal delta of that row changes.
  lpos = one << (m - 1)
  # Distance for the empty-string2 column is m.
  score = m

  # Setup char->bit-vector dictionary: pmr[c] gets bit i set when
  # string1's i-th byte equals c. A 128-slot StaticArray suffices because
  # the caller only routes ASCII-only strings here.
  pmr = StaticArray(UInt32, 128).new(zero)
  s1 = string1.to_unsafe
  s2 = string2.to_unsafe

  # Vertical positive/negative delta vectors: bit i set means the cost
  # goes up/down by one between rows i-1 and i of the current column.
  # All-ones vp encodes the first DP column (cost i at row i).
  vp = UInt32::MAX
  vn = zero

  # populate dictionary
  count = m
  count.times do |i|
    pmr[s1[i]] |= one << i
  end

  n.times do |i|
    # find char in dictionary
    pm = pmr[s2[i]]
    # Myers recurrence: d0 marks rows whose diagonal delta is zero
    # (match or propagated carry); &+ is the carry-propagating add.
    d0 = (((pm & vp) &+ vp) ^ vp) | pm | vn
    # Horizontal deltas between the previous column and this one.
    hp = vn | ~(d0 | vp)
    hn = d0 & vp
    # Only the last row's horizontal delta affects the final distance.
    if ((hp & lpos) != 0)
      score += 1
    elsif ((hn & lpos) != 0)
      score -= 1
    end
    # Shift the horizontal deltas down one row to derive the next
    # column's vertical deltas; the low `one` bit models the DP boundary
    # row (cost of the empty prefix always increases by one per column).
    hnx = (hn << 1)
    hpx = (hp << 1)
    vp = hnx | ~(d0 | hpx | one)
    vn = d0 & (hpx | one)
  end
  score
end

# Myers Algorithm for ASCII and Unicode
#
# The algorithm uses a dictionary to store string char location as bits
# ASCII implementation uses StaticArray while for full Unicode a Hash is used
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The explanation is quite good, but why using a Hash for Unicode?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Briefly, the dictionary maps char to bit-vector (uint32 or uint64). So a hash is used in the general case. For ASCII chars the codepoint is between 0-128, so for a performance boost we can use a StaticArray of size 128 and use the char codepoint to point to the array address containing the appropriate bit-vector. If we used Arrays and addresses for the entire unicode range it will have to be an array of size 1,114,112.

When I started I tried making a StaticArray of that size, but it was just not working (I guess trying to put something that large on the stack was causing issues). I did try using a regular Array, but from early testing the Hash was not only much smaller memory usage but it had better performance, so I stuck with a Hash.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! The issue with StaticArray is related to #2485. May not make any major difference though. I was mainly wondering about not using Array.

# The bit width depends on the architecture: 64-bit words on 64-bit
# targets, 32-bit words otherwise.
{% begin %}
  {% width = flag?(:bits64) ? 64 : 32 %}
  {% for enc in ["ascii", "unicode"] %}
    # Blocked Myers bit-parallel algorithm for strings of arbitrary length.
    #
    # string1 is processed in word-sized blocks of rows; the horizontal
    # deltas at each block's bottom row are carried to the next block via
    # the *hna*/*hpa* bit arrays (one bit per column of string2).
    #
    # If *cutoff* is given, the method may return early with a lower bound
    # on the distance once that bound exceeds *cutoff*.
    private def self.myers_{{ enc.id }}(string1 : String, string2 : String, cutoff : Int? = nil) : Int32
      w = {{ width }}
      one = 1_u{{ width }}
      zero = 0_u{{ width }}

      m = string1.size
      n = string2.size
      # Number of word-sized blocks needed to cover string1's m rows.
      rmax = (m / w).ceil.to_i
      # Per-column horizontal negative/positive deltas at the boundary row
      # between consecutive blocks.
      hna = BitArray.new(n)
      hpa = BitArray.new(n)

      # NOTE(review): default cutoff is n while the early-exit bound is
      # derived from m — confirm n (vs. m or max) is the intended default.
      cutoff = cutoff || n
      # assign here so compiler guarantees int as return
      score = m
      # Setup char->bit-vector dictionary
      {% if enc == "ascii" %}
        # ASCII code points fit 0..127, so an indexed StaticArray is
        # sufficient and avoids Hash lookups.
        pmr = StaticArray(UInt{{ width }}, 128).new(zero)
        s1 = string1.to_unsafe
        s2 = string2.to_unsafe
      {% else %}
        # Full Unicode alphabet: map code point -> bit-vector via a Hash
        # with a zero default (see PR discussion: a 1,114,112-slot array
        # was slower and far larger).
        pmr = Hash(Int32, UInt{{ width }}).new(w) { zero }
        reader = Char::Reader.new(string1)
        chars = string2.chars
      {% end %}

      rmax.times do |r|
        # Fresh vertical deltas for each block; all-ones vp encodes the
        # initial column of this block's sub-matrix.
        vp = UInt{{ width }}::MAX
        vn = zero

        # The last block may be only partially filled.
        last_r = (r == rmax-1)
        # Score at this block's bottom row (row m for the last block).
        score = last_r ? m : (r+1)*w
        # Bit index of this block's bottom row inside the word.
        hmax = last_r ? ((m-1) % w) : w-1
        lpos = one << hmax

        # populate dictionary
        start = r*w
        count = last_r && ((m % w) != 0) ? (m % w) : w
        count.times do |i|
          {% if enc == "ascii" %}
            pmr[s1[start+i]] |= one << i
          {% else %}
            # The reader advances monotonically across blocks, decoding
            # string1 exactly once overall.
            pmr[reader.current_char.ord] |= one << i
            reader.next_char
          {% end %}
        end

        n.times do |i|
          # Horizontal deltas carried in from the block above.
          hn0 = hna[i].to_unsafe
          hp0 = hpa[i].to_unsafe
          # find char in dictionary
          {% if enc == "ascii" %}
            pm = pmr[s2[i]] | hn0
          {% else %}
            pm = pmr[chars[i].ord] | hn0
          {% end %}
          # Core Myers recurrence: d0 = diagonal-zero vector, hp/hn =
          # horizontal deltas, vp/vn = vertical deltas.
          d0 = (((pm & vp) &+ vp) ^ vp) | pm | vn
          hp = vn | ~ (d0 | vp)
          hn = d0 & vp
          score += 1 if ((hp & lpos) != 0)
          score -= 1 if ((hn & lpos) != 0)
          hnx = (hn << 1) | hn0
          hpx = (hp << 1) | hp0
          # Horizontal arrays don't need to be saved on last run
          unless (last_r)
            hna[i] = (hn >> (w-1)) == 1
            hpa[i] = (hp >> (w-1)) == 1
          end
          # Only the very first block contains the DP boundary row, so the
          # boundary carry bit is injected only when r == 0.
          nc = (r == 0) ? one : zero
          vp = hnx | ~ (d0 | hpx | nc)
          vn = d0 & (hpx | nc)
        end
        if last_r
          return score
        elsif score-(m-((r+1)*w)) > cutoff
          # score minus the rows not yet processed is a lower bound on the
          # final distance; once it exceeds *cutoff*, return the bound.
          return score-(m-((r+1)*w))
        end
        # clear dictionary
        {% if enc == "ascii" %}
          pmr.fill(zero)
        {% else %}
          pmr.clear
        {% end %}
      end
      score
    end
  {% end %}
{% end %}
end