Skip to content

Commit

Permalink
use Crystal::Hasher and open addressing with quadratic probing in Str…
Browse files Browse the repository at this point in the history
…ingPool

Because StringPool is used in JSON decoding, it is important that it be safe.
  • Loading branch information
funny-falcon authored and akzhan committed Sep 20, 2017
1 parent 12ed6df commit bf016e7
Showing 1 changed file with 55 additions and 46 deletions.
101 changes: 55 additions & 46 deletions src/string_pool.cr
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@
# b.object_id # => 136294312
# ```
class StringPool
# The implementation is a hash table that uses open addressing with [quadratic probing](https://en.wikipedia.org/wiki/Quadratic_probing).
# Quadratic probing with triangular-number offsets avoids clustering while preserving
# cache locality in the common case.
# As long as the table size is a power of 2, the quadratic-probing method [described by "Triangular numbers mod 2^n"](https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/)
# will explore every table element if necessary, to find a good place to insert.

# Returns the size
#
# ```
Expand All @@ -26,7 +32,9 @@ class StringPool

# Creates a new empty string pool.
def initialize
# NOTE(review): this listing is a diff rendered without +/- markers; the
# `@buckets` line looks like the removed (pre-change) side — confirm against
# the actual file before relying on it.
@buckets = Array(Array(String)?).new(11, nil)
# Initial number of slots. It is a power of two, so `@capacity - 1` can be
# used as a bit mask when turning a hash into a slot index.
@capacity = 8
# One stored hash per slot, zero-filled; the probing loops treat a hash of 0
# as "slot is empty".
@hashes = Pointer(UInt64).malloc(@capacity, 0_u64)
# One stored string per slot, pre-filled with "" as a placeholder.
@values = Pointer(String).malloc(@capacity, "")
# Number of strings currently held by the pool.
@size = 0
end

Expand Down Expand Up @@ -70,26 +78,46 @@ class StringPool
# pool.size # => 1
# ```
def get(str : UInt8*, len)
rehash if @size > 5 * @buckets.size
hash = hash(str, len)
get(hash, str, len)
end

index = bucket_index str, len
bucket = @buckets[index]
private def get(hash : UInt64, str : UInt8*, len)
rehash if @size >= @capacity / 4 * 3

if bucket
entry = find_entry_in_bucket(bucket, str, len)
if entry
return entry
mask = (@capacity - 1).to_u64
index = hash & mask
next_probe_offset = 1_u64
while (h = @hashes[index]) != 0
if h == hash && @values[index].bytesize == len
if str.memcmp(@values[index].to_unsafe, len) == 0
return @values[index]
end
end
else
@buckets[index] = bucket = Array(String).new
index = (index + next_probe_offset) & mask
next_probe_offset += 1_u64
end

@size += 1
entry = String.new(str, len)
bucket.push entry
@hashes[index] = hash
@values[index] = entry
entry
end

# Stores an existing *entry* under its precomputed *hash* in the current
# tables, without any equality check and without touching `@size`.
#
# Starting from `hash & (capacity - 1)`, the slot advances by 1, 2, 3, ...
# (triangular-number offsets, wrapped by the mask) until a slot whose stored
# hash is 0 — i.e. an empty slot — is found.
private def put_on_rehash(hash : UInt64, entry : String)
  index_mask = (@capacity - 1).to_u64
  slot = hash & index_mask
  step = 1_u64

  # Skip occupied slots; each miss widens the next jump by one.
  until @hashes[slot] == 0
    slot = (slot + step) & index_mask
    step += 1_u64
  end

  @hashes[slot] = hash
  @values[slot] = entry
end

# Returns a `String` with the contents of the given `IO::Memory`.
#
# If a string with those contents was already present in the pool, that one is returned.
Expand Down Expand Up @@ -127,48 +155,29 @@ class StringPool
#
# Call this method if you modified a string submitted to the pool.
def rehash
new_size = calculate_new_size(@size)
old_buckets = @buckets
@buckets = Array(Array(String)?).new(new_size, nil)
@size = 0

old_buckets.each do |bucket|
bucket.try &.each do |entry|
get(entry.to_unsafe, entry.size)
end
if @capacity * 2 <= 0
raise "Hash table too big"
end
end

private def bucket_index(str, len)
hash = hash(str, len)
(hash % @buckets.size).to_i
end
old_capacity = @capacity
old_hashes = @hashes
old_values = @values

private def find_entry_in_bucket(bucket, str, len)
bucket.each do |entry|
if entry.size == len
if str.memcmp(entry.to_unsafe, len) == 0
return entry
end
@capacity *= 2
@hashes = Pointer(UInt64).malloc(@capacity, 0_u64)
@values = Pointer(String).malloc(@capacity, "")

old_capacity.times do |i|
if old_hashes[i] != 0
put_on_rehash(old_hashes[i], old_values[i])
end
end
nil
end

private def hash(str, len)
h = 0
str.to_slice(len).each do |c|
h = 31 * h + c
end
h
end

private def calculate_new_size(size)
new_size = 8
Hash::HASH_PRIMES.each do |hash_size|
return hash_size if new_size > size
new_size <<= 1
end
raise "Hash table too big"
hasher = Crystal::Hasher.new
hasher = str.to_slice(len).hash(hasher)
# The hash must be non-zero because 0 marks an empty slot, so set the high bit.
hasher.result | 0x8000000000000000_u64
end
end

0 comments on commit bf016e7

Please sign in to comment.