Implement an ordered hash dictionary

The new implementation preserves insertion order. It may use slightly more memory overall, but it is much faster to iterate because the indices/values are stored densely (performance comparable to `Vector`, i.e. limited by memory bandwidth). Insertion, deletion and resizing are also faster because `hash` needs to be called less often (hashes are recorded).
andyferris · Feb 24, 2020 · 0708ce4 · 0708ce4
1 parent f92b3d4
commit 0708ce4
Show file tree

Hide file tree

Showing 6 changed files with 771 additions and 398 deletions.
diff --git a/src/Dictionaries.jl b/src/Dictionaries.jl
@@ -6,7 +6,7 @@ using Indexing
 
 using Base: @propagate_inbounds, Callable
 
-export getindices, setindices!, mapview
+export getindices, setindices!
 
 export AbstractDictionary, AbstractIndices, IndexError, Indices, HashIndices, HashDictionary, Dictionary, MappedDictionary, DictionaryView, FilteredDictionary, FilteredIndices, BroadcastedDictionary
 
@@ -35,13 +35,15 @@ include("HashIndices.jl")
 include("HashDictionary.jl")
 include("MappedDictionary.jl")
 
+include("OldHashIndices.jl")
+include("OldHashDictionary.jl")
+
 end # module
 
 # # TODO
 #
-# * Improved printing - don't calculate length (beyond some cutoff) if it is `SizeUnknown` and limit=true.
+# * Improved printing - don't calculate length (beyond some cutoff) if it is `SizeUnknown` and limit=true, fix indentation problems for wider values
 # * `hash` and `isless`
-# * A manually-ordered dictionary would be quite useful, like [OrderedCollections.jl](https://github.com/JuliaCollections/OrderedCollections.jl).
 # * A surface interface for updates like https://github.com/JuliaLang/julia/pull/31367
 # * Soon we will have the concept of "ordered" indices/sets (sort-based dictionaries and
 #   B-trees). We can probably formalize an interface around a trait here. Certain operations

diff --git a/src/HashDictionary.jl b/src/HashDictionary.jl
@@ -1,102 +1,60 @@
-mutable struct HashDictionary{I,T} <: AbstractDictionary{I, T}
+struct HashDictionary{I, T} <: AbstractDictionary{I, T}
     indices::HashIndices{I}
     values::Vector{T}
 
-    HashDictionary{I, T}(indices::HashIndices{I}, values::Vector{T}, ::Nothing) where {I, T} = new(indices, values)
+    function HashDictionary{I, T}(inds::HashIndices{I}, values::Vector{T}) where {I, T}
+        return new(inds, values)
+    end
 end
 
-"""
-    HashDictionary{I, T}()
+HashDictionary(; sizehint = 8) = HashDictionary{Any, Any}(; sizehint = sizehint)
+HashDictionary{I}(; sizehint = 8) where {I} = HashDictionary{I, Any}(; sizehint = sizehint)
 
-Construct an empty `HashDictionary` with index type `I` and element type `T`. This type of
-dictionary uses hashes for fast lookup and insertion, and is both mutable and insertable.
-(See `issettable` and `isinsertable`).
-"""
-function HashDictionary{I, T}(; sizehint::Int = 16) where {I, T}
-    indices = HashIndices{I}(; sizehint=sizehint)
-    HashDictionary{I, T}(indices, Vector{T}(undef, length(indices.slots)), nothing)
+function HashDictionary{I, T}(; sizehint = 8) where {I, T}
+    HashDictionary{I, T}(HashIndices{I}(; sizehint = sizehint), Vector{T}())
 end
-HashDictionary{I}() where {I} = HashDictionary{I, Any}()
-HashDictionary() = HashDictionary{Any}()
 
-"""
-    HashDictionary{I, T}(indices, undef::UndefInitializer)
-
-Construct a `HashDictionary` with index type `I` and element type `T`. The container is
-initialized with `keys` that match the values of `indices`, but the values are unintialized.
-"""
-function HashDictionary{I, T}(indices, ::UndefInitializer) where {I, T} 
-    return HashDictionary{I, T}(HashIndices{I}(indices), undef)
+function HashDictionary(inds, values)
+    return HashDictionary(HashIndices(inds), values)
 end
 
-function HashDictionary{I, T}(h::HashIndices{I}, ::UndefInitializer) where {I, T}
-    return HashDictionary{I, T}(h, Vector{T}(undef, length(h.slots)), nothing)
+function HashDictionary(inds::HashIndices{I}, values) where {I}
+    return HashDictionary{I}(inds, values)
 end
 
-function HashDictionary{I, T}(indices::HashIndices{I}, values) where {I, T}
-    vals = Vector{T}(undef, length(indices.slots))
-    d = HashDictionary{I, T}(indices, vals, nothing)
-
-    @inbounds for (i, v) in zip(tokens(indices), values)
-        vals[i] = v
-    end
-
-    return d
+function HashDictionary{I}(inds, values) where {I}
+    return HashDictionary{I}(HashIndices{I}(inds), values)
 end
 
-"""
-    HashDictionary(indices, values)
-    HashDictionary{I}(indices, values)
-    HashDictionary{I, T}(indices, values)
-
-Construct a `HashDictionary` with indices from `indices` and values from `values`, matched
-in iteration order.
-"""
-function HashDictionary{I, T}(indices, values) where {I, T}
-    iter_size = Base.IteratorSize(indices)
-    if iter_size isa Union{Base.HasLength, Base.HasShape}
-        d = HashDictionary{I, T}(; sizehint = length(indices)*2)
-    else
-        d = HashDictionary{I, T}()
-    end
-
-    for (i, v) in zip(indices, values)
-        insert!(d, i, v)
-    end
-
-    return d
-end
-function HashDictionary{I}(indices, values) where {I}
+function HashDictionary{I}(inds::HashIndices{I}, values) where {I}
     if Base.IteratorEltype(values) === Base.EltypeUnknown()
-        # TODO: implement automatic widening from iterators of Base.EltypeUnkown
         values = collect(values)
     end
+
+    return HashDictionary{I, eltype(values)}(inds, values)
+end
 
-    return HashDictionary{I, eltype(values)}(indices, values)
+function HashDictionary{I, T}(inds, values) where {I, T}
+    return HashDictionary{I, T}(HashIndices{I}(inds), values)
 end
 
-function HashDictionary(indices, values)
-    if Base.IteratorEltype(indices) === Base.EltypeUnknown()
-        # TODO: implement automatic widening from iterators of Base.EltypeUnkown
-        indices = collect(indices)
+function HashDictionary{I, T}(inds::HashIndices{I}, values) where {I, T}
+    iter_size = Base.IteratorSize(values)
+    if iter_size isa Union{Base.HasLength, Base.HasShape}
+        vs = Vector{T}(undef, length(values))
+        @inbounds for (i, v) in enumerate(values)
+            vs[i] = v
+        end
+        return HashDictionary{I, T}(inds, vs)
+    else
+        vs = Vector{T}()
+        for v in values
+            push!(vs, v)
+        end
+        return HashDictionary{I, T}(inds, vs)
     end
-
-    return HashDictionary{eltype(indices)}(indices, values)
 end
 
-"""
-    HashDictionary(dict::AbstractDictionary)
-    HashDictionary{I}(dict::AbstractDictionary)
-    HashDictionary{I, T}(dict::AbstractDictionary)
-
-Construct a copy of `dict` with the same keys and values.
-
-(For copying an `AbstractDict` or other iterable of `Pair`s, see `dictionary`).
-"""
-HashDictionary(dict::AbstractDictionary) = HashDictionary(keys(dict), dict)
-HashDictionary{I}(dict::AbstractDictionary) where {I} = HashDictionary{I}(keys(dict), dict)
-HashDictionary{I, T}(dict::AbstractDictionary) where {I, T} = HashDictionary{I, T}(keys(dict), dict)
-
 """
     dictionary(iter)
 
@@ -116,7 +74,7 @@ dictionary(p1::Pair, p2::Pair...) = dictionary((p1, p2...))
 function _dictionary(::Type{Pair{I, T}}, iter) where {I, T}
     iter_size = Base.IteratorSize(iter)
     if iter_size isa Union{Base.HasLength, Base.HasShape}
-        d = HashDictionary{I, T}(; sizehint = length(iter)*2)
+        d = HashDictionary{I, T}(; sizehint = length(iter)*3 >>> 0x01)
     else
         d = HashDictionary{I, T}()
     end
@@ -128,85 +86,61 @@ function _dictionary(::Type{Pair{I, T}}, iter) where {I, T}
     return d
 end
 
-## Implementation
+# indices
 
-Base.keys(d::HashDictionary) = d.indices
-isinsertable(d::HashDictionary) = true
-issettable(d::HashDictionary) = true
+Base.keys(dict::HashDictionary) = dict.indices
 
-@propagate_inbounds function gettoken(d::HashDictionary{I}, i::I) where {I}
-    return gettoken(keys(d), i)
-end
+# tokens
 
-@inline function gettokenvalue(d::HashDictionary, token)
-    return @inbounds d.values[token]
-end
+tokenized(dict::HashDictionary) = dict.values
 
-function istokenassigned(d::HashDictionary, token)
-    return isassigned(d.values, token)
-end
+# values
 
-@inline function settokenvalue!(d::HashDictionary{I, T}, token, value::T) where {I, T}
-    @inbounds d.values[token] = value
-    return d
+function istokenassigned(dict::HashDictionary, (_slot, index))
+    return isassigned(dict.values, index)
 end
 
-function gettoken!(d::HashDictionary{T}, key::T) where {T}
-    indices = keys(d)
-    (token, values) = _gettoken!(indices, d.values, key)
-    if token < 0
-        (token, values) = _insert!(indices, values, key, -token)
-        d.values = values
-        return (false, token)
-    else
-        d.values = values
-        return (true, token)
-    end 
+@propagate_inbounds function gettokenvalue(dict::HashDictionary, (_slot, index))
+    return dict.values[index]
 end
 
-function Base.copy(d::HashDictionary{I, T}) where {I, T}
-    return HashDictionary{I, T}(d.indices, copy(d.values), nothing)
+issettable(::HashDictionary) = true
+
+@propagate_inbounds function settokenvalue!(dict::HashDictionary{<:Any, T}, (_slot, index), value::T) where {T}
+    dict.values[index] = value
+    return dict
 end
 
-tokenized(d::HashDictionary) = d.values
+# insertion
 
-function Base.empty!(d::HashDictionary)
-    empty!(d.indices)
-    empty!(d.values)
-    resize!(d.values, length(keys(d).slots))
-    return d
-end
+isinsertable(::HashDictionary) = true
 
-function deletetoken!(d::HashDictionary{I, T}, token) where {I, T}
-    deletetoken!(keys(d), token)
-    isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), d.values, token-1)
-    return d
+function gettoken!(dict::HashDictionary{I}, i::I) where {I}
+    (hadtoken, (slot, index)) = gettoken!(keys(dict), i)
+    if !hadtoken
+        resize!(dict.values, length(dict.values) + 1)
+    end
+    return (hadtoken, (slot, index))
 end
 
-function Base.sizehint!(d::HashDictionary, sz::Int)
-    d.values = _sizehint!(d.indices, d.values, sz)
-    return d
+function deletetoken!(dict::HashDictionary{I, T}, (slot, index)) where {I, T}
+    deletetoken!(dict.indices, (slot, index))
+    isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), dict.values, index-1)
+    return dict
 end
 
-function Base.rehash!(d::HashDictionary, newsz::Int = length(d.inds))
-    _rehash!(d.indices, d.values, newsz)
-    return d
-end
 
-Base.filter!(pred, d::HashDictionary) = Base.unsafe_filter!(pred, d)
+function Base.empty!(dict::HashDictionary{I, T}) where {I, T}
+    empty!(dict.values)
+    empty!(dict.indices)
 
-# `HashDictionary` is the default mutable AbstractDictionary
-# If given some other index type, create a new `HashIndices` copy of the indices
-function Base.similar(indices::AbstractIndices{I}, ::Type{T}) where {I, T}
-    return similar(HashIndices{I}(indices), T)
+    return dict
 end
 
-# For `HashIndices` we don't copy the indices, we allow the `keys` to remain identical (`===`)
-function Base.similar(indices::HashIndices{I}, ::Type{T}) where {I, T}
-    return HashDictionary{I, T}(indices, undef)
-end
+# Factories
+
+Base.empty(::AbstractIndices, ::Type{I}, ::Type{T}) where {I, T} = HashDictionary{I, T}()
 
-# `HashDictionary` is the default insertable AbstractDictionary
-function Base.empty(::AbstractDictionary, ::Type{I}, ::Type{T}) where {I, T}
-    return HashDictionary{I, T}()
+function Base.similar(indices::AbstractIndices{I}, ::Type{T}) where {I, T}
+    return HashDictionary(indices, Vector{T}(undef, length(indices)))
 end