#= Reconstructed from a whitespace-mangled git patch (commit a5d049410d42b70c371a146
   29aace4dd9fa37967, Andy Ferris, 2020-02-24, "Implement an ordered hash dictionary").
   Commit message: the new implementation preserves insertion order; it may use
   slightly more memory overall but is much faster to iterate because the
   indices/values are stored densely (performance comparable to `Vector`). This span
   is the new file src/DenseHashDictionary.jl. =#

export DenseHashIndices, DenseHashDictionary

# `true` when `hash` is injective for the key type, so equal hashes alone prove equal
# keys and the `isequal` check can be skipped on lookup.
perfect_hash(::Any) = false
perfect_hash(::Union{Bool, UInt8, UInt16, UInt32, UInt, Int8, Int16, Int32, Int, Char}) = true

"""
    DenseHashIndices{I}

An insertion-ordered set of indices of type `I`. The hash table (`slots`) stores
positions into the densely packed `hashes`/`values` vectors, so iteration runs over
contiguous memory (comparable to `Vector`).
"""
struct DenseHashIndices{I} <: AbstractIndices{I}
    # The hash table: 0 means empty, n > 0 is a position in `hashes`/`values`.
    slots::Vector{Int}

    # Densely ordered hashes and values, kept in insertion order.
    hashes::Vector{UInt}
    values::Vector{I}
end

function DenseHashIndices{I}(; sizehint = 8) where {I}
    sizehint > 0 || throw(ArgumentError("sizehint must be positive, got $sizehint"))
    # FIX: the probe loops mask with `length(slots) - 1`, so the slot table length must
    # be a power of two; the original used `sizehint` directly, which breaks for
    # non-power-of-two hints.
    sz = Base._tablesz(sizehint)
    return DenseHashIndices{I}(fill(0, sz), Vector{UInt}(), Vector{I}())
end

# Rebuild `slots` from `hashes` for a table of `newsize` slots.
# `newsize` must be a power of two and large enough that the table is not full.
function rehash!(indices::DenseHashIndices{I}, newsize::Integer) where {I}
    slots = resize!(indices.slots, newsize)
    fill!(slots, 0)
    bit_mask = newsize - one(typeof(newsize)) # newsize is a power of two

    for (index, full_hash) in enumerate(indices.hashes)
        trial_slot = reinterpret(Int, full_hash) & bit_mask
        @inbounds while true
            trial_slot = trial_slot + 1
            if slots[trial_slot] == 0
                slots[trial_slot] = index
                break
            else
                trial_slot = trial_slot & bit_mask
            end
            # This is potentially an infinite loop; the caller must not overfill
            # the table.
        end
    end

    return indices
end

Base.length(indices::DenseHashIndices) = length(indices.values)

# Token interface. A token is a `(slot, index)` pair; `slot` is a position in the
# hash table and `index` a position in the dense storage.
istokenizable(::DenseHashIndices) = true

tokentype(::DenseHashIndices) = Int

# During iteration the slot part of the token is 0 — deletion rebuilds the table
# (see `deletetoken!`), so the slot is not required there.
@propagate_inbounds function iteratetoken(indices::DenseHashIndices)
    if isempty(indices.values)
        return nothing
    end
    return ((0, 1), 1)
end

@propagate_inbounds function iteratetoken(indices::DenseHashIndices, index::Int)
    if index == length(indices.values)
        return nothing
    end
    index = index + 1
    return ((0, index), index)
end

function gettoken(indices::DenseHashIndices{I}, i::I) where {I}
    full_hash = hash(i)
    n_slots = length(indices.slots)
    bit_mask = n_slots - 1 # n_slots is always a power of two

    trial_slot = reinterpret(Int, full_hash) & bit_mask
    @inbounds while true
        trial_slot = trial_slot + 1
        trial_index = indices.slots[trial_slot]
        if trial_index == 0
            return (false, (0, 0))
        end

        if full_hash == indices.hashes[trial_index] && (perfect_hash(i) || isequal(i, indices.values[trial_index]))
            return (true, (trial_slot, trial_index))
        end

        trial_slot = trial_slot & bit_mask
        # This is potentially an infinite loop; insertion keeps the table under
        # 2/3rds full so an empty slot is always reachable.
    end
end

@propagate_inbounds function gettokenvalue(indices::DenseHashIndices, (_slot, index))
    return indices.values[index]
end

# Insertion interface
isinsertable(::DenseHashIndices) = true

function gettoken!(indices::DenseHashIndices{I}, i::I) where {I}
    full_hash = hash(i)
    n_slots = length(indices.slots)
    bit_mask = n_slots - 1 # n_slots is always a power of two
    n_values = length(indices.values)

    trial_slot = reinterpret(Int, full_hash) & bit_mask
    @inbounds while true
        trial_slot = trial_slot + 1
        trial_index = indices.slots[trial_slot]
        if trial_index == 0
            indices.slots[trial_slot] = n_values + 1
            break
        end

        trial_hash = indices.hashes[trial_index]

        if trial_hash == full_hash && (perfect_hash(i) || isequal(i, indices.values[trial_index]))
            return (true, (trial_slot, trial_index))
        end

        trial_slot = trial_slot & bit_mask
        # This is potentially an infinite loop; the resize below keeps the table
        # under 2/3rds full so an empty slot is always reachable.
    end

    push!(indices.hashes, full_hash)
    push!(indices.values, i)

    # Expand the hash map when it reaches 2/3rds full
    if 3 * (n_values + 1) > 2 * n_slots
        # Grow faster for small hash maps than for large ones
        newsize = n_slots > 16000 ? 2 * n_slots : 4 * n_slots
        rehash!(indices, newsize)

        # FIX: rehashing moves the slot, so re-locate it — the original returned the
        # stale pre-rehash slot, producing a token that was invalid for deletion.
        bit_mask = newsize - 1
        trial_slot = reinterpret(Int, full_hash) & bit_mask
        @inbounds while true
            trial_slot = trial_slot + 1
            if indices.slots[trial_slot] == n_values + 1
                break
            end
            trial_slot = trial_slot & bit_mask
        end
    end

    return (false, (trial_slot, n_values + 1))
end

@propagate_inbounds function deletetoken!(indices::DenseHashIndices, (slot, index))
    splice!(indices.hashes, index)
    splice!(indices.values, index)

    # FIX: `splice!` shifts every later dense entry down by one, so *every* slot
    # holding an index greater than `index` is now stale. The original only marked a
    # single slot as deleted (leaving the table corrupt, and the `-1` tombstone made
    # `gettoken` read `hashes[-1]`). Rebuild the table instead, shrinking it when it
    # is less than 1/4 full.
    n_slots = length(indices.slots)
    n_values = length(indices.values)
    if 4 * n_values < n_slots && n_slots > 8
        rehash!(indices, n_slots >> 0x01) # halve each time
    else
        rehash!(indices, n_slots)
    end

    return indices
end

# Factories

Base.empty(::DenseHashIndices, ::Type{I}) where {I} = DenseHashIndices{I}()

########

"""
    DenseHashDictionary{I, T}

An insertion-ordered hash dictionary: keys live in a `DenseHashIndices{I}` and the
values are stored densely, in parallel with the keys.
"""
struct DenseHashDictionary{I, T} <: AbstractDictionary{I, T}
    indices::DenseHashIndices{I}
    values::Vector{T}
end

function DenseHashDictionary{I, T}(; sizehint = 8) where {I, T}
    return DenseHashDictionary{I, T}(DenseHashIndices{I}(; sizehint = sizehint), Vector{T}())
end

# indices

Base.keys(dict::DenseHashDictionary) = dict.indices

# tokens

tokenized(dict::DenseHashDictionary) = dict.values

# values

function istokenassigned(dict::DenseHashDictionary, (_slot, index))
    return isassigned(dict.values, index)
end

@propagate_inbounds function gettokenvalue(dict::DenseHashDictionary, (_slot, index))
    return dict.values[index]
end

issettable(::DenseHashDictionary) = true

@propagate_inbounds function settokenvalue!(dict::DenseHashDictionary{<:Any, T}, (_slot, index), value::T) where {T}
    dict.values[index] = value
    return dict
end

# insertion

function gettoken!(dict::DenseHashDictionary{I}, i::I) where {I}
    (hadtoken, (slot, index)) = gettoken!(keys(dict), i)
    if !hadtoken
        # Reserve a (possibly undefined) value slot; the caller fills it via
        # `settokenvalue!`.
        resize!(dict.values, length(dict.values) + 1)
    end
    return (hadtoken, (slot, index))
end

function deletetoken!(dict::DenseHashDictionary, (slot, index))
    deletetoken!(dict.indices, (slot, index))
    splice!(dict.values, index)
    return dict
end

# Factories

Base.empty(::DenseHashIndices, ::Type{I}, ::Type{T}) where {I, T} = DenseHashDictionary{I, T}()

function Base.similar(indices::DenseHashIndices{I}, ::Type{T}) where {I, T}
    return DenseHashDictionary(indices, Vector{T}(undef, length(indices)))
end

#= Remainder of this span (the patch continues on the next source line): the unified
   diff of src/Dictionaries.jl — it removes `mapview` from the exports, adds
   `include("OldHashIndices.jl")` and `include("OldHashDictionary.jl")` after the
   existing includes, and begins rewriting the trailing TODO comments. Preserved here
   as a note because the flattened patch cuts that hunk mid-comment. =#
#= Continuation of the src/Dictionaries.jl hunk from the previous span: the updated
   trailing TODO notes (improved printing — don't compute length beyond a cutoff when
   `SizeUnknown` and limit=true, fix indentation for wide values; `hash`/`isless`;
   have `delete!` return the next element and `deletetoken!` the next token, plus
   first/last/next/prev index and token queries; an update surface like
   JuliaLang/julia#31367; a trait for "ordered" indices). Below, the post-patch
   src/HashDictionary.jl is reconstructed from the flattened diff; code between hunks
   is not visible in the patch and is marked as omitted. =#

"""
    HashDictionary{I, T}

A dictionary mapping indices of type `I` to values of type `T`, backed by a
`HashIndices{I}` for lookup with values stored densely and in parallel. It is both
settable and insertable (see `issettable`, `isinsertable`).
"""
struct HashDictionary{I, T} <: AbstractDictionary{I, T}
    indices::HashIndices{I}
    values::Vector{T}

    function HashDictionary{I, T}(inds::HashIndices{I}, values::Vector{T}) where {I, T}
        return new(inds, values)
    end
end

HashDictionary(; sizehint = 8) = HashDictionary{Any, Any}(; sizehint = sizehint)
HashDictionary{I}(; sizehint = 8) where {I} = HashDictionary{I, Any}(; sizehint = sizehint)

function HashDictionary{I, T}(; sizehint = 8) where {I, T}
    return HashDictionary{I, T}(HashIndices{I}(; sizehint = sizehint), Vector{T}())
end

# Convert `inds` to `HashIndices`, then dispatch to the typed constructors below.
function HashDictionary(inds, values)
    return HashDictionary(HashIndices(inds), values)
end

function HashDictionary(inds::HashIndices{I}, values) where {I}
    return HashDictionary{I}(inds, values)
end

function HashDictionary{I}(inds, values) where {I}
    return HashDictionary{I}(HashIndices{I}(inds), values)
end

function HashDictionary{I}(inds::HashIndices{I}, values) where {I}
    if Base.IteratorEltype(values) === Base.EltypeUnknown()
        # TODO: implement automatic widening from iterators of Base.EltypeUnknown
        values = collect(values)
    end

    return HashDictionary{I, eltype(values)}(inds, values)
end

function HashDictionary{I, T}(inds, values) where {I, T}
    return HashDictionary{I, T}(HashIndices{I}(inds), values)
end

function HashDictionary{I, T}(inds::HashIndices{I}, values) where {I, T}
    iter_size = Base.IteratorSize(values)
    if iter_size isa Union{Base.HasLength, Base.HasShape}
        # Known length: fill a pre-sized vector.
        vs = Vector{T}(undef, length(values))
        @inbounds for (i, v) in enumerate(values)
            vs[i] = v
        end
        return HashDictionary{I, T}(inds, vs)
    else
        vs = Vector{T}()
        for v in values
            push!(vs, v)
        end
        return HashDictionary{I, T}(inds, vs)
    end
end

# … (unchanged `dictionary(iter)` docstring and methods between hunks are omitted in
# the patch) …

dictionary(p1::Pair, p2::Pair...) = dictionary((p1, p2...))

# Build the key and value vectors in one pass, then hand them to the bulk
# `HashDictionary` constructor (avoids per-element `insert!`).
function _dictionary(::Type{Pair{I, T}}, iter) where {I, T}
    iter_size = Base.IteratorSize(iter)
    if iter_size isa Union{Base.HasLength, Base.HasShape}
        n = length(iter)
        inds = Vector{I}(undef, n)
        vals = Vector{T}(undef, n)
        j = 1
        @inbounds for (i, v) in iter
            inds[j] = i
            vals[j] = v
            j += 1
        end
        return HashDictionary{I, T}(inds, vals)
    else
        inds = Vector{I}()
        vals = Vector{T}()
        @inbounds for (i, v) in iter
            push!(inds, i)
            push!(vals, v)
        end
        return HashDictionary{I, T}(inds, vals)
    end
end

# indices

Base.keys(dict::HashDictionary) = dict.indices

# tokens

tokenized(dict::HashDictionary) = dict.values

# values

function istokenassigned(dict::HashDictionary, (_slot, index))
    return isassigned(dict.values, index)
end

@propagate_inbounds function gettokenvalue(dict::HashDictionary, (_slot, index))
    return dict.values[index]
end

issettable(::HashDictionary) = true

@propagate_inbounds function settokenvalue!(dict::HashDictionary{<:Any, T}, (_slot, index), value::T) where {T}
    dict.values[index] = value
    return dict
end

# insertion

isinsertable(::HashDictionary) = true

function gettoken!(dict::HashDictionary{I}, i::I) where {I}
    # Pass the value vector through so the indices can keep it in sync (resize on
    # insertion, compact on rehash).
    (hadtoken, (slot, index)) = gettoken!(keys(dict), i, (dict.values,))
    return (hadtoken, (slot, index))
end

function deletetoken!(dict::HashDictionary{I, T}, (slot, index)) where {I, T}
    # Release the reference before the indices (possibly) rehash/compact.
    isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), dict.values, index-1)
    deletetoken!(dict.indices, (slot, index), (dict.values,))
    return dict
end

function Base.empty!(dict::HashDictionary{I, T}) where {I, T}
    empty!(dict.values)
    empty!(dict.indices)

    return dict
end

function Base.filter!(pred, dict::HashDictionary)
    indices = keys(dict)
    _filter!(i -> pred(@inbounds dict.values[i]), keys(indices.values), indices.values, indices.hashes, (dict.values,))
    indices.deleted = 0
    newsize = Base._tablesz(3*length(indices.values) >> 0x01)
    rehash!(indices, newsize, (dict.values,))
    return dict
end

function Base.filter!(pred, dict::PairDictionary{<:Any, <:Any, <:HashDictionary})
    d = dict.d
    indices = keys(d)
    _filter!(i -> pred(@inbounds indices.values[i] => d.values[i]), keys(indices.values), indices.values, indices.hashes, (d.values,))
    indices.deleted = 0
    newsize = Base._tablesz(3*length(indices.values) >> 0x01)
    rehash!(indices, newsize, (d.values,))
    return dict
end

# Factories

# `HashDictionary` is the default insertable dictionary for any indices.
Base.empty(::AbstractIndices, ::Type{I}, ::Type{T}) where {I, T} = HashDictionary{I, T}()

function Base.similar(indices::AbstractIndices{I}, ::Type{T}) where {I, T}
    return HashDictionary(indices, Vector{T}(undef, length(indices)))
end
#= Post-patch src/HashIndices.jl, reconstructed from the flattened diff. Hashes are
   stored alongside the values; the high bit of a stored hash is a deletion marker,
   and `slots` holds 0 (empty), a positive dense index, or a negative index
   (tombstone for a deleted entry). Code between hunks is marked as omitted. =#

const hash_mask = typemax(UInt) >>> 0x01
const deletion_mask = hash_mask + 0x01 # high bit: entry deleted (value possibly undefined)

mutable struct HashIndices{I} <: AbstractIndices{I}
    # The hash table: 0 = empty, n > 0 = dense index, -n = tombstone for deleted n.
    slots::Vector{Int}

    # Hashes and values
    hashes::Vector{UInt} # Deletion marker stored in high bit
    values::Vector{I}

    # Number of deleted (but not yet compacted) entries in `hashes`/`values`.
    deleted::Int
end

HashIndices(; sizehint = 8) = HashIndices{Any}(; sizehint = sizehint)

function HashIndices{I}(; sizehint = 8) where {I}
    # FIX: the original computed `newsize` but then allocated `fill(0, sizehint)`;
    # the slot table length must be the power-of-two `newsize` for the probe mask.
    newsize = Base._tablesz((3 * sizehint) >> 0x01)
    return HashIndices{I}(fill(0, newsize), Vector{UInt}(), Vector{I}(), 0)
end

"""
    HashIndices(iter)

Construct a `HashIndices` from an arbitrary iterable of indices.
"""
function HashIndices(iter)
    if Base.IteratorEltype(iter) === Base.EltypeUnknown()
        # TODO: implement automatic widening from iterators of Base.EltypeUnknown
        iter = collect(iter)
    end

    return HashIndices{eltype(iter)}(iter)
end

function HashIndices{I}(iter) where {I}
    iter_size = Base.IteratorSize(iter)
    if iter_size isa Union{Base.HasLength, Base.HasShape}
        values = Vector{I}(undef, length(iter))
        @inbounds for (i, value) in enumerate(iter)
            values[i] = value
        end
        return HashIndices{I}(values)
    else
        h = HashIndices{I}()
        for i in iter
            insert!(h, i)
        end
        return h
    end
end

# NOTE(review): assumes `values` contains no duplicates — duplicate inputs would get
# multiple slots; confirm against callers.
function HashIndices{I}(values::Vector{I}) where {I}
    hashes = map(v -> hash(v) & hash_mask, values)
    slots = Vector{Int}()
    out = HashIndices{I}(slots, hashes, values, 0)
    newsize = Base._tablesz(3*length(values) >> 0x01)
    rehash!(out, newsize)
    return out
end

function Base.copy(indices::HashIndices{I}) where {I}
    if indices.deleted == 0
        return HashIndices{I}(copy(indices.slots), copy(indices.hashes), copy(indices.values), 0)
    else
        # Copy the dense storage and rebuild (and compact) the slot table.
        out = HashIndices{I}(Vector{Int}(), copy(indices.hashes), copy(indices.values), indices.deleted)
        newsize = Base._tablesz(3*length(indices) >> 0x01)
        rehash!(out, newsize)
        return out # FIX: explicit return (previously relied on rehash!'s return value)
    end
end

# private (note that newsize must be a power of two)
# Rebuild the slot table; when there are deletions, also compact `hashes`/`values`
# (and any parallel `values` vectors, e.g. a dictionary's value storage).
# `include_last_values = false` skips copying the final (just-reserved, possibly
# undefined) element of the parallel vectors.
function rehash!(indices::HashIndices{I}, newsize::Int, values = (), include_last_values::Bool = true) where {I}
    slots = resize!(indices.slots, newsize)
    fill!(slots, 0)
    bit_mask = newsize - 1 # newsize is a power of two

    if indices.deleted == 0
        for (index, full_hash) in enumerate(indices.hashes)
            trial_slot = reinterpret(Int, full_hash) & bit_mask
            @inbounds while true
                trial_slot = trial_slot + 1
                if slots[trial_slot] == 0
                    slots[trial_slot] = index
                    break
                else
                    trial_slot = trial_slot & bit_mask
                end
                # This is potentially an infinite loop and care must be taken by the
                # caller not to overfill the container
            end
        end
    else
        # Reassigning to to_index/from_index gives the closure capture boxing issue,
        # so mutate a Ref instead
        to_index = Ref(1)
        from_index = Ref(1)
        n_values = length(indices.values)
        @inbounds while from_index[] <= n_values
            full_hash = indices.hashes[from_index[]]
            if full_hash & deletion_mask === zero(UInt) # live entry
                trial_slot = reinterpret(Int, full_hash) & bit_mask
                @inbounds while true
                    trial_slot = trial_slot + 1
                    if slots[trial_slot] == 0
                        slots[trial_slot] = to_index[]
                        indices.hashes[to_index[]] = indices.hashes[from_index[]]
                        indices.values[to_index[]] = indices.values[from_index[]]
                        if include_last_values || from_index[] < n_values
                            map(values) do (vals)
                                @inbounds vals[to_index[]] = vals[from_index[]]
                            end
                        end
                        to_index[] += 1
                        break
                    else
                        trial_slot = trial_slot & bit_mask
                    end
                end
            end

            from_index[] += 1
        end

        new_size = length(indices.values) - indices.deleted
        resize!(indices.values, new_size)
        resize!(indices.hashes, new_size)
        map(values) do (vals)
            resize!(vals, new_size)
        end
        indices.deleted = 0
    end

    return indices
end

Base.length(indices::HashIndices) = length(indices.values) - indices.deleted

# Token interface
istokenizable(::HashIndices) = true

tokentype(::HashIndices) = Int

# During iteration the token (slot part 0) cannot be used for deletion — we do not
# track the slots.
@propagate_inbounds function iteratetoken(indices::HashIndices)
    if indices.deleted == 0
        return length(indices) > 0 ? ((0, 1), 1) : nothing
    end
    index = 1
    @inbounds while index <= length(indices.hashes)
        if indices.hashes[index] & deletion_mask === zero(UInt)
            return ((0, index), index)
        end
        index += 1
    end
    return nothing
end

@propagate_inbounds function iteratetoken(indices::HashIndices, index::Int)
    index += 1
    if indices.deleted == 0 # apparently this is enough to make it iterate as fast as `Vector`
        return index <= length(indices.values) ? ((0, index), index) : nothing
    end
    @inbounds while index <= length(indices.hashes)
        if indices.hashes[index] & deletion_mask === zero(UInt)
            return ((0, index), index)
        end
        index += 1
    end
    return nothing
end

function gettoken(indices::HashIndices{I}, i::I) where {I}
    full_hash = hash(i) & hash_mask
    n_slots = length(indices.slots)
    bit_mask = n_slots - 1 # n_slots is always a power of two

    trial_slot = reinterpret(Int, full_hash) & bit_mask
    @inbounds while true
        trial_slot = trial_slot + 1
        trial_index = indices.slots[trial_slot]
        if trial_index == 0
            return (false, (0, 0))
        end

        # FIX: skip tombstones (negative indices) — the original indexed
        # `hashes[trial_index]` with a negative index. Note: the deletion bit in the
        # stored hash also ensures a deleted (potentially undefined) value can never
        # compare equal.
        if trial_index > 0 && full_hash === indices.hashes[trial_index] && isequal(i, indices.values[trial_index])
            return (true, (trial_slot, trial_index))
        end

        trial_slot = trial_slot & bit_mask
        # This is potentially an infinite loop; insertion keeps the table under
        # 2/3rds full so an empty slot is always reachable.
    end
end

@propagate_inbounds function gettokenvalue(indices::HashIndices, (_slot, index))
    return indices.values[index]
end

# Insertion interface
isinsertable(::HashIndices) = true

function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I}
    full_hash = hash(i) & hash_mask
    n_slots = length(indices.slots)
    bit_mask = n_slots - 1 # n_slots is always a power of two
    n_values = length(indices.values)
    new_index = n_values + 1

    trial_slot = reinterpret(Int, full_hash) & bit_mask
    deleted_slot = 0
    @inbounds while true
        trial_slot = trial_slot + 1
        trial_index = indices.slots[trial_slot]
        if trial_index == 0
            break
        elseif trial_index < 0
            # Tombstone: remember the first one for reuse, but keep probing in case
            # `i` exists later in the chain.
            # FIX: the original fell through and read `hashes[trial_index]` with a
            # negative index.
            if deleted_slot == 0
                deleted_slot = trial_slot
            end
        else
            trial_hash = indices.hashes[trial_index]
            if trial_hash === full_hash && isequal(i, indices.values[trial_index])
                return (true, (trial_slot, trial_index))
            end
        end

        trial_slot = trial_slot & bit_mask
        # This is potentially an infinite loop; the resize below keeps the table
        # under 2/3rds full.
    end

    # Prefer reusing the earliest tombstone on the probe chain.
    # FIX: the original wrote to `trial_slot` in both branches (despite the comment)
    # and decremented `deleted`, even though the dead entry remains in
    # `hashes`/`values` until the next rehash — which corrupted `length`.
    used_slot = deleted_slot == 0 ? trial_slot : deleted_slot
    indices.slots[used_slot] = new_index

    push!(indices.hashes, full_hash)
    push!(indices.values, i)
    map(values) do (vals)
        resize!(vals, length(vals) + 1)
    end

    # Expand the hash map when it reaches 2/3rds full
    if 3 * new_index > 2 * n_slots
        # Grow faster for small hash maps than for large ones
        newsize = n_slots > 16000 ? 2 * n_slots : 4 * n_slots
        rehash!(indices, newsize, values, false)

        # FIX: the rehash may also have compacted deleted entries, moving the new
        # element; take its new index *before* searching for its slot (the original
        # searched for the stale index, which could loop forever).
        new_index = length(indices.values)
        bit_mask = newsize - 1
        used_slot = reinterpret(Int, full_hash) & bit_mask
        @inbounds while true
            used_slot = used_slot + 1
            if indices.slots[used_slot] == new_index
                break
            end
            used_slot = used_slot & bit_mask
        end
    end

    return (false, (used_slot, new_index))
end

@propagate_inbounds function deletetoken!(indices::HashIndices{I}, (slot, index), values = ()) where {I}
    @boundscheck if slot == 0
        error("Cannot use iteration token for deletion")
    end
    indices.slots[slot] = -index       # tombstone
    indices.hashes[index] = deletion_mask
    isbitstype(I) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), indices.values, index-1)
    indices.deleted += 1

    # Recreate the hash map when 1/3rd of the values are deletions
    n_slots = length(indices.slots)
    n_values = length(indices.values) - indices.deleted
    if 3 * indices.deleted > n_values
        # Halve if necessary
        halve = 4 * n_values < n_slots && n_slots > 8
        rehash!(indices, halve ? n_slots >> 0x01 : n_slots, values)
    end
    return indices
end

function Base.empty!(indices::HashIndices{I}) where {I}
    indices.hashes = Vector{UInt}()
    indices.values = Vector{I}()
    indices.slots = fill(0, 8)
    indices.deleted = 0

    return indices
end

# Accelerated filtering

function Base.filter!(pred, indices::HashIndices)
    _filter!(i -> pred(@inbounds indices.values[i]), keys(indices.values), indices.values, indices.hashes, ())
    indices.deleted = 0
    newsize = Base._tablesz(3*length(indices.values) >> 0x01)
    rehash!(indices, newsize)
    return indices
end

# In-place compaction of live entries passing `pred`; also compacts any parallel
# `values` vectors. (`Ref`s avoid the closure-capture boxing issue.)
@inline function _filter!(pred, range, indices, hashes, values = ())
    n = length(indices)
    i = Ref(0)
    j = Ref(0)
    @inbounds while i[] < n
        i[] += 1
        if hashes[i[]] & deletion_mask === zero(UInt) && pred(i[])
            j[] += 1
            indices[j[]] = indices[i[]]
            hashes[j[]] = hashes[i[]]
            map(vec -> @inbounds(vec[j[]] = vec[i[]]), values)
        end
    end
    newsize = j[]
    resize!(indices, newsize)
    resize!(hashes, newsize)
    map(vec -> resize!(vec, newsize), values)
end

# Factories

# `HashIndices` is the default insertable indices.
Base.empty(::AbstractIndices, ::Type{I}) where {I} = HashIndices{I}()

#= Truncated tail of the flattened patch, preserved verbatim (the definition runs
   past the visible source, so it is not reconstructed here):

diff --git a/src/OldHashDictionary.jl b/src/OldHashDictionary.jl new file mode 100644
index 0000000..0f293bb --- /dev/null +++ b/src/OldHashDictionary.jl @@ -0,0 +1,173 @@
+mutable struct OldHashDictionary{I,T} <: AbstractDictionary{I, T} +
indices::OldHashIndices{I} + values::Vector{T} + + OldHashDictionary{I, T}(indices::
OldHashIndices{I}, values::Vector{T}, ::Nothing) where {I, T}
=#
= new(indices, values) +end + +""" + OldHashDictionary{I, T}() + +Construct an empty `OldHashDictionary` with index type `I` and element type `T`. This type of +dictionary uses hashes for fast lookup and insertion, and is both mutable and insertable. +(See `issettable` and `isinsertable`). +""" +function OldHashDictionary{I, T}(; sizehint::Int = 16) where {I, T} + indices = OldHashIndices{I}(; sizehint=sizehint) + OldHashDictionary{I, T}(indices, Vector{T}(undef, length(indices.slots)), nothing) +end +OldHashDictionary{I}() where {I} = OldHashDictionary{I, Any}() +OldHashDictionary() = OldHashDictionary{Any}() + +""" + OldHashDictionary{I, T}(indices, undef::UndefInitializer) + +Construct a `OldHashDictionary` with index type `I` and element type `T`. The container is +initialized with `keys` that match the values of `indices`, but the values are unintialized. +""" +function OldHashDictionary{I, T}(indices, ::UndefInitializer) where {I, T} + return OldHashDictionary{I, T}(OldHashIndices{I}(indices), undef) +end + +function OldHashDictionary{I, T}(h::OldHashIndices{I}, ::UndefInitializer) where {I, T} + return OldHashDictionary{I, T}(h, Vector{T}(undef, length(h.slots)), nothing) +end + +function OldHashDictionary{I, T}(indices::OldHashIndices{I}, values) where {I, T} + vals = Vector{T}(undef, length(indices.slots)) + d = OldHashDictionary{I, T}(indices, vals, nothing) + + @inbounds for (i, v) in zip(tokens(indices), values) + vals[i] = v + end + + return d +end + +""" + OldHashDictionary(indices, values) + OldHashDictionary{I}(indices, values) + OldHashDictionary{I, T}(indices, values) + +Construct a `OldHashDictionary` with indices from `indices` and values from `values`, matched +in iteration order. 
+""" +function OldHashDictionary{I, T}(indices, values) where {I, T} + iter_size = Base.IteratorSize(indices) + if iter_size isa Union{Base.HasLength, Base.HasShape} + d = OldHashDictionary{I, T}(; sizehint = length(indices)*2) + else + d = OldHashDictionary{I, T}() + end + + for (i, v) in zip(indices, values) + insert!(d, i, v) + end + + return d +end +function OldHashDictionary{I}(indices, values) where {I} + if Base.IteratorEltype(values) === Base.EltypeUnknown() + # TODO: implement automatic widening from iterators of Base.EltypeUnkown + values = collect(values) + end + + return OldHashDictionary{I, eltype(values)}(indices, values) +end + +function OldHashDictionary(indices, values) + if Base.IteratorEltype(indices) === Base.EltypeUnknown() + # TODO: implement automatic widening from iterators of Base.EltypeUnkown + indices = collect(indices) + end + + return OldHashDictionary{eltype(indices)}(indices, values) +end + +""" + OldHashDictionary(dict::AbstractDictionary) + OldHashDictionary{I}(dict::AbstractDictionary) + OldHashDictionary{I, T}(dict::AbstractDictionary) + +Construct a copy of `dict` with the same keys and values. +(For copying an `AbstractDict` or other iterable of `Pair`s, see `dictionary`). 
+""" +OldHashDictionary(dict::AbstractDictionary) = OldHashDictionary(keys(dict), dict) +OldHashDictionary{I}(dict::AbstractDictionary) where {I} = OldHashDictionary{I}(keys(dict), dict) +OldHashDictionary{I, T}(dict::AbstractDictionary) where {I, T} = OldHashDictionary{I, T}(keys(dict), dict) + +## Implementation + +Base.keys(d::OldHashDictionary) = d.indices +isinsertable(d::OldHashDictionary) = true +issettable(d::OldHashDictionary) = true + +@propagate_inbounds function gettoken(d::OldHashDictionary{I}, i::I) where {I} + return gettoken(keys(d), i) +end + +@inline function gettokenvalue(d::OldHashDictionary, token) + return @inbounds d.values[token] +end + +function istokenassigned(d::OldHashDictionary, token) + return isassigned(d.values, token) +end + +@inline function settokenvalue!(d::OldHashDictionary{I, T}, token, value::T) where {I, T} + @inbounds d.values[token] = value + return d +end + +function gettoken!(d::OldHashDictionary{T}, key::T) where {T} + indices = keys(d) + (token, values) = _gettoken!(indices, d.values, key) + if token < 0 + (token, values) = _insert!(indices, values, key, -token) + d.values = values + return (false, token) + else + d.values = values + return (true, token) + end +end + +function Base.copy(d::OldHashDictionary{I, T}) where {I, T} + return OldHashDictionary{I, T}(d.indices, copy(d.values), nothing) +end + +tokenized(d::OldHashDictionary) = d.values + +function Base.empty!(d::OldHashDictionary) + empty!(d.indices) + empty!(d.values) + resize!(d.values, length(keys(d).slots)) + return d +end + +function deletetoken!(d::OldHashDictionary{I, T}, token) where {I, T} + deletetoken!(keys(d), token) + isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), d.values, token-1) + return d +end + +function Base.sizehint!(d::OldHashDictionary, sz::Int) + d.values = _sizehint!(d.indices, d.values, sz) + return d +end + +function Base.rehash!(d::OldHashDictionary, newsz::Int = length(d.inds)) + _rehash!(d.indices, d.values, newsz) + 
return d +end + +Base.filter!(pred, d::OldHashDictionary) = Base.unsafe_filter!(pred, d) + +# For `OldHashIndices` we don't copy the indices, we allow the `keys` to remain identical (`===`) +function Base.similar(indices::OldHashIndices{I}, ::Type{T}) where {I, T} + return OldHashDictionary{I, T}(indices, undef) +end + +function Base.empty(indices::OldHashIndices, ::Type{I}, ::Type{T}) where {I, T} + return OldHashDictionary{I, T}() +end \ No newline at end of file diff --git a/src/OldHashIndices.jl b/src/OldHashIndices.jl new file mode 100644 index 0000000..858fbf3 --- /dev/null +++ b/src/OldHashIndices.jl @@ -0,0 +1,332 @@ +# These can be changed, to trade off better performance for space +const global maxallowedprobe = 16 +const global maxprobeshift = 6 + +mutable struct OldHashIndices{T} <: AbstractIndices{T} + slots::Array{UInt8,1} + inds::Array{T,1} + ndel::Int + count::Int + idxfloor::Int # an index <= the indices of all used slots + maxprobe::Int +end + +OldHashIndices() = OldHashIndices{Any}() + +""" + OldHashIndices{I}() + +Construct an empty `OldHashIndices` with indices of type `I`. This container uses hashes for +fast lookup, and is insertable. (See `isinsertable`). +""" +function OldHashIndices{T}(; sizehint::Int = 16) where {T} + sz = Base._tablesz(sizehint) + OldHashIndices{T}(zeros(UInt8, sz), Vector{T}(undef, sz), 0, 0, 1, 0) +end + + +## Constructors + +""" + OldHashIndices(iter) + OldHashIndices{I}(iter) + +Construct a `OldHashIndices` with indices from iterable container `iter`. 
+""" +function OldHashIndices(iter) + if Base.IteratorEltype(iter) === Base.EltypeUnknown() + # TODO: implement automatic widening from iterators of Base.EltypeUnkown + iter = collect(iter) + end + + return OldHashIndices{eltype(iter)}(iter) +end + +function OldHashIndices{T}(iter) where {T} + iter_size = Base.IteratorSize(iter) + if iter_size isa Union{Base.HasLength, Base.HasShape} + h = OldHashIndices{T}(; sizehint = length(iter)*2) + else + h = OldHashIndices{T}() + end + + for i in iter + insert!(h, i) # should this be `set!` or `insert!`? + end + + return h +end + +function Base.copy(h::OldHashIndices{T}) where {T} + return OldHashIndices{T}(copy(h.slots), copy(h.inds), h.ndel, h.count, h.idxfloor, h.maxprobe) +end + +## Length +Base.length(h::OldHashIndices) = h.count + + +## Token interface + +istokenizable(::OldHashIndices) = true +tokentype(::OldHashIndices) = Int + +@propagate_inbounds isslotempty(h::OldHashIndices, i::Int) = h.slots[i] == 0x0 +@propagate_inbounds isslotfilled(h::OldHashIndices, i::Int) = h.slots[i] == 0x1 +@propagate_inbounds isslotdeleted(h::OldHashIndices, i::Int) = h.slots[i] == 0x2 # deletion marker/tombstone + +istokenassigned(h::OldHashIndices, i::Int) = isslotfilled(h, i) + +# iteratetoken + +function skip_deleted(h::OldHashIndices, i) + L = length(h.slots) + @inbounds while i <= L && !isslotfilled(h, i) + i += 1 + end + return i +end + +@propagate_inbounds function iteratetoken(h::OldHashIndices{T}) where {T} + idx = skip_deleted(h, h.idxfloor) + h.idxfloor = idx # An optimization to skip unnecessary elements when iterating multiple times + + if idx > length(h.inds) + return nothing + else + return (idx, idx + 1) + end +end + +@propagate_inbounds function iteratetoken(h::OldHashIndices{T}, idx::Int) where {T} + idx = skip_deleted(h, idx) + + if idx > length(h.inds) + return nothing + else + return (idx, idx + 1) + end +end + +# gettoken +function hashtoken(key, sz::Int) + # Given key what is the hash slot? 
sz is a power of two + (((hash(key)%Int) & (sz-1)) + 1)::Int +end + +function gettoken(h::OldHashIndices{T}, key::T) where {T} + sz = length(h.inds) + iter = 0 + maxprobe = h.maxprobe + token = hashtoken(key, sz) + keys = h.inds + + @inbounds while true + if isslotempty(h, token) + break + end + if !isslotdeleted(h, token) && (key === keys[token] || isequal(key, keys[token])) + return (true, token) + end + + token = (token & (sz-1)) + 1 + iter += 1 + iter > maxprobe && break + end + return (false, 0) +end + +# gettokenvalue +@propagate_inbounds function gettokenvalue(h::OldHashIndices, token::Int) + return h.inds[token] +end + + +# insertable interface +isinsertable(::OldHashIndices) = true + +function Base.empty!(h::OldHashIndices{T}) where {T} + fill!(h.slots, 0x0) # It should be OK to reduce this back to some smaller size. + sz = length(h.slots) + empty!(h.inds) + resize!(h.inds, sz) + h.ndel = 0 + h.count = 0 + h.idxfloor = 1 + return h +end + +function Base.rehash!(h::OldHashIndices, newsz::Int = length(h.inds)) + _rehash!(h, nothing, newsz) + return h +end + +function _rehash!(h::OldHashIndices{T}, oldv::Union{Nothing, Vector}, newsz::Int) where {T} + olds = h.slots + oldk = h.inds + sz = length(olds) + newsz = Base._tablesz(newsz) + h.idxfloor = 1 + if h.count == 0 + resize!(h.slots, newsz) + fill!(h.slots, 0) + resize!(h.inds, newsz) + error() + oldv === nothing || resize!(oldv, newsz) + h.ndel = 0 + return oldv + end + + slots = zeros(UInt8, newsz) + keys = Vector{T}(undef, newsz) + vals = oldv === nothing ? nothing : Vector{eltype(oldv)}(undef, newsz) + count = 0 + maxprobe = h.maxprobe + + for i ∈ 1:sz + @inbounds if olds[i] == 0x1 + k = oldk[i] + v = vals === nothing ? 
nothing : oldv[i] + index0 = index = hashtoken(k, newsz) + while slots[index] != 0 + index = (index & (newsz-1)) + 1 + end + probe = (index - index0) & (newsz-1) + probe > maxprobe && (maxprobe = probe) + slots[index] = 0x1 + keys[index] = k + vals === nothing || (vals[index] = v) + count += 1 + end + end + + h.slots = slots + h.inds = keys + h.count = count + h.ndel = 0 + h.maxprobe = maxprobe + + return vals +end + +Base.sizehint!(h::OldHashIndices, newsz::Int) = _sizehint!(h, nothing, newsz) + +function _sizehint!(h::OldHashIndices{T}, values::Union{Nothing, Vector}, newsz::Int) where {T} + oldsz = length(h.slots) + if newsz <= oldsz + # TODO: shrink + # be careful: rehash!() assumes everything fits. it was only designed + # for growing. + return hash + end + # grow at least 25% + newsz = min(max(newsz, (oldsz*5)>>2), + Base.max_values(T)) + return _rehash!(h, values, newsz) +end + + + +function gettoken!(h::OldHashIndices{T}, key::T) where {T} + (token, _) = _gettoken!(h, nothing, key) # This will make sure a slot is available at `token` (or `-token` if it is new) + + if token < 0 + @inbounds (token, _) = _insert!(h, nothing, key, -token) # This will fill the slot with `key` + return (false, token) + else + return (true, token) + end +end + +# get the index where a key is stored, or -pos if not present +# and the key would be inserted at pos +# This version is for use by insert!, set! and get! +function _gettoken!(h::OldHashIndices{T}, values::Union{Nothing, Vector}, key::T) where {T} + sz = length(h.inds) + iter = 0 + maxprobe = h.maxprobe + token = hashtoken(key, sz) + avail = 0 + keys = h.inds + + # Search of the key is present or if there is a deleted slot `key` could fill. 
+ @inbounds while true + if isslotempty(h, token) + if avail < 0 + return (avail, values) + end + return (-token, values) + end + + if isslotdeleted(h, token) + if avail == 0 + # found an available deleted slot, but we need to keep scanning + # in case `key` already exists in a later collided slot. + avail = -token + end + elseif key === keys[token] || isequal(key, keys[token]) + return (token, values) + end + + token = (token & (sz-1)) + 1 + iter += 1 + iter > maxprobe && break + end + + avail < 0 && return (avail, values) + + # The key definitely isn't present, but a slot may become available if we increase + # `maxprobe` (up to some reasonable global limits). + maxallowed = max(maxallowedprobe, sz>>maxprobeshift) + + @inbounds while iter < maxallowed + if !isslotfilled(h,token) + h.maxprobe = iter + return (-token, values) + end + token = (token & (sz-1)) + 1 + iter += 1 + end + + # If we get here, then all the probable slots are filled, and the only recourse is to + # increase the size of the hash map and try again + values = _rehash!(h, values, h.count > 64000 ? sz*2 : sz*4) + return _gettoken!(h, values, key) +end + +@propagate_inbounds function _insert!(h::OldHashIndices{T}, values::Union{Nothing, Vector}, key::T, token::Int) where {T} + h.slots[token] = 0x1 + h.inds[token] = key + h.count += 1 + if token < h.idxfloor + h.idxfloor = token + end + + # TODO revisit this... + #= + sz = length(h.inds) + # Rehash now if necessary + if h.ndel >= ((3*sz)>>2) || h.count*3 > sz*2 + # > 3/4 deleted or > 2/3 full + values = _rehash!(h, values, h.count > 64000 ? 
h.count*2 : h.count*4) + (_, token) = gettoken(h, key) + end + =# + + return (token, values) +end + + +function deletetoken!(h::OldHashIndices{T}, token::Int) where {T} + h.slots[token] = 0x2 + isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), h.inds, token-1) + + h.ndel += 1 + h.count -= 1 + return h +end + +# Since deleting elements doesn't mess with iteration, we can use `unsafe_filter!`` +Base.filter!(pred, h::OldHashIndices) = Base.unsafe_filter!(pred, h) + +# The default insertable indices +Base.empty(d::OldHashIndices, ::Type{T}) where {T} = OldHashIndices{T}() diff --git a/src/insertion.jl b/src/insertion.jl index e407fd8..ee45cce 100644 --- a/src/insertion.jl +++ b/src/insertion.jl @@ -399,25 +399,27 @@ end ## Filtering -# `filter!` is basically a programmatic version of `intersect!`. -function Base.filter!(pred, indices::AbstractIndices) - for i in indices - if !pred(i) - delete!(indices, i) - end - end - return indices -end - -# Dictionary version is similar -function Base.filter!(pred, dict::AbstractDictionary) - for (i, v) in pairs(dict) - if !pred(v) - delete!(dict, i) - end - end - return dict -end +# These generic implementations are gimped. + +# # `filter!` is basically a programmatic version of `intersect!`. +# function Base.filter!(pred, indices::AbstractIndices) +# for i in copy(indices) +# if !pred(i) +# delete!(indices, i) +# end +# end +# return indices +# end + +# # Dictionary version is similar +# function Base.filter!(pred, dict::AbstractDictionary) +# for (i, v) in pairs(copy(dict)) +# if !pred(v) +# delete!(dict, i) +# end +# end +# return dict +# end # This implementation is faster when deleting indices does not invalidate tokens/iteration, # and is opt-in only. Works for both dictionaries and indices @@ -458,4 +460,6 @@ of type `T` (even when the first argument is are indices). The default container Return an empty, insertable `AbstractDictionary` with indices of type `keytype(dict)` and elements of type `eltype(inds)`. 
""" -Base.empty(d::AbstractDictionary) = empty(d, keytype(d), eltype(d)) +Base.empty(d::AbstractDictionary) = empty(keys(d), keytype(d), eltype(d)) + +Base.empty(d::AbstractDictionary, ::Type{I}) where {I} = empty(keys(d), I) \ No newline at end of file From 20d634476528d3edae9a62662bdb28fc5b7c497a Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Tue, 10 Mar 2020 22:17:07 +1000 Subject: [PATCH 02/20] Added Benchmark to CI via github workflows --- .github/workflows/TagBot.yml | 11 ++ .github/workflows/benchmark.yml | 19 ++++ benchmark/Project.toml | 4 + benchmark/bench_indices.jl | 190 ++++++++++++++++++++++++++++++++ benchmark/benchmarks.jl | 12 ++ 5 files changed, 236 insertions(+) create mode 100644 .github/workflows/TagBot.yml create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmark/Project.toml create mode 100644 benchmark/bench_indices.jl create mode 100644 benchmark/benchmarks.jl diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..d77d3a0 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,11 @@ +name: TagBot +on: + schedule: + - cron: 0 * * * * +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..78e3711 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,19 @@ +name: Run benchmarks + +on: + pull_request: + +jobs: + Benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@latest + with: + version: 1.3 + - name: Install dependencies + run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"' + - name: Run benchmarks + run: julia -e 'using BenchmarkCI; BenchmarkCI.judge()' + - name: Print judgement + run: julia -e 'using BenchmarkCI; BenchmarkCI.displayjudgement()' diff --git a/benchmark/Project.toml 
b/benchmark/Project.toml new file mode 100644 index 0000000..0963be5 --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,4 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +PkgBenchmark = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl new file mode 100644 index 0000000..2005a04 --- /dev/null +++ b/benchmark/bench_indices.jl @@ -0,0 +1,190 @@ +module BenchHashIndices + +using BenchmarkTools +using Dictionaries + +const suite = BenchmarkGroup() + +#sizes = [(8 .^ (0:8))...] +sizes = [10, 10_000] + +function build_set_by_insertion(n) + out = Set{Int}() + for i in 1:n + push!(out, i) + end + return out +end + +function build_hashindices_by_insertion(n) + out = HashIndices{Int}() + for i in 1:n + insert!(out, i) + end + return out +end + +function empty_by_deletion(set::Set, n) + for i in 1:n + delete!(set, i) + end + return set +end + +function empty_by_deletion(indices::HashIndices, n) + for i in 1:n + delete!(indices, i) + end + return indices +end + +function foreachsum(set) + count = Ref(0) + foreach(x -> count[] += 1, set) + return count[] +end + + +for n in sizes + r = 1:n + y = n ÷ 2 + pred1(x) = x != y + pred2(x) = x == y + vec = collect(r) + set = Set(r) + indices = Indices(collect(r)) + hash_indices = HashIndices(r) + + s = suite["constructor ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable Vector($r) + s["Set"] = @benchmarkable Set($r) + s["Indices"] = @benchmarkable Indices($r) + s["HashIndices"] = @benchmarkable HashIndices($r) + + s = suite["build by insertion ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable build_set_by_insertion($n) + s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) + + #s = suite["empty by deletion ($n)"] = BenchmarkGroup() + #s["Set"] = @benchmarkable empty_by_deletion($(Set(r)), $n) + #s["HashIndices"] = @benchmarkable empty_by_deletion($(HashIndices(r)), $n) + + s = 
suite["in ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable in($y, $vec) + s["Set"] = @benchmarkable in($y, $set) + s["Indices"] = @benchmarkable in($y, $indices) + s["HashIndices"] = @benchmarkable in($y, $hash_indices) + + s = suite["count ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable count(iseven, $vec) + s["Set"] = @benchmarkable count(iseven, $set) + s["Indices"] = @benchmarkable count(iseven, $indices) + s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) + + s = suite["sum ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable sum($vec) + s["Set"] = @benchmarkable sum($set) + s["Indices"] = @benchmarkable sum($indices) + s["HashIndices"] = @benchmarkable sum($hash_indices) + + s = suite["foreach ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable foreachsum($vec) + s["Set"] = @benchmarkable foreachsum($set) + s["Indices"] = @benchmarkable foreachsum($indices) + s["HashIndices"] = @benchmarkable foreachsum($hash_indices) + + s = suite["filter-map-reduce via generator ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x))) + s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) + s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x))) + s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) + + s = suite["filter (most) ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable filter($pred1, $vec) + s["Set"] = @benchmarkable filter($pred1, $set) + s["Indices"] = @benchmarkable filter($pred1, $indices) + s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) + + s = suite["filter (half) ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable filter(iseven, $vec) + s["Set"] = @benchmarkable filter(iseven, $set) + s["Indices"] = @benchmarkable filter(iseven, $indices) + s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) + + s = suite["filter (few) ($n)"] = BenchmarkGroup() + s["Vector"] = @benchmarkable filter($pred2, $vec) + 
s["Set"] = @benchmarkable filter($pred2, $set) + s["Indices"] = @benchmarkable filter($pred2, $indices) + s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) + + # s = suite["filter! (most) ($n)"] = BenchmarkGroup() + # s["Vector"] = @benchmarkable filter($pred1, $(collect(r))) + # s["Set"] = @benchmarkable filter($pred1, $(Set(r))) + # #s["Indices"] = @benchmarkable filter($pred1, $(Indices(collect(r)))) + # s["HashIndices"] = @benchmarkable filter($pred1, $(HashIndices(r))) + + # s = suite["filter! (half) ($n)"] = BenchmarkGroup() + # s["Vector"] = @benchmarkable filter(iseven, $(collect(r))) + # s["Set"] = @benchmarkable filter(iseven, $(Set(r))) + # #s["Indices"] = @benchmarkable filter(iseven, $(Indices(collect(r)))) + # s["HashIndices"] = @benchmarkable filter(iseven, $(HashIndices(r))) + + # s = suite["filter! (few) ($n)"] = BenchmarkGroup() + # s["Vector"] = @benchmarkable filter!($pred2, $(collect(r))) + # s["Set"] = @benchmarkable filter!($pred2, $(Set(r))) + # #s["Indices"] = @benchmarkable filter!($pred2, $(Indices(collect(r)))) + # s["HashIndices"] = @benchmarkable filter!($pred2, $(HashIndices(r))) + + even_set = Set(2:2:n) + odd_set = Set(1:2:n) + even_hash_indices = HashIndices(2:2:n) + odd_hash_indices = HashIndices(1:2:n) + + s = suite["union ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable union($even_set, $odd_set) + s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) + + s = suite["intersect (empty) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable intersect($even_set, $odd_set) + s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) + + s = suite["intersect (half) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable intersect($even_set, $set) + s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) + + s = suite["intersect (whole) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable intersect($set, $set) + s["HashIndices"] = @benchmarkable 
intersect($hash_indices, $hash_indices) + + s = suite["setdiff (whole) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable setdiff($even_set, $odd_set) + s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) + + s = suite["setdiff (half) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable setdiff($even_set, $set) + s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) + + s = suite["setdiff (empty) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable setdiff($set, $set) + s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) + + s = suite["symdiff (whole) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable symdiff($even_set, $odd_set) + s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) + + s = suite["symdiff (left half) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable symdiff($set, $odd_set) + s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) + + s = suite["symdiff (right half) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable symdiff($even_set, $set) + s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) + + s = suite["symdiff (empty) ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable symdiff($set, $set) + s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) +end + +end # module + +BenchHashIndices.suite diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl new file mode 100644 index 0000000..e3b408d --- /dev/null +++ b/benchmark/benchmarks.jl @@ -0,0 +1,12 @@ +# Each file of the form "bench_$(name).jl" in this directory is `include`d and +# its last statement is assumed to be a `BenchmarkGroup`. This group is added +# to the top-level group `SUITE` with the `$name` extracted from the file name. 
+ +using BenchmarkTools +const SUITE = BenchmarkGroup() +for file in sort(readdir(@__DIR__)) + if startswith(file, "bench_") && endswith(file, ".jl") + SUITE[chop(file, head = length("bench_"), tail = length(".jl"))] = + include(file) + end +end From 7330f19bc289d0cd2956abbf6f1a4dbf292cae32 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Sun, 31 May 2020 15:38:07 +1000 Subject: [PATCH 03/20] Use github actions for tests etc --- .github/workflows/CompatHelper.yml | 26 ++++++++++++++ .github/workflows/test.yml | 51 ++++++++++++++++++++++++++++ README.md | 2 +- benchmark/bench_indices.jl | 54 ++++++++++++++++-------------- 4 files changed, 106 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/CompatHelper.yml create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..12aae0b --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,26 @@ + +name: CompatHelper + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1] + julia-arch: [x64] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..04ae0cb --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,51 @@ +name: Test +on: + push: + branches: + - master + tags: '*' + pull_request: +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} + runs-on: ${{ matrix.os }} + continue-on-error: ${{ matrix.version == 'nightly' }} + 
strategy: + matrix: + version: + - '1.0' + - '1.1' + - '1.2' + - '1.3' + - '1.4' + - 'nightly' + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x86 + - x64 + exclude: + # Remove some configurations from the build matrix to reduce CI time. + # See https://github.com/marketplace/actions/setup-julia-environment + # MacOS not available on x86 + - {os: 'macOS-latest', arch: 'x86'} + # Don't test on all versions + - {os: 'macOS-latest', version: '1.1'} + - {os: 'macOS-latest', version: '1.2'} + - {os: 'macOS-latest', version: '1.3'} + - {os: 'windows-latest', version: '1.1'} + - {os: 'windows-latest', version: '1.2'} + - {os: 'windows-latest', version: '1.3'} + steps: + - uses: actions/checkout@v1 + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/julia-buildpkg@latest + - uses: julia-actions/julia-runtest@latest + - uses: julia-actions/julia-uploadcodecov@latest + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/README.md b/README.md index 8f29a8a..12c0f36 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ *An alternative interface for dictionaries in Julia, for improved productivity and performance* -[![Build Status](https://travis-ci.org/andyferris/Dictionaries.jl.svg?branch=master)](https://travis-ci.org/andyferris/Dictionaries.jl) +![Test Status](https://github.com/andyferris/Dictionaries.jl/workflows/Test/badge.svg) [![Codecov](https://codecov.io/gh/andyferris/Dictionaries.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/andyferris/Dictionaries.jl) This package is still quite young - new features are being added and some (low-level) interfaces may be tweaked in the future, but things should be stable enough for general usage. Contributions welcome - please submit an issue or PR! 
diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl index 2005a04..7168536 100644 --- a/benchmark/bench_indices.jl +++ b/benchmark/bench_indices.jl @@ -6,7 +6,7 @@ using Dictionaries const suite = BenchmarkGroup() #sizes = [(8 .^ (0:8))...] -sizes = [10, 10_000] +sizes = [10, 10_000, 10_000_000] function build_set_by_insertion(n) out = Set{Int}() @@ -55,81 +55,83 @@ for n in sizes indices = Indices(collect(r)) hash_indices = HashIndices(r) - s = suite["constructor ($n)"] = BenchmarkGroup() + suite_n = suite["$n"] = BenchmarkGroup() + + s = suite_n["constructor ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable Vector($r) s["Set"] = @benchmarkable Set($r) s["Indices"] = @benchmarkable Indices($r) s["HashIndices"] = @benchmarkable HashIndices($r) - s = suite["build by insertion ($n)"] = BenchmarkGroup() + s = suite_n["build by insertion ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable build_set_by_insertion($n) s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) - #s = suite["empty by deletion ($n)"] = BenchmarkGroup() + #s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() #s["Set"] = @benchmarkable empty_by_deletion($(Set(r)), $n) #s["HashIndices"] = @benchmarkable empty_by_deletion($(HashIndices(r)), $n) - s = suite["in ($n)"] = BenchmarkGroup() + s = suite_n["in ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable in($y, $vec) s["Set"] = @benchmarkable in($y, $set) s["Indices"] = @benchmarkable in($y, $indices) s["HashIndices"] = @benchmarkable in($y, $hash_indices) - s = suite["count ($n)"] = BenchmarkGroup() + s = suite_n["count ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable count(iseven, $vec) s["Set"] = @benchmarkable count(iseven, $set) s["Indices"] = @benchmarkable count(iseven, $indices) s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) - s = suite["sum ($n)"] = BenchmarkGroup() + s = suite_n["sum ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable sum($vec) s["Set"] = @benchmarkable 
sum($set) s["Indices"] = @benchmarkable sum($indices) s["HashIndices"] = @benchmarkable sum($hash_indices) - s = suite["foreach ($n)"] = BenchmarkGroup() + s = suite_n["foreach ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable foreachsum($vec) s["Set"] = @benchmarkable foreachsum($set) s["Indices"] = @benchmarkable foreachsum($indices) s["HashIndices"] = @benchmarkable foreachsum($hash_indices) - s = suite["filter-map-reduce via generator ($n)"] = BenchmarkGroup() + s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x))) s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x))) s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) - s = suite["filter (most) ($n)"] = BenchmarkGroup() + s = suite_n["filter (most) ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable filter($pred1, $vec) s["Set"] = @benchmarkable filter($pred1, $set) s["Indices"] = @benchmarkable filter($pred1, $indices) s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) - s = suite["filter (half) ($n)"] = BenchmarkGroup() + s = suite_n["filter (half) ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable filter(iseven, $vec) s["Set"] = @benchmarkable filter(iseven, $set) s["Indices"] = @benchmarkable filter(iseven, $indices) s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) - s = suite["filter (few) ($n)"] = BenchmarkGroup() + s = suite_n["filter (few) ($n)"] = BenchmarkGroup() s["Vector"] = @benchmarkable filter($pred2, $vec) s["Set"] = @benchmarkable filter($pred2, $set) s["Indices"] = @benchmarkable filter($pred2, $indices) s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) - # s = suite["filter! (most) ($n)"] = BenchmarkGroup() + # s = suite_n["filter! 
(most) ($n)"] = BenchmarkGroup() # s["Vector"] = @benchmarkable filter($pred1, $(collect(r))) # s["Set"] = @benchmarkable filter($pred1, $(Set(r))) # #s["Indices"] = @benchmarkable filter($pred1, $(Indices(collect(r)))) # s["HashIndices"] = @benchmarkable filter($pred1, $(HashIndices(r))) - # s = suite["filter! (half) ($n)"] = BenchmarkGroup() + # s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() # s["Vector"] = @benchmarkable filter(iseven, $(collect(r))) # s["Set"] = @benchmarkable filter(iseven, $(Set(r))) # #s["Indices"] = @benchmarkable filter(iseven, $(Indices(collect(r)))) # s["HashIndices"] = @benchmarkable filter(iseven, $(HashIndices(r))) - # s = suite["filter! (few) ($n)"] = BenchmarkGroup() + # s = suite_n["filter! (few) ($n)"] = BenchmarkGroup() # s["Vector"] = @benchmarkable filter!($pred2, $(collect(r))) # s["Set"] = @benchmarkable filter!($pred2, $(Set(r))) # #s["Indices"] = @benchmarkable filter!($pred2, $(Indices(collect(r)))) @@ -140,47 +142,47 @@ for n in sizes even_hash_indices = HashIndices(2:2:n) odd_hash_indices = HashIndices(1:2:n) - s = suite["union ($n)"] = BenchmarkGroup() + s = suite_n["union ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable union($even_set, $odd_set) s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) - s = suite["intersect (empty) ($n)"] = BenchmarkGroup() + s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable intersect($even_set, $odd_set) s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) - s = suite["intersect (half) ($n)"] = BenchmarkGroup() + s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable intersect($even_set, $set) s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) - s = suite["intersect (whole) ($n)"] = BenchmarkGroup() + s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable intersect($set, $set) s["HashIndices"] = 
@benchmarkable intersect($hash_indices, $hash_indices) - s = suite["setdiff (whole) ($n)"] = BenchmarkGroup() + s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable setdiff($even_set, $odd_set) s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) - s = suite["setdiff (half) ($n)"] = BenchmarkGroup() + s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable setdiff($even_set, $set) s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) - s = suite["setdiff (empty) ($n)"] = BenchmarkGroup() + s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable setdiff($set, $set) s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) - s = suite["symdiff (whole) ($n)"] = BenchmarkGroup() + s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable symdiff($even_set, $odd_set) s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) - s = suite["symdiff (left half) ($n)"] = BenchmarkGroup() + s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable symdiff($set, $odd_set) s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) - s = suite["symdiff (right half) ($n)"] = BenchmarkGroup() + s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable symdiff($even_set, $set) s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) - s = suite["symdiff (empty) ($n)"] = BenchmarkGroup() + s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable symdiff($set, $set) s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) end From afa4c382d63229f935767eaa2afc5216a1dffbfb Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Sun, 31 May 2020 21:36:05 +1000 Subject: [PATCH 04/20] Improve CI, add tests, fix a deleted bug Still doesn't pass tests though... 
--- Project.toml | 2 +- src/HashIndices.jl | 27 +++++++++--------- test/HashDictionary.jl | 62 ++++++++++++++++++++++++++++++++++++++++++ test/HashIndices.jl | 48 ++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 14 deletions(-) diff --git a/Project.toml b/Project.toml index a30d161..4c9cdd9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ authors = ["Andy Ferris "] name = "Dictionaries" uuid = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4" -version = "0.2.1" +version = "0.3.0" [deps] Indexing = "313cdc1a-70c2-5d6a-ae34-0150d3930a38" diff --git a/src/HashIndices.jl b/src/HashIndices.jl index 72596fe..4a7dc63 100644 --- a/src/HashIndices.jl +++ b/src/HashIndices.jl @@ -179,10 +179,10 @@ function gettoken(indices::HashIndices{I}, i::I) where {I} trial_index = indices.slots[trial_slot] if trial_index == 0 return (false, (0, 0)) - end - - if full_hash === indices.hashes[trial_index] && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) - return (true, (trial_slot, trial_index)) + elseif trial_index > 0 + if full_hash === indices.hashes[trial_index] && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) + return (true, (trial_slot, trial_index)) + end end trial_slot = trial_slot & bit_mask @@ -213,15 +213,16 @@ function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I} trial_index = indices.slots[trial_slot] if trial_index == 0 break - end - if trial_index < 0 && deleted_slot == 0 - deleted_slot = trial_slot - end - - trial_hash = indices.hashes[trial_index] + elseif trial_index < 0 + if deleted_slot == 0 + deleted_slot = trial_slot + end + else + trial_hash = indices.hashes[trial_index] - if trial_hash === full_hash && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) - return (true, (trial_slot, 
trial_index)) + if trial_hash === full_hash && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) + return (true, (trial_slot, trial_index)) + end end trial_slot = trial_slot & bit_mask @@ -234,7 +235,7 @@ function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I} indices.slots[trial_slot] = new_index else # Use the deleted slot - indices.slots[trial_slot] = new_index + indices.slots[deleted_slot] = new_index indices.deleted -= 1 end push!(indices.hashes, full_hash) diff --git a/test/HashDictionary.jl b/test/HashDictionary.jl index 3b1f664..f1da44d 100644 --- a/test/HashDictionary.jl +++ b/test/HashDictionary.jl @@ -106,4 +106,66 @@ @test isempty(empty!(d)) # TODO token interface + + @testset "Dict tests from Base" begin + h = HashDictionary{Int, Int}() + + for i in 1:10000 + insert!(h, i, i+1) + end + for i in 1:10000 + @test h[i] == i+1 + end + for i in 1:2:10000 + delete!(h, i) + end + for i in 1:10000 + if iseven(i) + @test h[i] == i+1 + else + @test_throws IndexError h[i] + end + end + for i in 1:2:10000 + insert!(h, i, i+1) + end + for i in 1:10000 + @test h[i] == i+1 + end + for i in 1:10000 + delete!(h, i) + end + @test isempty(h) + insert!(h, 77, 100) + @test h[77] == 100 + for i in 1:10000 + set!(h, i, i+1) + end + for i in 1:10000 + @test h[i] == i+1 + end + for i in 1:2:10000 + delete!(h, i) + end + for i in 1:10000 + if iseven(i) + @test h[i] == i+1 + else + @test_throws IndexError h[i] + end + end + for i in 10001:20000 + insert!(h, i, i+1) + end + for i in 1:10000 + if iseven(i) + @test h[i] == i+1 + else + @test_throws IndexError h[i] + end + end + for i in 10000:20000 + @test h[i] == i+1 + end + end end \ No newline at end of file diff --git a/test/HashIndices.jl b/test/HashIndices.jl index b06dfba..9dc15b0 100644 --- a/test/HashIndices.jl +++ b/test/HashIndices.jl @@ -46,5 +46,53 @@ @test all(in(i, h) == iseven(i) for i in 2:1000) @test isempty(empty!(h)) + 
@testset "Adapated from Dict tests from Base" begin + h = HashIndices{Int}() + N = 10000 + + for i in 1:N + insert!(h, i) + end + for i in 1:N + @test i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + @test (i in h) == iseven(i) + end + for i in 1:2:N + insert!(h, i) + end + for i in 1:N + @test i in h + end + for i in 1:N + delete!(h, i) + end + @test isempty(h) + insert!(h, 77) + @test 77 in h + for i in 1:N + set!(h, i) + end + for i in 1:N + @test i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + @test (i in h) == iseven(i) + end + for i in N+1:2N + insert!(h, i) + end + for i in 1:2N + @test (i in h) == i > N || h == iseven(i) + end + end + # TODO: token interface end \ No newline at end of file From 0ae4d60be47494c9eafee09f7911c4f06b9bc414 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Mon, 1 Jun 2020 10:33:00 +1000 Subject: [PATCH 05/20] Fix the implementation :) --- src/HashIndices.jl | 55 +++++++++++++++++++++++---------------------- test/HashIndices.jl | 2 +- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/HashIndices.jl b/src/HashIndices.jl index 4a7dc63..88723ad 100644 --- a/src/HashIndices.jl +++ b/src/HashIndices.jl @@ -9,7 +9,7 @@ mutable struct HashIndices{I} <: AbstractIndices{I} hashes::Vector{UInt} # Deletion marker stored in high bit values::Vector{I} - deleted::Int + holes::Int # Number of "vacant" slots in hashes and values end HashIndices(; sizehint = 8) = HashIndices{Any}(; sizehint = sizehint) @@ -27,7 +27,6 @@ Construct a `HashIndices` with indices from iterable container `iter`. 
""" function HashIndices(iter) if Base.IteratorEltype(iter) === Base.EltypeUnknown() - # TODO: implement automatic widening from iterators of Base.EltypeUnkown iter = collect(iter) end @@ -61,10 +60,10 @@ function HashIndices{I}(values::Vector{I}) where {I} end function Base.copy(indices::HashIndices{I}) where {I} - if indices.deleted == 0 + if indices.holes == 0 return HashIndices{I}(copy(indices.slots), copy(indices.hashes), copy(indices.values), 0) else - out = HashIndices{I}(Vector{Int}(), copy(indices.hashes), copy(indices.values), indices.deleted) + out = HashIndices{I}(Vector{Int}(), copy(indices.hashes), copy(indices.values), indices.holes) newsize = Base._tablesz(3*length(indices) >> 0x01) rehash!(out, newsize) end @@ -76,7 +75,7 @@ function rehash!(indices::HashIndices{I}, newsize::Int, values = (), include_las fill!(slots, 0) bit_mask = newsize - 1 # newsize is a power of two - if indices.deleted == 0 + if indices.holes == 0 for (index, full_hash) in enumerate(indices.hashes) trial_slot = reinterpret(Int, full_hash) & bit_mask @inbounds while true @@ -87,11 +86,11 @@ function rehash!(indices::HashIndices{I}, newsize::Int, values = (), include_las else trial_slot = trial_slot & bit_mask end - # This is potentially an infinte loop and care must be taken by the callee not - # to overfill the container + # This is potentially an infinte loop and care must be taken not to overfill the container end end else + # Compactify indices.values, indices.hashes and the values while we are at it to_index = Ref(1) # Reassigning to to_index/from_index gives the closure capture boxing issue, so mutate a reference instead from_index = Ref(1) n_values = length(indices.values) @@ -106,6 +105,9 @@ function rehash!(indices::HashIndices{I}, newsize::Int, values = (), include_las indices.hashes[to_index[]] = indices.hashes[from_index[]] indices.values[to_index[]] = indices.values[from_index[]] if include_last_values || from_index[] < n_values + # Note - the last slot might end 
up with a random value (or + # GC'd reference). It's the callers responsibility to ensure the + # last slot is written to after this operation. map(values) do (vals) @inbounds vals[to_index[]] = vals[from_index[]] end @@ -121,18 +123,18 @@ function rehash!(indices::HashIndices{I}, newsize::Int, values = (), include_las from_index[] += 1 end - new_size = length(indices.values) - indices.deleted + new_size = n_values - indices.holes resize!(indices.values, new_size) resize!(indices.hashes, new_size) map(values) do (vals) resize!(vals, new_size) end - indices.deleted = 0 + indices.holes = 0 end return indices end -Base.length(indices::HashIndices) = length(indices.values) - indices.deleted +Base.length(indices::HashIndices) = length(indices.values) - indices.holes # Token interface istokenizable(::HashIndices) = true @@ -141,7 +143,7 @@ tokentype(::HashIndices) = Int # Duration iteration the token cannot be used for deletion - we do not worry about the slots @propagate_inbounds function iteratetoken(indices::HashIndices) - if indices.deleted == 0 + if indices.holes == 0 return length(indices) > 0 ? ((0, 1), 1) : nothing end index = 1 @@ -156,7 +158,7 @@ end @propagate_inbounds function iteratetoken(indices::HashIndices, index::Int) index += 1 - if indices.deleted == 0 # apparently this is enough to make it iterate as fast as `Vector` + if indices.holes == 0 # apparently this is enough to make it iterate as fast as `Vector` return index <= length(indices.values) ? 
((0, index), index) : nothing end @inbounds while index <= length(indices.hashes) @@ -203,7 +205,6 @@ function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I} n_slots = length(indices.slots) bit_mask = n_slots - 1 # n_slots is always a power of two n_values = length(indices.values) - new_index = n_values + 1 trial_slot = reinterpret(Int, full_hash) & bit_mask trial_index = 0 @@ -230,27 +231,30 @@ function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I} # to completely fill the container end + new_index = n_values + 1 if deleted_slot == 0 # Use the trail slot indices.slots[trial_slot] = new_index else # Use the deleted slot indices.slots[deleted_slot] = new_index - indices.deleted -= 1 end push!(indices.hashes, full_hash) push!(indices.values, i) map(values) do (vals) resize!(vals, length(vals) + 1) end - + # Expand the hash map when it reaches 2/3rd full if 3 * new_index > 2 * n_slots # Grow faster for small hash maps than for large ones newsize = n_slots > 16000 ? 
2 * n_slots : 4 * n_slots rehash!(indices, newsize, values, false) - # The slot almost certainly has changed + # The index has changed + new_index = length(indices.values) + + # The slot also has changed bit_mask = newsize - 1 trial_slot = reinterpret(Int, full_hash) & bit_mask @inbounds while true @@ -260,9 +264,6 @@ function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I} end trial_slot = trial_slot & bit_mask end - - # The index may have changed - new_index = length(indices.values) end return (false, (trial_slot, new_index)) @@ -275,13 +276,13 @@ end indices.slots[slot] = -index indices.hashes[index] = deletion_mask isbitstype(I) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), indices.values, index-1) - indices.deleted += 1 + indices.holes += 1 # Recreate the hash map when 1/3rd of the values are deletions - n_slots = length(indices.slots) - n_values = length(indices.values) - indices.deleted - if 3 * indices.deleted > n_values + n_values = length(indices.values) - indices.holes + if 3 * indices.holes > n_values # Halve if necessary + n_slots = length(indices.slots) halve = 4 * n_values < n_slots && n_slots > 8 rehash!(indices, halve ? 
n_slots >> 0x01 : n_slots, values) end @@ -293,7 +294,7 @@ function Base.empty!(indices::HashIndices{I}) where {I} indices.hashes = Vector{UInt}() indices.values = Vector{I}() indices.slots = fill(0, 8) - indices.deleted = 0 + indices.holes = 0 return indices end @@ -301,13 +302,13 @@ end # Accelerated filtering function Base.filter!(pred, indices::HashIndices) - _filter!(i -> pred(@inbounds indices.values[i]), keys(indices.values), indices.values, indices.hashes, ()) - indices.deleted = 0 + _filter!(i -> pred(@inbounds indices.values[i]), indices.values, indices.hashes, ()) + indices.holes = 0 newsize = Base._tablesz(3*length(indices.values) >> 0x01) rehash!(indices, newsize) end -@inline function _filter!(pred, range, indices, hashes, values = ()) +@inline function _filter!(pred, indices, hashes, values = ()) n = length(indices) i = Ref(0) j = Ref(0) diff --git a/test/HashIndices.jl b/test/HashIndices.jl index 9dc15b0..fec66fa 100644 --- a/test/HashIndices.jl +++ b/test/HashIndices.jl @@ -90,7 +90,7 @@ insert!(h, i) end for i in 1:2N - @test (i in h) == i > N || h == iseven(i) + @test (i in h) == (i > N || iseven(i)) end end From 320cd057c1da373f92e3c7bf73b3275d5061f7e5 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Mon, 1 Jun 2020 22:19:21 +1000 Subject: [PATCH 06/20] Fix benchmarks, add union etc. --- benchmark/bench_indices.jl | 198 +++++++++++++++++++++++++++---------- src/AbstractIndices.jl | 48 +++++++++ src/Indices.jl | 5 + 3 files changed, 200 insertions(+), 51 deletions(-) diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl index 7168536..2ed7e5e 100644 --- a/benchmark/bench_indices.jl +++ b/benchmark/bench_indices.jl @@ -6,7 +6,16 @@ using Dictionaries const suite = BenchmarkGroup() #sizes = [(8 .^ (0:8))...] 
-sizes = [10, 10_000, 10_000_000] +sizes = [10, 100, 1000, 10_000] #, 10_000, 10_000_000] +cutoff = 101 + +function build_vector_by_insertion(n) + out = Vector{Int}() + for i in 1:n + push!(out, i) + end + return out +end function build_set_by_insertion(n) out = Set{Int}() @@ -16,6 +25,14 @@ function build_set_by_insertion(n) return out end +function build_indices_by_insertion(n) + out = Indices{Int}() + for i in 1:n + insert!(out, i) + end + return out +end + function build_hashindices_by_insertion(n) out = HashIndices{Int}() for i in 1:n @@ -24,6 +41,21 @@ function build_hashindices_by_insertion(n) return out end +function build_old_hashindices_by_insertion(n) + out = Dictionaries.OldHashIndices{Int}() + for i in 1:n + insert!(out, i) + end + return out +end + +function empty_by_deletion(set::Vector, n) + for i in 1:n + pop!(set) + end + return set +end + function empty_by_deletion(set::Set, n) for i in 1:n delete!(set, i) @@ -31,7 +63,7 @@ function empty_by_deletion(set::Set, n) return set end -function empty_by_deletion(indices::HashIndices, n) +function empty_by_deletion(indices::AbstractIndices, n) for i in 1:n delete!(indices, i) end @@ -50,141 +82,205 @@ for n in sizes y = n ÷ 2 pred1(x) = x != y pred2(x) = x == y - vec = collect(r) + + if n < cutoff + vec = collect(r) + even_vec = collect(2:2:n) + odd_vec = collect(1:2:n) + + indices = Indices(collect(r)) + even_indices = Indices(collect(2:2:n)) + odd_indices = Indices(collect(1:2:n)) + end + set = Set(r) - indices = Indices(collect(r)) + even_set = Set(2:2:n) + odd_set = Set(1:2:n) + hash_indices = HashIndices(r) + even_hash_indices = HashIndices(2:2:n) + odd_hash_indices = HashIndices(1:2:n) + + old_hash_indices = Dictionaries.OldHashIndices(r) + even_old_hash_indices = Dictionaries.OldHashIndices(2:2:n) + odd_old_hash_indices = Dictionaries.OldHashIndices(1:2:n) suite_n = suite["$n"] = BenchmarkGroup() s = suite_n["constructor ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable Vector($r) + n < 
cutoff && (s["Vector"] = @benchmarkable Vector($r)) s["Set"] = @benchmarkable Set($r) - s["Indices"] = @benchmarkable Indices($r) + n < cutoff && (s["Indices"] = @benchmarkable Indices($r)) s["HashIndices"] = @benchmarkable HashIndices($r) + s["OldHashIndices"] = @benchmarkable HashIndices($r) s = suite_n["build by insertion ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector (push!)"] = @benchmarkable build_vector_by_insertion($n)) s["Set"] = @benchmarkable build_set_by_insertion($n) + n < cutoff && (s["Set"] = @benchmarkable build_indices_by_insertion($n)) s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) - - #s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() - #s["Set"] = @benchmarkable empty_by_deletion($(Set(r)), $n) - #s["HashIndices"] = @benchmarkable empty_by_deletion($(HashIndices(r)), $n) + s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) + + s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=collect($r))) + s["Set"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Set($r)) + n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Indices(collect($r)))) + s["HashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=HashIndices($r)) + s["OldHashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Dictionaries.OldHashIndices($r)) s = suite_n["in ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable in($y, $vec) + n < cutoff && (s["Vector"] = @benchmarkable in($y, $vec)) s["Set"] = @benchmarkable in($y, $set) - s["Indices"] = @benchmarkable in($y, $indices) + n < cutoff && (s["Indices"] = @benchmarkable in($y, $indices)) s["HashIndices"] = @benchmarkable in($y, $hash_indices) + s["OldHashIndices"] = @benchmarkable in($y, $old_hash_indices) s = suite_n["count ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable count(iseven, $vec) + n < cutoff && (s["Vector"] = 
@benchmarkable count(iseven, $vec)) s["Set"] = @benchmarkable count(iseven, $set) - s["Indices"] = @benchmarkable count(iseven, $indices) + n < cutoff && (s["Indices"] = @benchmarkable count(iseven, $indices)) s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) + s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) s = suite_n["sum ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable sum($vec) + n < cutoff && (s["Vector"] = @benchmarkable sum($vec)) s["Set"] = @benchmarkable sum($set) - s["Indices"] = @benchmarkable sum($indices) + n < cutoff && (s["Indices"] = @benchmarkable sum($indices)) s["HashIndices"] = @benchmarkable sum($hash_indices) + s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) s = suite_n["foreach ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable foreachsum($vec) + n < cutoff && (s["Vector"] = @benchmarkable foreachsum($vec)) s["Set"] = @benchmarkable foreachsum($set) - s["Indices"] = @benchmarkable foreachsum($indices) + n < cutoff && (s["Indices"] = @benchmarkable foreachsum($indices)) s["HashIndices"] = @benchmarkable foreachsum($hash_indices) + s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x))) + n < cutoff && (s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x)))) s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) - s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x))) + n < cutoff && (s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x)))) s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) + s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if isodd(x))) s = suite_n["filter (most) ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable filter($pred1, $vec) + n < cutoff && (s["Vector"] = @benchmarkable filter($pred1, $vec)) s["Set"] = 
@benchmarkable filter($pred1, $set) - s["Indices"] = @benchmarkable filter($pred1, $indices) + n < cutoff && (s["Indices"] = @benchmarkable filter($pred1, $indices)) s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) + s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) s = suite_n["filter (half) ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable filter(iseven, $vec) + n < cutoff && (s["Vector"] = @benchmarkable filter(iseven, $vec)) s["Set"] = @benchmarkable filter(iseven, $set) - s["Indices"] = @benchmarkable filter(iseven, $indices) + n < cutoff && (s["Indices"] = @benchmarkable filter(iseven, $indices)) s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) + s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) s = suite_n["filter (few) ($n)"] = BenchmarkGroup() - s["Vector"] = @benchmarkable filter($pred2, $vec) + n < cutoff && (s["Vector"] = @benchmarkable filter($pred2, $vec)) s["Set"] = @benchmarkable filter($pred2, $set) - s["Indices"] = @benchmarkable filter($pred2, $indices) + n < cutoff && (s["Indices"] = @benchmarkable filter($pred2, $indices)) s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) - - # s = suite_n["filter! (most) ($n)"] = BenchmarkGroup() - # s["Vector"] = @benchmarkable filter($pred1, $(collect(r))) - # s["Set"] = @benchmarkable filter($pred1, $(Set(r))) - # #s["Indices"] = @benchmarkable filter($pred1, $(Indices(collect(r)))) - # s["HashIndices"] = @benchmarkable filter($pred1, $(HashIndices(r))) - - # s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() - # s["Vector"] = @benchmarkable filter(iseven, $(collect(r))) - # s["Set"] = @benchmarkable filter(iseven, $(Set(r))) - # #s["Indices"] = @benchmarkable filter(iseven, $(Indices(collect(r)))) - # s["HashIndices"] = @benchmarkable filter(iseven, $(HashIndices(r))) - - # s = suite_n["filter! 
(few) ($n)"] = BenchmarkGroup() - # s["Vector"] = @benchmarkable filter!($pred2, $(collect(r))) - # s["Set"] = @benchmarkable filter!($pred2, $(Set(r))) - # #s["Indices"] = @benchmarkable filter!($pred2, $(Indices(collect(r)))) - # s["HashIndices"] = @benchmarkable filter!($pred2, $(HashIndices(r))) - - even_set = Set(2:2:n) - odd_set = Set(1:2:n) - even_hash_indices = HashIndices(2:2:n) - odd_hash_indices = HashIndices(1:2:n) + s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) + + s = suite_n["filter! (most) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, s) setup=(s=collect($r))) + s["Set"] = @benchmarkable filter!($pred1, s) setup=(s=Set($r)) + n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, s) setup=(s=Indices(collect($r)))) + s["HashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=HashIndices($r)) + s["OldHashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=Dictionaries.OldHashIndices($r)) + + s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter!($iseven, s) setup=(s=collect($r))) + s["Set"] = @benchmarkable filter!($iseven, s) setup=(s=Set($r)) + n < cutoff && (s["Indices"] = @benchmarkable filter!($iseven, s) setup=(s=Indices(collect($r)))) + s["HashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=HashIndices($r)) + s["OldHashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=Dictionaries.OldHashIndices($r)) + + s = suite_n["filter! 
(few) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, s) setup=(s=collect($r))) + s["Set"] = @benchmarkable filter!($pred2, s) setup=(s=Set($r)) + n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, s) setup=(s=Indices(collect($r)))) + s["HashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=HashIndices($r)) + s["OldHashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=Dictionaries.OldHashIndices($r)) s = suite_n["union ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) s["Set"] = @benchmarkable union($even_set, $odd_set) + n < cutoff && (s["Indices"] = @benchmarkable union($even_indices, $even_indices)) s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $odd_vec)) s["Set"] = @benchmarkable intersect($even_set, $odd_set) + n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $odd_indices)) s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $vec)) s["Set"] = @benchmarkable intersect($even_set, $set) + n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $indices)) s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $old_hash_indices) s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable intersect($vec, $vec)) s["Set"] = @benchmarkable intersect($set, $set) + n < cutoff && (s["Indices"] = 
@benchmarkable intersect($indices, $indices)) s["HashIndices"] = @benchmarkable intersect($hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $odd_vec)) s["Set"] = @benchmarkable setdiff($even_set, $odd_set) + n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $odd_indices)) s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $vec)) s["Set"] = @benchmarkable setdiff($even_set, $set) + n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $indices)) s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $old_hash_indices) s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable setdiff($vec, $vec)) s["Set"] = @benchmarkable setdiff($set, $set) + n < cutoff && (s["Indices"] = @benchmarkable setdiff($indices, $indices)) s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $odd_vec)) s["Set"] = @benchmarkable symdiff($even_set, $odd_set) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $odd_indices)) s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() + n < cutoff && 
(s["Vector"] = @benchmarkable symdiff($vec, $odd_vec)) s["Set"] = @benchmarkable symdiff($set, $odd_set) - s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $odd_indices)) + s["HashIndices"] = @benchmarkable symdiff($hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $vec)) s["Set"] = @benchmarkable symdiff($even_set, $set) - s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $indices)) + s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $vec)) s["Set"] = @benchmarkable symdiff($set, $set) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $indices)) s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) end end # module diff --git a/src/AbstractIndices.jl b/src/AbstractIndices.jl index 84b8a44..6296652 100644 --- a/src/AbstractIndices.jl +++ b/src/AbstractIndices.jl @@ -206,3 +206,51 @@ function Base.hash(inds::AbstractIndices, h::UInt) return hash(hash(UInt === UInt64 ? 
0x8955a87bc313a509 : 0xa9cff5d1, h1), h1) end + +function Base.union(i::AbstractIndices, itr) + if isinsertable(i) + out = copy(i) + union!(out, itr) + else + out = empty(i) + union!(out, i) + union!(out, itr) + end + return out +end + +function Base.intersect(i::AbstractIndices, itr) + if isinsertable(i) + out = copy(i) + intersect!(out, itr) + else + out = empty(i) + intersect!(out, i) + intersect!(out, itr) + end + return out +end + +function Base.setdiff(i::AbstractIndices, itr) + if isinsertable(i) + out = copy(i) + setdiff!(out, itr) + else + out = empty(i) + setdiff!(out, i) + setdiff!(out, itr) + end + return out +end + +function Base.symdiff(i::AbstractIndices, itr) + if isinsertable(i) + out = copy(i) + symdiff!(out, itr) + else + out = empty(i) + symdiff!(out, i) + symdiff!(out, itr) + end + return out +end diff --git a/src/Indices.jl b/src/Indices.jl index f6650ce..0a7855a 100644 --- a/src/Indices.jl +++ b/src/Indices.jl @@ -69,3 +69,8 @@ end Base.empty(inds::VectorIndices, ::Type{I}) where {I} = Indices{I, Vector{I}}(Vector{I}()) Base.copy(inds::VectorIndices) = Indices(copy(inds.inds)) + +function Base.filter!(pred, inds::VectorIndices) + filter!(pred, inds.inds) + return inds +end \ No newline at end of file From 2fe4c7a14aa94616fc9a87181d87f21f4aac032d Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Tue, 2 Jun 2020 10:50:58 +1000 Subject: [PATCH 07/20] Add some `@inbounds` --- .github/workflows/benchmark.yml | 2 +- benchmark/bench_indices.jl | 44 ++++++++++++++++----------------- src/HashIndices.jl | 6 ++--- src/insertion.jl | 18 +++++++------- 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 78e3711..081759a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -10,7 +10,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: 1.3 + version: 1.4 - name: Install dependencies run: julia -e 
'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"' - name: Run benchmarks diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl index 2ed7e5e..2533ed0 100644 --- a/benchmark/bench_indices.jl +++ b/benchmark/bench_indices.jl @@ -6,7 +6,7 @@ using Dictionaries const suite = BenchmarkGroup() #sizes = [(8 .^ (0:8))...] -sizes = [10, 100, 1000, 10_000] #, 10_000, 10_000_000] +sizes = [10 ]#, 100, 1000, 10_000] #, 10_000, 10_000_000] cutoff = 101 function build_vector_by_insertion(n) @@ -122,12 +122,12 @@ for n in sizes s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=collect($r))) - s["Set"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Set($r)) - n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Indices(collect($r)))) - s["HashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=HashIndices($r)) - s["OldHashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Dictionaries.OldHashIndices($r)) - + n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=collect($r)) evals=1) + s["Set"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Set($r)) evals=1 + n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Indices(collect($r))) evals=1) + s["HashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=HashIndices($r)) evals=1 + s["OldHashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 + s = suite_n["in ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable in($y, $vec)) s["Set"] = @benchmarkable in($y, $set) @@ -185,25 +185,25 @@ for n in sizes s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) s = suite_n["filter! 
(most) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, s) setup=(s=collect($r))) - s["Set"] = @benchmarkable filter!($pred1, s) setup=(s=Set($r)) - n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, s) setup=(s=Indices(collect($r)))) - s["HashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=HashIndices($r)) - s["OldHashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=Dictionaries.OldHashIndices($r)) + n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, s) setup=(s=collect($r)) evals=1) + s["Set"] = @benchmarkable filter!($pred1, s) setup=(s=Set($r)) evals=1 + n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, s) setup=(s=Indices(collect($r))) evals=1) + s["HashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=HashIndices($r)) evals=1 + s["OldHashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter!($iseven, s) setup=(s=collect($r))) - s["Set"] = @benchmarkable filter!($iseven, s) setup=(s=Set($r)) - n < cutoff && (s["Indices"] = @benchmarkable filter!($iseven, s) setup=(s=Indices(collect($r)))) - s["HashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=HashIndices($r)) - s["OldHashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=Dictionaries.OldHashIndices($r)) + n < cutoff && (s["Vector"] = @benchmarkable filter!($iseven, s) setup=(s=collect($r)) evals=1) + s["Set"] = @benchmarkable filter!($iseven, s) setup=(s=Set($r)) evals=1 + n < cutoff && (s["Indices"] = @benchmarkable filter!($iseven, s) setup=(s=Indices(collect($r))) evals=1) + s["HashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=HashIndices($r)) evals=1 + s["OldHashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 s = suite_n["filter! 
(few) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, s) setup=(s=collect($r))) - s["Set"] = @benchmarkable filter!($pred2, s) setup=(s=Set($r)) - n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, s) setup=(s=Indices(collect($r)))) - s["HashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=HashIndices($r)) - s["OldHashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=Dictionaries.OldHashIndices($r)) + n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, s) setup=(s=collect($r)) evals=1) + s["Set"] = @benchmarkable filter!($pred2, s) setup=(s=Set($r)) evals=1 + n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, s) setup=(s=Indices(collect($r))) evals=1) + s["HashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=HashIndices($r)) evals=1 + s["OldHashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 s = suite_n["union ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) diff --git a/src/HashIndices.jl b/src/HashIndices.jl index 88723ad..70a99b8 100644 --- a/src/HashIndices.jl +++ b/src/HashIndices.jl @@ -179,12 +179,12 @@ function gettoken(indices::HashIndices{I}, i::I) where {I} @inbounds while true trial_slot = (trial_slot + 1) trial_index = indices.slots[trial_slot] - if trial_index == 0 - return (false, (0, 0)) - elseif trial_index > 0 + if trial_index > 0 if full_hash === indices.hashes[trial_index] && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) return (true, (trial_slot, trial_index)) end + elseif trial_index === 0 + return (false, (0, 0)) end trial_slot = trial_slot & bit_mask diff --git a/src/insertion.jl b/src/insertion.jl index ee45cce..d9e7f54 100644 --- a/src/insertion.jl +++ b/src/insertion.jl @@ -232,9 +232,9 @@ end function Base.get!(d::AbstractDictionary{I, T}, i::I, default::T) where {I, T} 
(hadindex, token) = gettoken!(d, i) if hadindex - return gettokenvalue(d, token) + return @inbounds gettokenvalue(d, token) else - settokenvalue!(d, token, default) + @inbounds settokenvalue!(d, token, default) return default end end @@ -257,10 +257,10 @@ end function Base.get!(f::Callable, d::AbstractDictionary{I}, i::I) where {I} (hadindex, token) = gettoken!(d, i) if hadindex - return gettokenvalue(d, token) + return @inbounds gettokenvalue(d, token) else default = f() - settokenvalue!(d, token, default) + @inbounds settokenvalue!(d, token, default) return default end end @@ -290,7 +290,7 @@ function Base.delete!(d::AbstractDictionary{I}, i::I) where {I} if !hasindex throw(IndexError("Index doesn't exist: $i")) end - deletetoken!(d, token) + @inbounds deletetoken!(d, token) return d end @@ -317,7 +317,7 @@ end function unset!(d::AbstractDictionary{I}, i::I) where {I} (hasindex, token) = gettoken(d, i) if hasindex - deletetoken!(d, token) + @inbounds deletetoken!(d, token) end return d end @@ -330,9 +330,9 @@ function Base.merge!(combiner::Callable, d::AbstractDictionary, d2::AbstractDict for (i, v) in pairs(d2) (hasindex, token) = gettoken!(d, i) if hasindex - settokenvalue!(d, token, combiner(gettokenvalue(d, token), v)) + @inbounds settokenvalue!(d, token, combiner(gettokenvalue(d, token), v)) else - settokenvalue!(d, token, v) + @inbounds settokenvalue!(d, token, v) end end return d @@ -391,7 +391,7 @@ function Base.symdiff!(s1::AbstractIndices, s2::AbstractIndices) for i in s2 (hastoken, token) = gettoken!(s1, i) if hastoken - deletetoken!(s1, token) + @inbounds deletetoken!(s1, token) end end return s1 From 33608fdf6cf2c3b6a6a5d343f232530fd2b7c889 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Wed, 3 Jun 2020 10:23:57 +1000 Subject: [PATCH 08/20] Don't compare hashes, benchmarking progress. 
--- benchmark/Project.toml | 1 + benchmark/bench_indices.jl | 665 +++++++++++++++++++++++++++---------- src/HashIndices.jl | 9 +- 3 files changed, 496 insertions(+), 179 deletions(-) diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 0963be5..49bfdf4 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -1,4 +1,5 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PkgBenchmark = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl index 2533ed0..0614228 100644 --- a/benchmark/bench_indices.jl +++ b/benchmark/bench_indices.jl @@ -2,11 +2,12 @@ module BenchHashIndices using BenchmarkTools using Dictionaries +using OrderedCollections const suite = BenchmarkGroup() #sizes = [(8 .^ (0:8))...] -sizes = [10 ]#, 100, 1000, 10_000] #, 10_000, 10_000_000] +sizes = [10, 100, 1000, 10_000] #, 10_000, 10_000_000] cutoff = 101 function build_vector_by_insertion(n) @@ -25,6 +26,14 @@ function build_set_by_insertion(n) return out end +function build_ordered_set_by_insertion(n) + out = OrderedSet{Int}() + for i in 1:n + push!(out, i) + end + return out +end + function build_indices_by_insertion(n) out = Indices{Int}() for i in 1:n @@ -63,6 +72,13 @@ function empty_by_deletion(set::Set, n) return set end +function empty_by_deletion(set::OrderedSet, n) + for i in 1:n + delete!(set, i) + end + return set +end + function empty_by_deletion(indices::AbstractIndices, n) for i in 1:n delete!(indices, i) @@ -76,6 +92,261 @@ function foreachsum(set) return count[] end +function all_in(set, n) + out = true + for i in 1:n + out &= i in set + end + return out +end + +function not_in(set, n) + out = true + for i in n+1:2n + out &= i in set + end + return out +end + +function basic_set_test(N) + h = Set{Int}() + out = true + for i in 1:N + push!(h, i) + end + for i in 1:N + out &= i in h + 
end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in 1:2:N + push!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:N + delete!(h, i) + end + out &= isempty(h) + push!(h, 7) + out &= 7 in h + for i in 1:N + push!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in N+1:2N + push!(h, i) + end + for i in 1:2N + out &= (i in h) == (i > N || iseven(i)) + end + return out +end + +function basic_ordered_set_test(N) + h = OrderedSet{Int}() + out = true + for i in 1:N + push!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in 1:2:N + push!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:N + delete!(h, i) + end + out &= isempty(h) + push!(h, 7) + out &= 7 in h + for i in 1:N + push!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in N+1:2N + push!(h, i) + end + for i in 1:2N + out &= (i in h) == (i > N || iseven(i)) + end + return out +end + +function basic_indices_test(N) + h = Indices{Int}() + out = true + for i in 1:N + insert!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in 1:2:N + insert!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:N + delete!(h, i) + end + out &= isempty(h) + insert!(h, 7) + out &= 7 in h + for i in 1:N + set!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in N+1:2N + insert!(h, i) + end + for i in 1:2N + out &= (i in h) == (i > N || iseven(i)) + end + return out +end + +function basic_hash_indices_test(N) + h = HashIndices{Int}() + out = true + for i in 1:N + 
insert!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in 1:2:N + insert!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:N + delete!(h, i) + end + out &= isempty(h) + insert!(h, 7) + out &= 7 in h + for i in 1:N + set!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in N+1:2N + insert!(h, i) + end + for i in 1:2N + out &= (i in h) == (i > N || iseven(i)) + end + return out +end + +function basic_old_hash_indices_test(N) + h = Dictionaries.OldHashIndices{Int}() + out = true + for i in 1:N + insert!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in 1:2:N + insert!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:N + delete!(h, i) + end + out &= isempty(h) + insert!(h, 7) + out &= 7 in h + for i in 1:N + set!(h, i) + end + for i in 1:N + out &= i in h + end + for i in 1:2:N + delete!(h, i) + end + for i in 1:N + out &= (i in h) == iseven(i) + end + for i in N+1:2N + insert!(h, i) + end + for i in 1:2N + out &= (i in h) == (i > N || iseven(i)) + end + return out +end for n in sizes r = 1:n @@ -97,6 +368,10 @@ for n in sizes even_set = Set(2:2:n) odd_set = Set(1:2:n) + ordered_set = OrderedSet(r) + even_ordered_set = OrderedSet(2:2:n) + odd_ordered_set = OrderedSet(1:2:n) + hash_indices = HashIndices(r) even_hash_indices = HashIndices(2:2:n) odd_hash_indices = HashIndices(1:2:n) @@ -107,180 +382,220 @@ for n in sizes suite_n = suite["$n"] = BenchmarkGroup() - s = suite_n["constructor ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable Vector($r)) - s["Set"] = @benchmarkable Set($r) - n < cutoff && (s["Indices"] = @benchmarkable Indices($r)) - s["HashIndices"] = @benchmarkable HashIndices($r) - s["OldHashIndices"] = 
@benchmarkable HashIndices($r) - - s = suite_n["build by insertion ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector (push!)"] = @benchmarkable build_vector_by_insertion($n)) - s["Set"] = @benchmarkable build_set_by_insertion($n) - n < cutoff && (s["Set"] = @benchmarkable build_indices_by_insertion($n)) - s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) - s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) - - s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=collect($r)) evals=1) - s["Set"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Set($r)) evals=1 - n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Indices(collect($r))) evals=1) - s["HashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=HashIndices($r)) evals=1 - s["OldHashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - s = suite_n["in ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable in($y, $vec)) - s["Set"] = @benchmarkable in($y, $set) - n < cutoff && (s["Indices"] = @benchmarkable in($y, $indices)) - s["HashIndices"] = @benchmarkable in($y, $hash_indices) - s["OldHashIndices"] = @benchmarkable in($y, $old_hash_indices) - - s = suite_n["count ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable count(iseven, $vec)) - s["Set"] = @benchmarkable count(iseven, $set) - n < cutoff && (s["Indices"] = @benchmarkable count(iseven, $indices)) - s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) - s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) - - s = suite_n["sum ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable sum($vec)) - s["Set"] = @benchmarkable sum($set) - n < cutoff && (s["Indices"] = @benchmarkable sum($indices)) - s["HashIndices"] = @benchmarkable sum($hash_indices) - 
s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) - - s = suite_n["foreach ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable foreachsum($vec)) - s["Set"] = @benchmarkable foreachsum($set) - n < cutoff && (s["Indices"] = @benchmarkable foreachsum($indices)) - s["HashIndices"] = @benchmarkable foreachsum($hash_indices) - s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) - - s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x)))) - s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) - n < cutoff && (s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x)))) - s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) - s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if isodd(x))) - - s = suite_n["filter (most) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter($pred1, $vec)) - s["Set"] = @benchmarkable filter($pred1, $set) - n < cutoff && (s["Indices"] = @benchmarkable filter($pred1, $indices)) - s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) - s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) - - s = suite_n["filter (half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter(iseven, $vec)) - s["Set"] = @benchmarkable filter(iseven, $set) - n < cutoff && (s["Indices"] = @benchmarkable filter(iseven, $indices)) - s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) - s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) - - s = suite_n["filter (few) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter($pred2, $vec)) - s["Set"] = @benchmarkable filter($pred2, $set) - n < cutoff && (s["Indices"] = @benchmarkable filter($pred2, $indices)) - s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) - 
s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) - - s = suite_n["filter! (most) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, s) setup=(s=collect($r)) evals=1) - s["Set"] = @benchmarkable filter!($pred1, s) setup=(s=Set($r)) evals=1 - n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, s) setup=(s=Indices(collect($r))) evals=1) - s["HashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=HashIndices($r)) evals=1 - s["OldHashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter!($iseven, s) setup=(s=collect($r)) evals=1) - s["Set"] = @benchmarkable filter!($iseven, s) setup=(s=Set($r)) evals=1 - n < cutoff && (s["Indices"] = @benchmarkable filter!($iseven, s) setup=(s=Indices(collect($r))) evals=1) - s["HashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=HashIndices($r)) evals=1 - s["OldHashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - s = suite_n["filter! 
(few) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, s) setup=(s=collect($r)) evals=1) - s["Set"] = @benchmarkable filter!($pred2, s) setup=(s=Set($r)) evals=1 - n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, s) setup=(s=Indices(collect($r))) evals=1) - s["HashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=HashIndices($r)) evals=1 - s["OldHashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - s = suite_n["union ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) - s["Set"] = @benchmarkable union($even_set, $odd_set) - n < cutoff && (s["Indices"] = @benchmarkable union($even_indices, $even_indices)) - s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) - - s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $odd_vec)) - s["Set"] = @benchmarkable intersect($even_set, $odd_set) - n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $odd_indices)) - s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) - - s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $vec)) - s["Set"] = @benchmarkable intersect($even_set, $set) - n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $indices)) - s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $old_hash_indices) - - s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable intersect($vec, $vec)) - s["Set"] = 
@benchmarkable intersect($set, $set) - n < cutoff && (s["Indices"] = @benchmarkable intersect($indices, $indices)) - s["HashIndices"] = @benchmarkable intersect($hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) - - s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $odd_vec)) - s["Set"] = @benchmarkable setdiff($even_set, $odd_set) - n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $odd_indices)) - s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) - - s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $vec)) - s["Set"] = @benchmarkable setdiff($even_set, $set) - n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $indices)) - s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $old_hash_indices) - - s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable setdiff($vec, $vec)) - s["Set"] = @benchmarkable setdiff($set, $set) - n < cutoff && (s["Indices"] = @benchmarkable setdiff($indices, $indices)) - s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) - - s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $odd_vec)) - s["Set"] = @benchmarkable symdiff($even_set, $odd_set) - n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $odd_indices)) - s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable 
symdiff($even_old_hash_indices, $odd_old_hash_indices) - - s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $odd_vec)) - s["Set"] = @benchmarkable symdiff($set, $odd_set) - n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $odd_indices)) - s["HashIndices"] = @benchmarkable symdiff($hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) - - s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $vec)) - s["Set"] = @benchmarkable symdiff($even_set, $set) - n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $indices)) - s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) - - s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() - n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $vec)) - s["Set"] = @benchmarkable symdiff($set, $set) - n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $indices)) - s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) + # s = suite_n["constructor ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable Vector($r)) + # s["Set"] = @benchmarkable Set($r) + # s["OrderedSet"] = @benchmarkable OrderedSet($r) + # n < cutoff && (s["Indices"] = @benchmarkable Indices($r)) + # s["HashIndices"] = @benchmarkable HashIndices($r) + # s["OldHashIndices"] = @benchmarkable HashIndices($r) + + # s = suite_n["build by insertion ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector (push!)"] = @benchmarkable build_vector_by_insertion($n)) + # s["Set"] = @benchmarkable build_set_by_insertion($n) + # s["OrderedSet"] = @benchmarkable build_ordered_set_by_insertion($n) + # n 
< cutoff && (s["Set"] = @benchmarkable build_indices_by_insertion($n)) + # s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) + # s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) + + # s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=collect($r)) evals=1) + # s["Set"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Set($r)) evals=1 + # s["OrderedSet"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=OrderedSet($r)) evals=1 + # n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Indices(collect($r))) evals=1) + # s["HashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=HashIndices($r)) evals=1 + # s["OldHashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 + + s = suite_n["insertion/deletion tests ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable basic_set_test($n) + s["OrderedSet"] = @benchmarkable basic_ordered_set_test($n) + n < cutoff && (s["Indices"] = @benchmarkable basic_indices_test($n)) + s["HashIndices"] = @benchmarkable basic_hash_indices_test($n) + s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) + + # s = suite_n["in ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable all_in($vec, $n)) + # s["Set"] = @benchmarkable all_in($set, $n) + # s["OrderedSet"] = @benchmarkable all_in($ordered_set, $n) + # n < cutoff && (s["Indices"] = @benchmarkable all_in($indices, $n)) + # s["HashIndices"] = @benchmarkable all_in($hash_indices, $n) + # s["OldHashIndices"] = @benchmarkable all_in($old_hash_indices, $n) + + # s = suite_n["not in ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable not_in($vec, $n)) + # s["Set"] = @benchmarkable not_in($set, $n) + # s["OrderedSet"] = @benchmarkable not_in($ordered_set, $n) + # n < cutoff && (s["Indices"] = @benchmarkable 
not_in($indices, $n)) + # s["HashIndices"] = @benchmarkable not_in($hash_indices, $n) + # s["OldHashIndices"] = @benchmarkable not_in($old_hash_indices, $n) + + # s = suite_n["count ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable count(iseven, $vec)) + # s["Set"] = @benchmarkable count(iseven, $set) + # s["OrderedSet"] = @benchmarkable count(iseven, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable count(iseven, $indices)) + # s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) + # s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) + + # s = suite_n["sum ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable sum($vec)) + # s["Set"] = @benchmarkable sum($set) + # s["OrderedSet"] = @benchmarkable sum($ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable sum($indices)) + # s["HashIndices"] = @benchmarkable sum($hash_indices) + # s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) + + # s = suite_n["foreach ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable foreachsum($vec)) + # s["Set"] = @benchmarkable foreachsum($set) + # s["OrderedSet"] = @benchmarkable foreachsum($ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable foreachsum($indices)) + # s["HashIndices"] = @benchmarkable foreachsum($hash_indices) + # s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) + + # s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x)))) + # s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) + # s["OrderedSet"] = @benchmarkable sum($(2x for x in ordered_set if isodd(x))) + # n < cutoff && (s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x)))) + # s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) + # s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if 
isodd(x))) + + # s = suite_n["filter (most) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable filter($pred1, $vec)) + # s["Set"] = @benchmarkable filter($pred1, $set) + # s["OrderedSet"] = @benchmarkable filter($pred1, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable filter($pred1, $indices)) + # s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) + # s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) + + # s = suite_n["filter (half) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable filter(iseven, $vec)) + # s["Set"] = @benchmarkable filter(iseven, $set) + # s["OrderedSet"] = @benchmarkable filter(iseven, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable filter(iseven, $indices)) + # s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) + # s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) + + # s = suite_n["filter (few) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable filter($pred2, $vec)) + # s["Set"] = @benchmarkable filter($pred2, $set) + # s["OrderedSet"] = @benchmarkable filter($pred2, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable filter($pred2, $indices)) + # s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) + # s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) + + # s = suite_n["filter! 
(most) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, s) setup=(s=collect($r)) evals=1) + # s["Set"] = @benchmarkable filter!($pred1, s) setup=(s=Set($r)) evals=1 + # s["OrderedSet"] = @benchmarkable filter!($pred1, s) setup=(s=OrderedSet($r)) evals=1 + # n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, s) setup=(s=Indices(collect($r))) evals=1) + # s["HashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=HashIndices($r)) evals=1 + # s["OldHashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 + + # s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable filter!($iseven, s) setup=(s=collect($r)) evals=1) + # s["Set"] = @benchmarkable filter!($iseven, s) setup=(s=Set($r)) evals=1 + # s["OrderedSet"] = @benchmarkable filter!($iseven, s) setup=(s=OrderedSet($r)) evals=1 + # n < cutoff && (s["Indices"] = @benchmarkable filter!($iseven, s) setup=(s=Indices(collect($r))) evals=1) + # s["HashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=HashIndices($r)) evals=1 + # s["OldHashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 + + # s = suite_n["filter! 
(few) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, s) setup=(s=collect($r)) evals=1) + # s["Set"] = @benchmarkable filter!($pred2, s) setup=(s=Set($r)) evals=1 + # s["OrderedSet"] = @benchmarkable filter!($pred2, s) setup=(s=OrderedSet($r)) evals=1 + # n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, s) setup=(s=Indices(collect($r))) evals=1) + # s["HashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=HashIndices($r)) evals=1 + # s["OldHashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 + + # s = suite_n["union ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) + # s["Set"] = @benchmarkable union($even_set, $odd_set) + # s["OrderedSet"] = @benchmarkable union($even_ordered_set, $odd_ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable union($even_indices, $even_indices)) + # s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) + # s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) + + # s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $odd_vec)) + # s["Set"] = @benchmarkable intersect($even_set, $odd_set) + # s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $odd_ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $odd_indices)) + # s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) + # s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) + + # s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $vec)) + # s["Set"] = @benchmarkable intersect($even_set, $set) + # s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $ordered_set) + # n < cutoff && (s["Indices"] 
= @benchmarkable intersect($even_indices, $indices)) + # s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) + # s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $old_hash_indices) + + # s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable intersect($vec, $vec)) + # s["Set"] = @benchmarkable intersect($set, $set) + # s["OrderedSet"] = @benchmarkable intersect($ordered_set, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable intersect($indices, $indices)) + # s["HashIndices"] = @benchmarkable intersect($hash_indices, $hash_indices) + # s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) + + # s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $odd_vec)) + # s["Set"] = @benchmarkable setdiff($even_set, $odd_set) + # s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $odd_ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $odd_indices)) + # s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) + # s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) + + # s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $vec)) + # s["Set"] = @benchmarkable setdiff($even_set, $set) + # s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $indices)) + # s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) + # s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $old_hash_indices) + + # s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable setdiff($vec, $vec)) + # s["Set"] = @benchmarkable setdiff($set, $set) + # 
s["OrderedSet"] = @benchmarkable setdiff($ordered_set, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable setdiff($indices, $indices)) + # s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) + # s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) + + # s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $odd_vec)) + # s["Set"] = @benchmarkable symdiff($even_set, $odd_set) + # s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $odd_ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $odd_indices)) + # s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) + # s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $odd_old_hash_indices) + + # s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $odd_vec)) + # s["Set"] = @benchmarkable symdiff($set, $odd_set) + # s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $odd_ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $odd_indices)) + # s["HashIndices"] = @benchmarkable symdiff($hash_indices, $odd_hash_indices) + # s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) + + # s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() + # n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $vec)) + # s["Set"] = @benchmarkable symdiff($even_set, $set) + # s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $indices)) + # s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $hash_indices) + # s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) + + # s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() + # n < cutoff && 
(s["Vector"] = @benchmarkable symdiff($vec, $vec)) + # s["Set"] = @benchmarkable symdiff($set, $set) + # s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $ordered_set) + # n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $indices)) + # s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) + # s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) end end # module diff --git a/src/HashIndices.jl b/src/HashIndices.jl index 70a99b8..80f0f48 100644 --- a/src/HashIndices.jl +++ b/src/HashIndices.jl @@ -51,6 +51,7 @@ function HashIndices{I}(iter) where {I} end function HashIndices{I}(values::Vector{I}) where {I} + # TODO Incrementally build the hashmap removing duplicates hashes = map(v -> hash(v) & hash_mask, values) slots = Vector{Int}() out = HashIndices{I}(slots, hashes, values, 0) @@ -180,7 +181,8 @@ function gettoken(indices::HashIndices{I}, i::I) where {I} trial_slot = (trial_slot + 1) trial_index = indices.slots[trial_slot] if trial_index > 0 - if full_hash === indices.hashes[trial_index] && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) + value = indices.values[trial_index] + if i === value || isequal(i, value) return (true, (trial_slot, trial_index)) end elseif trial_index === 0 @@ -219,9 +221,8 @@ function gettoken!(indices::HashIndices{I}, i::I, values = ()) where {I} deleted_slot = trial_slot end else - trial_hash = indices.hashes[trial_index] - - if trial_hash === full_hash && isequal(i, indices.values[trial_index]) # Note: the first bit also ensures the value wasn't deleted (and potentiall undefined) + value = indices.values[trial_index] + if i === value || isequal(i, value) return (true, (trial_slot, trial_index)) end end From 0e0a306bd094a588c3888167eb334b3a16648d37 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Mon, 8 Jun 2020 21:30:20 +1000 Subject: [PATCH 09/20] Many many changes --- README.md | 168 
+++++++------- benchmark/bench_indices.jl | 455 ++++++++++++++++++++----------------- src/AbstractDictionary.jl | 93 +++++++- src/AbstractIndices.jl | 67 +++++- src/Dictionaries.jl | 7 +- src/Dictionary.jl | 2 - src/HashDictionary.jl | 19 +- src/HashIndices.jl | 153 +++++++++++-- src/Indices.jl | 8 +- src/OldHashDictionary.jl | 2 +- src/OldHashIndices.jl | 2 +- src/foreach.jl | 18 +- src/insertion.jl | 21 +- src/map.jl | 53 ++++- 14 files changed, 732 insertions(+), 336 deletions(-) diff --git a/README.md b/README.md index 12c0f36..7fd1215 100644 --- a/README.md +++ b/README.md @@ -15,14 +15,16 @@ In this package we aim to devise a cohesive interface for abstract dictionaries ## Getting started -Dictionaries share the common supertype `AbstractDictionary`, and the go-to container in this package is `HashDictionary` - which shares the same hash-based implementation as Julia's inbuilt `Dict` type (using `hash` and `isequal` for key lookup and comparison). You can construct one from a list of indices (or keys) and a list of values. +Dictionaries share the common supertype `AbstractDictionary`, and the go-to container in this package is `HashDictionary` - which is a new hash-based implementation that serves as a replacement of Julia's inbuilt `Dict` type (using `hash` and `isequal` for key lookup and comparison). The three main difference to `Dict` are that it preserves the order of elements, it iterates much faster, and it iterates values rather than key-value pairs. + +You can construct one from a list of indices (or keys) and a list of values. 
```julia julia> dict = HashDictionary(["a", "b", "c"], [1, 2, 3]) 3-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 1 + "b" │ 2 + "c" │ 3 julia> dict["a"] 1 @@ -32,13 +34,13 @@ If you prefer, you can use the `dictionary` function to create a dictionary from ```julia julia> dictionary(["a" => 1, "b" => 2, "c" => 3]) 3-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 1 + "b" │ 2 + "c" │ 3 ``` The values of `HashDictionary` are mutable, or "settable", and can be modified via `setindex!`. -However, just like for `Array`s, new indices (keys) are *never* created this way. +However, just like for `Array`s, new indices (keys) are *never* created or rearranged this way. ```julia julia> dict["a"] = 10 @@ -46,15 +48,15 @@ julia> dict["a"] = 10 julia> dict 3-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 10 + "b" │ 2 + "c" │ 3 julia> dict["d"] = 42 ERROR: IndexError("Dictionary does not contain index: d") Stacktrace: - [1] setindex!(::HashDictionary{String,Int64}, ::Int64, ::String) at /home/ferris/.julia/dev/Dictionaries/src/AbstractDictionary.jl:134 - [2] top-level scope at REPL[15]:1 + [1] setindex!(::HashDictionary{String,Int64}, ::Int64, ::String) at /home/ferris/.julia/dev/Dictionaries/src/AbstractDictionary.jl:347 + [2] top-level scope at REPL[7]:1 ``` The indices of `HashDictionary` are said to be "insertable" - indices can be added or removed with the `insert!` and `delete!` functions. @@ -62,16 +64,16 @@ The indices of `HashDictionary` are said to be "insertable" - indices can be add ``` julia> insert!(dict, "d", 42) 4-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 10 + "b" │ 2 + "c" │ 3 "d" │ 42 julia> delete!(dict, "d") 3-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 10 + "b" │ 2 + "c" │ 3 ``` Note that `insert!` and `delete!` are precise in the sense that `insert!` will error if the index already exists, and `delete!` will error if the index does not. 
The `set!` function provides "upsert" functionality ("update or insert") and `unset!` is useful for removing an index that may or may not exist. @@ -83,9 +85,9 @@ Dictionaries can be manipulated and transformed using a similar interface to Jul ```julia julia> dict = HashDictionary(["a", "b", "c"], [1, 2, 3]) 3-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 1 + "b" │ 2 + "c" │ 3 julia> sum(dict) 6 @@ -99,21 +101,21 @@ Mapping and broadcasting also function as-per arrays, preserving the indices and ```julia julia> map(iseven, dict) 3-element HashDictionary{String,Bool} - "c" │ false - "b" │ true "a" │ false + "b" │ true + "c" │ false julia> map(*, dict, dict) 3-element HashDictionary{String,Int64} - "c" │ 9 - "b" │ 4 "a" │ 1 + "b" │ 4 + "c" │ 9 julia> dict .+ 1 3-element HashDictionary{String,Int64} - "c" │ 4 - "b" │ 3 "a" │ 2 + "b" │ 3 + "c" │ 4 ``` There is a `mapview` function, which is the lazy version of the above. @@ -123,8 +125,8 @@ Filtering a dictionary also preserves the keys, dropping the remainder. ```julia julia> filter(isodd, dict) 2-element HashDictionary{String,Bool} - "c" │ 3 "a" │ 1 + "c" │ 3 ``` The `filterview` function is provided to lazily filter a dictionary, which may occassionally @@ -135,15 +137,15 @@ The `pairs` function allows access to both the index (key) and value when iterat ```julia julia> pairs(dict) 3-element Dictionaries.PairDictionary{String,Int64,HashDictionary{String,Int64}} - "c" │ "c" => 3 - "b" │ "b" => 2 "a" │ "a" => 1 + "b" │ "b" => 2 + "c" │ "c" => 3 julia> map(((k,v),) -> k^v, pairs(dict)) 3-element HashDictionary{String,String} - "c" │ "ccc" - "b" │ "bb" "a" │ "a" + "b" │ "bb" + "c" │ "ccc" ``` ### Indices @@ -153,9 +155,9 @@ The indices of a dictionary are unique, and form a set (in the mathematical sens ```julia julia> keys(dict) 3-element HashIndices{String} - "c" - "b" "a" + "b" + "c" ``` Whenever you call `keys(::AbstractDictionary)`, you always receive an `AbstractIndices` in return. 
@@ -164,26 +166,38 @@ Whenever you call `keys(::AbstractDictionary)`, you always receive an `AbstractI ```julia julia> inds = HashIndices(["a", "b", "c"]) 3-element HashIndices{String} - "c" - "b" "a" + "b" + "c" ``` +You can also use the `distinct` function, which is similar to `unique` from `Base`, to construct indices where the input may not be unique. + +```julia +julia> distinct([1,2,3,3]) +3-element HashIndices{Int64} + 1 + 2 + 3 +``` + +The `distinct` function may be considered as useful replacement of `unique` in many cases, as the `unique` function internally constructs a hashmap (`Set`) anyway before returning a `Vector`. However, a `HashIndices` iterates as fast as `Vector` and in many cases it can be useful to be able to `map` it into a dictionary. + `HashIndices` are insertable, so you can use `insert!` and `delete!` (or `set!` and `unset!`) to add and remove elements. ```julia julia> insert!(inds, "d") 4-element HashIndices{String} - "c" - "b" "a" + "b" + "c" "d" julia> delete!(inds, "d") 3-element HashIndices{String} - "c" - "b" "a" + "b" + "c" ``` One crucial property of `AbstractIndices` is that they are a subtype of `AbstractDictionary` (similar to how the `keys` of an `AbstractArray` are always `AbstractArray`s). But how can a set, or indices, be a dictionary? Under `getindex`, they form a map from each element to itself. @@ -193,7 +207,9 @@ julia> inds["b"] "b" ``` -Since all dictionaries have `keys`, even indices must have `keys` - and in this case `keys(inds) === inds`. +Thus, if you iterate an `AbstractIndices` you are guaranteed never to get the same value twice, and the collection is a set. All the usual set operations like `union`, `intersect`, `setdiff` and `symdiff` are defined, as well as a newly exported predicate function `disjoint(set1, set2)` which returns `true` if `set1` and `set2` do not intersect/overlap according to an elementwise `isequal` check, and `false` otherwise. 
+ +Since all dictionaries have `keys`, even indices must have `keys` - and in this case `keys(inds::AbstractIndices) === inds`. ### Working with indices @@ -204,15 +220,15 @@ If you wish to perform an operation on each element of a set, you can simply `ma ```julia julia> map(uppercase, inds) 3-element HashDictionary{String,String} - "c" │ "C" - "b" │ "B" "a" │ "A" + "b" │ "B" + "c" │ "C" julia> inds .* "at" 3-element HashDictionary{String,String} - "c" │ "cat" - "b" │ "bat" "a" │ "aat" + "b" │ "bat" + "c" │ "cat" ``` You can filter indices. @@ -220,8 +236,8 @@ You can filter indices. ```julia julia> filter(in(["a", "b"]), inds) 2-element HashIndices{String} - "b" "a" + "b" ``` To find the subset of dictionary indices/keys that satisfy some constraint on the values, use the `findall` function. @@ -229,14 +245,14 @@ To find the subset of dictionary indices/keys that satisfy some constraint on th ```julia julia> dict 3-element HashDictionary{String,Int64} - "c" │ 3 - "b" │ 2 "a" │ 1 + "b" │ 2 + "c" │ 3 julia> inds2 = findall(isodd, dict) 2-element HashIndices{String} - "c" "a" + "c" ``` And, finally, one useful thing you can do with indices is, well, *indexing*. Non-scalar indexing of dictionaries is a little more complicated than that of arrays, since there is an ambiguity on whether the indexer is a *single* index or a collection of indices (for arrays, the scalar indices are integers (or `CartesianIndex`es) so this ambiguity is less of a problem). The [Indexing.jl](https://github.com/andyferris/Indexing.jl) provides the `getindices` function to return a container with the same indices as the indexer, and this is re-exported here. @@ -244,8 +260,8 @@ And, finally, one useful thing you can do with indices is, well, *indexing*. 
Non ```julia julia> getindices(dict, inds2) 2-element HashDictionary{String,Int64} - "c" │ 3 "a" │ 1 + "c" │ 3 ``` It has [been suggested](https://github.com/JuliaLang/julia/issues/30845) to make the syntax `dict.[inds2]` available in Julia in the future for unambiguous non-scalar indexing. @@ -255,8 +271,8 @@ Lazy non-scalar indexing may be achieved, as usual, with the `view` function. ```julia julia> view(dict, inds2) 2-element DictionaryView{String,Int64,HashIndices{String},HashDictionary{String,Int64}} - "c" │ 3 "a" │ 1 + "c" │ 3 ``` Boolean or "logical" indexing is also ambiguous with scalar and non-scalar indexing. Luckily, the `findall` function is a convenient way to convert a Boolean-valued dictionary into indices, which we can use with `getindices`: @@ -264,14 +280,14 @@ Boolean or "logical" indexing is also ambiguous with scalar and non-scalar index ```julia julia> isodd.(dict) 3-element HashDictionary{String,Bool} - "c" │ true - "b" │ false "a" │ true + "b" │ false + "c" │ true julia> getindices(dict, findall(isodd.(dict))) 2-element HashDictionary{String,Int64} - "c" │ 3 "a" │ 1 + "c" │ 3 ``` (Who knows - maybe we need syntax for this, too?) @@ -280,8 +296,6 @@ julia> getindices(dict, findall(isodd.(dict))) The `Dictionary` container is a simple, iteration-based dictionary that may be faster for smaller collections. It's `keys` are the corresponding `Indices` type. By default these contain `Vector`s which support mutation, insertion and tokenization, but they can contain other iterables such as `Tuple`s (which make for good statically-sized dictionaries, with similarities with `Base.ImmutableDict` or [StaticArrays.jl](https://github.com/JuliaArrays/StaticArrays.jl)). -It is planned to add new dictionary types that support an ordering (such as sorted by the values, or the columns of a `DataFrame`, similar to [OrderedCollections.jl](https://github.com/JuliaCollections/OrderedCollections.jl)). 
- Indices that are based on sort ordering instead of hashing (both in a dense sorted form and as a B-tree or similar) are also planned. ### Factories for dictionary creation @@ -293,9 +307,9 @@ The `similar` function is used to create a dictionary with defined indices, but ```julia julia> similar(dict, Vector{Int}) 3-element HashDictionary{String,Array{Int64,1}} - "c" │ #undef - "b" │ #undef "a" │ #undef + "b" │ #undef + "c" │ #undef ``` The behaviour is the same if `dict` is an `AbstractIndices` - you always get a dictionary with settable/mutable elements. Preserving the indices using `similar` and setting the values provides a huge performance advantage compared to iteratively constructing a new dictionary via insertion (see the bottom of this README). @@ -305,9 +319,9 @@ On the other hand, values can be initialized with the `fill(value, dict)` functi ```julia julia> fill(42, dict) 3-element HashDictionary{String,Int64} - "c" │ 42 - "b" │ 42 "a" │ 42 + "b" │ 42 + "c" │ 42 ``` The `fill` function can optionally define a wider type than the value, helpful for if you want to assign a default value like `missing` but allow this to be updated later. @@ -315,9 +329,9 @@ The `fill` function can optionally define a wider type than the value, helpful f ```julia julia> fill(missing, dict, Union{Missing, Int64}) 3-element HashDictionary{String,Union{Missing, Int64}} - "c" │ missing - "b" │ missing "a" │ missing + "b" │ missing + "c" │ missing ``` Functions `zeros`, `ones`, `falses` and `trues` are defined as a handy alternative to the above in common cases, as are `rand` and `randn`. 
@@ -325,15 +339,15 @@ Functions `zeros`, `ones`, `falses` and `trues` are defined as a handy alternati ```julia julia> zeros(dict) 3-element HashDictionary{String,Float64} - "c" │ 0.0 - "b" │ 0.0 "a" │ 0.0 + "b" │ 0.0 + "c" │ 0.0 julia> zeros(UInt8, dict) 3-element HashDictionary{String,UInt8} - "c" │ 0x00 - "b" │ 0x00 "a" │ 0x00 + "b" │ 0x00 + "c" │ 0x00 ``` Note that the *indices* of the output are not guaranteed to be mutable/insertable - in fact, in the current implementation inserting or deleting indices to the output of the above can corrupt the input container (Julia suffers similar restrictions with `AbstractArray`s with mutable indices, for example changing the size of the indices of a `SubArray` can lead to corruption and segfaults). This also holds true for the output of `map`, `broadcast`, `getindices`, `similar`, `zeros`, `ones`, `falses` and `trues`. If you want a new container with indices you can insert, by sure to `copy` the indices furst, or use `empty` instead. @@ -462,9 +476,7 @@ tokens are equivalent with a constant-time operation. When this is the case, the operation can skip lookup entirely, performing zero calls to `hash` and dealing with hash collisions. -A quick benchmark verifies the result. The `copy` below makes `keys(d1) !== keys(d2)`, -disabling token co-iteration (with results somewhat in line with typical usage of -`Base.Dict`). +A quick benchmark verifies the result. ```julia julia> using Dictionaries, BenchmarkTools julia> d1 = HashDictionary(1:10_000_000, 10_000_000:-1:1); julia> d2 = d1 .+ 1; julia> @btime map(+, d1, d2); - 155.299 ms (22 allocations: 256.00 MiB) + 23.362 ms (18 allocations: 76.29 MiB) +``` +The `copy` below makes `keys(d1) !== keys(d2)`, disabling token co-iteration (requiring +multiple hash-table lookups per element). 
+ +```julia julia> @btime map(+, d1, $(HashDictionary(copy(keys(d2)), d2))); - 343.394 ms (22 allocations: 256.00 MiB) + 1.485 s (18 allocations: 76.29 MiB) ``` For a comparitive baseline benchmark, we can try the same with dense vectors. ```julia -julia> v = collect(10_000_000:-1:1); +julia> v1 = collect(10_000_000:-1:1); + +julia> v2 = v1 .+ 1; -julia> @btime map(+, v, v); - 26.910 ms (5 allocations: 76.29 MiB) +julia> @btime map(+, v1, v2); + 25.449 ms (5 allocations: 76.29 MiB) ``` -Here, the operation uses SIMD, and the vector is densely packed whereas the values of the -dictionaries above are sparsely distributed into slots with a filling ratio of ~3.4. Thus, -the fact that the vector operation is 5.8x faster seems explainable (note that the gap may -narrow with more complex data types and mapping functions). +Here, the vector results are in line with the dictionary co-iteration! Using insertion, instead of preserving the existing indices, is comparitively slow. @@ -507,7 +523,7 @@ julia> function f(d1, d2) f (generic function with 1 method) julia> @btime f(d1, d2); - 2.161 s (10000073 allocations: 846.35 MiB) + 2.793 s (10000090 allocations: 668.42 MiB) ``` Unfortunately, insertion appears to be the idiomatic way of doing things with `Base.Dict`. @@ -526,7 +542,7 @@ julia> function g(d1, d2) g (generic function with 1 method) julia> @btime g(dict1, dict2); - 10.985 s (72 allocations: 541.17 MiB) + 9.362 s (72 allocations: 541.17 MiB) ``` The result is similar with generators, which is possibly the easiest way of dealing with @@ -537,5 +553,5 @@ julia> @btime Dict(i => dict1[i] + dict2[i] for i in keys(dict1)); 13.787 s (89996503 allocations: 2.02 GiB) ``` -This represents a 88x speedup between the first example with `HashDictionary` to this last +This represents a 590x speedup between the first example with `HashDictionary` to this last example with `Base.Dict`. 
diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl index 0614228..3bcaee2 100644 --- a/benchmark/bench_indices.jl +++ b/benchmark/bench_indices.jl @@ -353,6 +353,10 @@ for n in sizes y = n ÷ 2 pred1(x) = x != y pred2(x) = x == y + mostly_unique = rand(1:n, n) + sorted_mostly_unique = sort(mostly_unique) + rarely_unique = rand(1:floor(Int, sqrt(n)), n) + sorted_rarely_unique = sort(rarely_unique) if n < cutoff vec = collect(r) @@ -382,29 +386,37 @@ for n in sizes suite_n = suite["$n"] = BenchmarkGroup() - # s = suite_n["constructor ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable Vector($r)) - # s["Set"] = @benchmarkable Set($r) - # s["OrderedSet"] = @benchmarkable OrderedSet($r) - # n < cutoff && (s["Indices"] = @benchmarkable Indices($r)) - # s["HashIndices"] = @benchmarkable HashIndices($r) - # s["OldHashIndices"] = @benchmarkable HashIndices($r) - - # s = suite_n["build by insertion ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector (push!)"] = @benchmarkable build_vector_by_insertion($n)) - # s["Set"] = @benchmarkable build_set_by_insertion($n) - # s["OrderedSet"] = @benchmarkable build_ordered_set_by_insertion($n) - # n < cutoff && (s["Set"] = @benchmarkable build_indices_by_insertion($n)) - # s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) - # s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) - - # s = suite_n["empty by deletion ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=collect($r)) evals=1) - # s["Set"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Set($r)) evals=1 - # s["OrderedSet"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=OrderedSet($r)) evals=1 - # n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=Indices(collect($r))) evals=1) - # s["HashIndices"] = @benchmarkable empty_by_deletion(s, $n) setup=(s=HashIndices($r)) evals=1 - # s["OldHashIndices"] 
= @benchmarkable empty_by_deletion(s, $n) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 + s = suite_n["constructor ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable Vector($r)) + s["Set"] = @benchmarkable Set($r) + s["OrderedSet"] = @benchmarkable OrderedSet($r) + n < cutoff && (s["Indices"] = @benchmarkable Indices($r)) + s["HashIndices"] = @benchmarkable HashIndices($r) + s["OldHashIndices"] = @benchmarkable Dictionaries.OldHashIndices($r) + + s = suite_n["build by insertion ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector (push!)"] = @benchmarkable build_vector_by_insertion($n)) + s["Set"] = @benchmarkable build_set_by_insertion($n) + s["OrderedSet"] = @benchmarkable build_ordered_set_by_insertion($n) + n < cutoff && (s["Indices"] = @benchmarkable build_indices_by_insertion($n)) + s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) + s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) + + s = suite_n["copy ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable copy($vec)) + s["Set"] = @benchmarkable copy($set) + s["OrderedSet"] = @benchmarkable copy($ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable copy($indices)) + s["HashIndices"] = @benchmarkable copy($hash_indices) + s["OldHashIndices"] = @benchmarkable copy($old_hash_indices) + + s = suite_n["copy and empty by deletion ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(copy($vec), $n)) + s["Set"] = @benchmarkable empty_by_deletion(copy($set), $n) + s["OrderedSet"] = @benchmarkable empty_by_deletion(copy($ordered_set), $n) + n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(copy($indices), $n)) + s["HashIndices"] = @benchmarkable empty_by_deletion(copy($hash_indices), $n) + s["OldHashIndices"] = @benchmarkable empty_by_deletion(copy($old_hash_indices), $n) s = suite_n["insertion/deletion tests ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable basic_set_test($n) @@ 
-413,189 +425,220 @@ for n in sizes s["HashIndices"] = @benchmarkable basic_hash_indices_test($n) s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) - # s = suite_n["in ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable all_in($vec, $n)) - # s["Set"] = @benchmarkable all_in($set, $n) - # s["OrderedSet"] = @benchmarkable all_in($ordered_set, $n) - # n < cutoff && (s["Indices"] = @benchmarkable all_in($indices, $n)) - # s["HashIndices"] = @benchmarkable all_in($hash_indices, $n) - # s["OldHashIndices"] = @benchmarkable all_in($old_hash_indices, $n) - - # s = suite_n["not in ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable not_in($vec, $n)) - # s["Set"] = @benchmarkable not_in($set, $n) - # s["OrderedSet"] = @benchmarkable not_in($ordered_set, $n) - # n < cutoff && (s["Indices"] = @benchmarkable not_in($indices, $n)) - # s["HashIndices"] = @benchmarkable not_in($hash_indices, $n) - # s["OldHashIndices"] = @benchmarkable not_in($old_hash_indices, $n) - - # s = suite_n["count ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable count(iseven, $vec)) - # s["Set"] = @benchmarkable count(iseven, $set) - # s["OrderedSet"] = @benchmarkable count(iseven, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable count(iseven, $indices)) - # s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) - # s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) - - # s = suite_n["sum ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable sum($vec)) - # s["Set"] = @benchmarkable sum($set) - # s["OrderedSet"] = @benchmarkable sum($ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable sum($indices)) - # s["HashIndices"] = @benchmarkable sum($hash_indices) - # s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) - - # s = suite_n["foreach ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable foreachsum($vec)) - # s["Set"] = 
@benchmarkable foreachsum($set) - # s["OrderedSet"] = @benchmarkable foreachsum($ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable foreachsum($indices)) - # s["HashIndices"] = @benchmarkable foreachsum($hash_indices) - # s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) - - # s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x)))) - # s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) - # s["OrderedSet"] = @benchmarkable sum($(2x for x in ordered_set if isodd(x))) - # n < cutoff && (s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x)))) - # s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) - # s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if isodd(x))) - - # s = suite_n["filter (most) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable filter($pred1, $vec)) - # s["Set"] = @benchmarkable filter($pred1, $set) - # s["OrderedSet"] = @benchmarkable filter($pred1, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable filter($pred1, $indices)) - # s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) - # s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) - - # s = suite_n["filter (half) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable filter(iseven, $vec)) - # s["Set"] = @benchmarkable filter(iseven, $set) - # s["OrderedSet"] = @benchmarkable filter(iseven, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable filter(iseven, $indices)) - # s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) - # s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) - - # s = suite_n["filter (few) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable filter($pred2, $vec)) - # s["Set"] = @benchmarkable filter($pred2, $set) - # s["OrderedSet"] 
= @benchmarkable filter($pred2, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable filter($pred2, $indices)) - # s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) - # s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) - - # s = suite_n["filter! (most) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, s) setup=(s=collect($r)) evals=1) - # s["Set"] = @benchmarkable filter!($pred1, s) setup=(s=Set($r)) evals=1 - # s["OrderedSet"] = @benchmarkable filter!($pred1, s) setup=(s=OrderedSet($r)) evals=1 - # n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, s) setup=(s=Indices(collect($r))) evals=1) - # s["HashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=HashIndices($r)) evals=1 - # s["OldHashIndices"] = @benchmarkable filter!($pred1, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - # s = suite_n["filter! (half) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable filter!($iseven, s) setup=(s=collect($r)) evals=1) - # s["Set"] = @benchmarkable filter!($iseven, s) setup=(s=Set($r)) evals=1 - # s["OrderedSet"] = @benchmarkable filter!($iseven, s) setup=(s=OrderedSet($r)) evals=1 - # n < cutoff && (s["Indices"] = @benchmarkable filter!($iseven, s) setup=(s=Indices(collect($r))) evals=1) - # s["HashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=HashIndices($r)) evals=1 - # s["OldHashIndices"] = @benchmarkable filter!($iseven, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - # s = suite_n["filter! 
(few) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, s) setup=(s=collect($r)) evals=1) - # s["Set"] = @benchmarkable filter!($pred2, s) setup=(s=Set($r)) evals=1 - # s["OrderedSet"] = @benchmarkable filter!($pred2, s) setup=(s=OrderedSet($r)) evals=1 - # n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, s) setup=(s=Indices(collect($r))) evals=1) - # s["HashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=HashIndices($r)) evals=1 - # s["OldHashIndices"] = @benchmarkable filter!($pred2, s) setup=(s=Dictionaries.OldHashIndices($r)) evals=1 - - # s = suite_n["union ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) - # s["Set"] = @benchmarkable union($even_set, $odd_set) - # s["OrderedSet"] = @benchmarkable union($even_ordered_set, $odd_ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable union($even_indices, $even_indices)) - # s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) - # s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) - - # s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $odd_vec)) - # s["Set"] = @benchmarkable intersect($even_set, $odd_set) - # s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $odd_ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $odd_indices)) - # s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) - # s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) - - # s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $vec)) - # s["Set"] = @benchmarkable intersect($even_set, $set) - # s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $ordered_set) - # n < cutoff && (s["Indices"] 
= @benchmarkable intersect($even_indices, $indices)) - # s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) - # s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $old_hash_indices) - - # s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable intersect($vec, $vec)) - # s["Set"] = @benchmarkable intersect($set, $set) - # s["OrderedSet"] = @benchmarkable intersect($ordered_set, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable intersect($indices, $indices)) - # s["HashIndices"] = @benchmarkable intersect($hash_indices, $hash_indices) - # s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) - - # s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $odd_vec)) - # s["Set"] = @benchmarkable setdiff($even_set, $odd_set) - # s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $odd_ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $odd_indices)) - # s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) - # s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) - - # s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $vec)) - # s["Set"] = @benchmarkable setdiff($even_set, $set) - # s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $indices)) - # s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) - # s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $old_hash_indices) - - # s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable setdiff($vec, $vec)) - # s["Set"] = @benchmarkable setdiff($set, $set) - # 
s["OrderedSet"] = @benchmarkable setdiff($ordered_set, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable setdiff($indices, $indices)) - # s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) - # s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) - - # s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $odd_vec)) - # s["Set"] = @benchmarkable symdiff($even_set, $odd_set) - # s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $odd_ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $odd_indices)) - # s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) - # s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $odd_old_hash_indices) - - # s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $odd_vec)) - # s["Set"] = @benchmarkable symdiff($set, $odd_set) - # s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $odd_ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $odd_indices)) - # s["HashIndices"] = @benchmarkable symdiff($hash_indices, $odd_hash_indices) - # s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) - - # s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() - # n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $vec)) - # s["Set"] = @benchmarkable symdiff($even_set, $set) - # s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $indices)) - # s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $hash_indices) - # s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) - - # s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() - # n < cutoff && 
(s["Vector"] = @benchmarkable symdiff($vec, $vec)) - # s["Set"] = @benchmarkable symdiff($set, $set) - # s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $ordered_set) - # n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $indices)) - # s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) - # s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) + s = suite_n["insertion/deletion tests ($n)"] = BenchmarkGroup() + s["Set"] = @benchmarkable basic_set_test($n) + s["OrderedSet"] = @benchmarkable basic_ordered_set_test($n) + n < cutoff && (s["Indices"] = @benchmarkable basic_indices_test($n)) + s["HashIndices"] = @benchmarkable basic_hash_indices_test($n) + s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) + + s = suite_n["unique/distinct (high uniqueness, unsorted) ($n)"] = BenchmarkGroup() + s["Vector (unique)"] = @benchmarkable unique($mostly_unique) + s["Set"] = @benchmarkable Set($mostly_unique) + s["OrderedSet"] = @benchmarkable OrderedSet($mostly_unique) + s["HashIndices (distinct)"] = @benchmarkable distinct($mostly_unique) + + s = suite_n["unique/distinct (high uniqueness, sorted) ($n)"] = BenchmarkGroup() + s["Vector (unique)"] = @benchmarkable unique($sorted_mostly_unique) + s["Set"] = @benchmarkable Set($sorted_mostly_unique) + s["OrderedSet"] = @benchmarkable OrderedSet($sorted_mostly_unique) + s["HashIndices"] = @benchmarkable distinct($sorted_mostly_unique) + + s = suite_n["unique/distinct (low uniqueness, unsorted) ($n)"] = BenchmarkGroup() + s["Vector (unique)"] = @benchmarkable unique($rarely_unique) + s["Set"] = @benchmarkable Set($rarely_unique) + s["OrderedSet"] = @benchmarkable OrderedSet($rarely_unique) + s["HashIndices"] = @benchmarkable distinct($rarely_unique) + + s = suite_n["unique/distinct (low uniqueness, sorted) ($n)"] = BenchmarkGroup() + s["Vector (unique)"] = @benchmarkable unique($sorted_rarely_unique) + s["Set"] = @benchmarkable 
Set($sorted_rarely_unique) + s["OrderedSet"] = @benchmarkable OrderedSet($sorted_rarely_unique) + s["HashIndices"] = @benchmarkable distinct($sorted_rarely_unique) + + s = suite_n["in ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable all_in($vec, $n)) + s["Set"] = @benchmarkable all_in($set, $n) + s["OrderedSet"] = @benchmarkable all_in($ordered_set, $n) + n < cutoff && (s["Indices"] = @benchmarkable all_in($indices, $n)) + s["HashIndices"] = @benchmarkable all_in($hash_indices, $n) + s["OldHashIndices"] = @benchmarkable all_in($old_hash_indices, $n) + + s = suite_n["not in ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable not_in($vec, $n)) + s["Set"] = @benchmarkable not_in($set, $n) + s["OrderedSet"] = @benchmarkable not_in($ordered_set, $n) + n < cutoff && (s["Indices"] = @benchmarkable not_in($indices, $n)) + s["HashIndices"] = @benchmarkable not_in($hash_indices, $n) + s["OldHashIndices"] = @benchmarkable not_in($old_hash_indices, $n) + + s = suite_n["count ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable count(iseven, $vec)) + s["Set"] = @benchmarkable count(iseven, $set) + s["OrderedSet"] = @benchmarkable count(iseven, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable count(iseven, $indices)) + s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) + s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) + + s = suite_n["sum ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable sum($vec)) + s["Set"] = @benchmarkable sum($set) + s["OrderedSet"] = @benchmarkable sum($ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable sum($indices)) + s["HashIndices"] = @benchmarkable sum($hash_indices) + s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) + + s = suite_n["foreach ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable foreachsum($vec)) + s["Set"] = @benchmarkable foreachsum($set) + s["OrderedSet"] = 
@benchmarkable foreachsum($ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable foreachsum($indices)) + s["HashIndices"] = @benchmarkable foreachsum($hash_indices) + s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) + + s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x)))) + s["Set"] = @benchmarkable sum($(2x for x in set if isodd(x))) + s["OrderedSet"] = @benchmarkable sum($(2x for x in ordered_set if isodd(x))) + n < cutoff && (s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x)))) + s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) + s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if isodd(x))) + + s = suite_n["filter (most) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter($pred1, $vec)) + s["Set"] = @benchmarkable filter($pred1, $set) + s["OrderedSet"] = @benchmarkable filter($pred1, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable filter($pred1, $indices)) + s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) + s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) + + s = suite_n["filter (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter(iseven, $vec)) + s["Set"] = @benchmarkable filter(iseven, $set) + s["OrderedSet"] = @benchmarkable filter(iseven, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable filter(iseven, $indices)) + s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) + s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) + + s = suite_n["filter (few) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter($pred2, $vec)) + s["Set"] = @benchmarkable filter($pred2, $set) + s["OrderedSet"] = @benchmarkable filter($pred2, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable filter($pred2, 
$indices)) + s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) + s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) + + s = suite_n["copy and filter! (most) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, copy($vec))) + s["Set"] = @benchmarkable filter!($pred1, copy($set)) + s["OrderedSet"] = @benchmarkable filter!($pred1, copy($ordered_set)) + n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, copy($indices))) + s["HashIndices"] = @benchmarkable filter!($pred1, copy($hash_indices)) + s["OldHashIndices"] = @benchmarkable filter!($pred1, copy($old_hash_indices)) + + s = suite_n["copy and filter! (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter!(iseven, copy($vec))) + s["Set"] = @benchmarkable filter!(iseven, copy($set)) + s["OrderedSet"] = @benchmarkable filter!(iseven, copy($ordered_set)) + n < cutoff && (s["Indices"] = @benchmarkable filter!(iseven, copy($indices))) + s["HashIndices"] = @benchmarkable filter!(iseven, copy($hash_indices)) + s["OldHashIndices"] = @benchmarkable filter!(iseven, copy($old_hash_indices)) + + s = suite_n["copy and filter! 
(few) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, copy($vec))) + s["Set"] = @benchmarkable filter!($pred2, copy($set)) + s["OrderedSet"] = @benchmarkable filter!($pred2, copy($ordered_set)) + n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, copy($indices))) + s["HashIndices"] = @benchmarkable filter!($pred2, copy($hash_indices)) + s["OldHashIndices"] = @benchmarkable filter!($pred2, copy($old_hash_indices)) + + s = suite_n["union ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) + s["Set"] = @benchmarkable union($even_set, $odd_set) + s["OrderedSet"] = @benchmarkable union($even_ordered_set, $odd_ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable union($even_indices, $even_indices)) + s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) + + s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $odd_vec)) + s["Set"] = @benchmarkable intersect($even_set, $odd_set) + s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $odd_ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $odd_indices)) + s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) + + s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $vec)) + s["Set"] = @benchmarkable intersect($even_set, $set) + s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $indices)) + s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable 
intersect($even_old_hash_indices, $old_hash_indices) + + s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable intersect($vec, $vec)) + s["Set"] = @benchmarkable intersect($set, $set) + s["OrderedSet"] = @benchmarkable intersect($ordered_set, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable intersect($indices, $indices)) + s["HashIndices"] = @benchmarkable intersect($hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) + + s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $odd_vec)) + s["Set"] = @benchmarkable setdiff($even_set, $odd_set) + s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $odd_ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $odd_indices)) + s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) + + s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $vec)) + s["Set"] = @benchmarkable setdiff($even_set, $set) + s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $indices)) + s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $old_hash_indices) + + s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable setdiff($vec, $vec)) + s["Set"] = @benchmarkable setdiff($set, $set) + s["OrderedSet"] = @benchmarkable setdiff($ordered_set, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable setdiff($indices, $indices)) + s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) + 
s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) + + s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $odd_vec)) + s["Set"] = @benchmarkable symdiff($even_set, $odd_set) + s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $odd_ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $odd_indices)) + s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $odd_old_hash_indices) + + s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $odd_vec)) + s["Set"] = @benchmarkable symdiff($set, $odd_set) + s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $odd_ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $odd_indices)) + s["HashIndices"] = @benchmarkable symdiff($hash_indices, $odd_hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) + + s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $vec)) + s["Set"] = @benchmarkable symdiff($even_set, $set) + s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $indices)) + s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) + + s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() + n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $vec)) + s["Set"] = @benchmarkable symdiff($set, $set) + s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $ordered_set) + n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $indices)) + s["HashIndices"] = @benchmarkable 
symdiff($hash_indices, $hash_indices) + s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) end end # module diff --git a/src/AbstractDictionary.jl b/src/AbstractDictionary.jl index 7937371..15a0bab 100644 --- a/src/AbstractDictionary.jl +++ b/src/AbstractDictionary.jl @@ -235,6 +235,80 @@ function Base.unique(d::AbstractDictionary) return out end +# TODO think of a name for this. This matches Base.unique but ideally `distinct` could be +# be an abstract factory method, like `distinct(BTreeIndices{Int}, itr)`. +#= +""" + distinct(f, itr; to=HashDictionary) + +Collect the first element of iterator `itr` for each unique value produced by `f` applied to +elements of `itr` into a new collection, defaulting to `HashDictionary`. Similar to +`Base.unique`, except returning a dictionary instead of an array. + +# Example + +```julia +julia> distinct(first, ["Alice", "Bob", "Charlie"]) +3-element HashDictionary{Char,String} + 'A' │ "Alice" + 'B' │ "Bob" + 'C' │ "Charlie" + +julia> distinct(first, ["Alice", "Bob", "Charlie", "Chaz"]) +3-element HashDictionary{Char,String} + 'A' │ "Alice" + 'B' │ "Bob" + 'C' │ "Charlie" +``` +""" +distinct(f, itr) = _distinct(f, HashDictionary, itr) +=# + +function _distinct(f, ::Type{T}, itr) where T + out = T() + for x in itr + i = f(x) + (hadtoken, token) = gettoken!(out, x) + if !hadtoken + @inbounds settokenvalue!(out, token, i) + end + end + return out +end + +# An auto-widening AbstractDictionary constructor +function __distinct(f, dict, itr, s) + I = keytype(dict) + T = eltype(dict) + tmp = iterate(itr, s) + while tmp !== nothing + (x, s) = tmp + i = f(x) + if !(i isa I) + new_inds = copy(keys(dict), promote_type(I, typeof(i))) + new_dict = similar(new_inds, promote_type(T, typeof(x))) + (hadtoken, token) = gettoken!(new_dict, i) + if !hadtoken + @inbounds settokenvalue!(new_dict, token, x) + end + return __distinct(f, new_dict, itr, s) + elseif !(x isa T) + new_dict = copy(dict, promote_type(T, 
typeof(x))) + (hadtoken, token) = gettoken!(new_dict, i) + if !hadtoken + @inbounds settokenvalue!(new_dict, token, x) + end + return __distinct(f, new_dict, itr, s) + end + (hadtoken, token) = gettoken!(dict, i) + if !hadtoken + @inbounds settokenvalue!(dict, token, x) + end + tmp = iterate(itr, s) + end + return dict +end + ### Settable interface """ @@ -268,7 +342,7 @@ function Base.isassigned(dict::AbstractDictionary{I}, i::I) where {I} end end -function Base.setindex!(dict::AbstractDictionary{I, T}, value::T, i::I) where {I, T} +@propagate_inbounds function Base.setindex!(dict::AbstractDictionary{I, T}, value::T, i::I) where {I, T} if !(istokenizable(dict)) error("Every settable AbstractDictionary type must define a method for `setindex!`: $(typeof(dict))") end @@ -293,6 +367,10 @@ Construct a new `issettable` dictionary with identical `keys` as `d` and an elem Base.similar(d::AbstractDictionary) = similar(keys(d), eltype(d)) Base.similar(d::AbstractDictionary, ::Type{T}) where {T} = similar(keys(d), T) +function Base.similar(indices::AbstractIndices{I}, ::Type{T}) where {I, T} + return similar(convert(HashIndices{I}, indices), T) +end + # fill! and fill function Base.fill!(d::AbstractDictionary, value) @@ -381,8 +459,17 @@ end # Copying - note that this doesn't necessarily copy the indices! (`copy(keys(dict))` can do that) -function Base.copy(d::AbstractDictionary) - out = similar(d) +""" + copy(dict::AbstractDictionary) + copy(dict::AbstractDictionary, ::Type{T}) + +Create a shallow copy of the values of `dict`. Note that `keys(dict)` is not copied, and +therefore care must be taken that inserting/deleting elements. A new element type `T` can +optionally be specified. 
+""" +Base.copy(dict::AbstractDictionary) = copy(dict, eltype(dict)) +function Base.copy(d::AbstractDictionary, ::Type{T}) where {T} + out = similar(d, T) copyto!(out, d) return out end diff --git a/src/AbstractIndices.jl b/src/AbstractIndices.jl index 6296652..61d16e5 100644 --- a/src/AbstractIndices.jl +++ b/src/AbstractIndices.jl @@ -79,6 +79,49 @@ end Base.unique(i::AbstractIndices) = i +""" + copy(inds::AbstractIndices) + copy(inds::AbstractIndices, I::Type) + +Construct a shallow copy of `inds`, possibly specifying a new element type `I`. The output +container is not guaranteed to be the same type as the input. +""" +Base.copy(inds::AbstractIndices) = copy(inds, eltype(inds)) + +function Base.copy(inds::AbstractIndices, ::Type{I}) where I + out = empty(inds, I) + for i in inds + insert!(out, i) + end + return out +end + +Base.empty(::AbstractIndices, ::Type{I}) where {I} = HashIndices{I}() + +""" + distinct(itr) + +Collect the distinct elements of iterator `itr` into a new collection. Similar to +`Base.unique`, except returning a set (`HashIndices`) instead of an array. 
+ +# Example + +```julia +julia> distinct([1,2,3,3]) +3-element HashIndices{Int64} + 1 + 2 + 3 +``` +""" +distinct(itr) = _distinct(HashIndices, itr) + +function _distinct(::Type{T}, itr) where T + out = T() + union!(out, itr) + return out +end + struct IndexError <: Exception msg::String end @@ -225,7 +268,7 @@ function Base.intersect(i::AbstractIndices, itr) intersect!(out, itr) else out = empty(i) - intersect!(out, i) + union!(out, i) intersect!(out, itr) end return out @@ -237,7 +280,7 @@ function Base.setdiff(i::AbstractIndices, itr) setdiff!(out, itr) else out = empty(i) - setdiff!(out, i) + union!(out, i) setdiff!(out, itr) end return out @@ -249,8 +292,26 @@ function Base.symdiff(i::AbstractIndices, itr) symdiff!(out, itr) else out = empty(i) - symdiff!(out, i) + union!(out, i) symdiff!(out, itr) end return out end + +# issetequal and issubset(equal) should work already + +""" + disjoint(set1, set2) + +Return `true` if `set1` and `set2` are disjoint or `false`. Two sets are disjoint if no +elements of `set1` is in `set2`, and vice-versa. Somewhat equivalent to, but faster than, +`isempty(intersect(set1, set2))`. +""" +function disjoint(set1, set2) + for i in set1 + if i in set2 + return false + end + end + return true +end diff --git a/src/Dictionaries.jl b/src/Dictionaries.jl index d12d217..71dd505 100644 --- a/src/Dictionaries.jl +++ b/src/Dictionaries.jl @@ -1,20 +1,17 @@ module Dictionaries using Random - using Indexing - using Base: @propagate_inbounds, Callable export getindices, setindices! export AbstractDictionary, AbstractIndices, IndexError, Indices, HashIndices, HashDictionary, Dictionary, MappedDictionary, DictionaryView, FilteredDictionary, FilteredIndices, BroadcastedDictionary -export issettable, isinsertable, set!, unset!, dictionary +export dictionary, distinct, disjoint, filterview +export issettable, isinsertable, set!, unset! 
export istokenizable, tokentype, tokens, tokenized, gettoken, gettokenvalue, istokenassigned, settokenvalue!, gettoken!, deletetoken!, sharetokens -export filterview # TODO move to SplitApplyCombine.jl (and re-order project dependencies?) - include("AbstractDictionary.jl") include("AbstractIndices.jl") diff --git a/src/Dictionary.jl b/src/Dictionary.jl index 4e00478..d0ee71a 100644 --- a/src/Dictionary.jl +++ b/src/Dictionary.jl @@ -77,8 +77,6 @@ issettable(::VectorDictionary) = true return d end -Base.copy(d::VectorDictionary) = Dictionary(d.indices, copy(d.values)) - function Base.similar(inds::VectorIndices, ::Type{T}) where {T} return Dictionary(inds.inds, Vector{T}(undef, length(inds))) end diff --git a/src/HashDictionary.jl b/src/HashDictionary.jl index cd5da27..3543301 100644 --- a/src/HashDictionary.jl +++ b/src/HashDictionary.jl @@ -3,6 +3,7 @@ struct HashDictionary{I, T} <: AbstractDictionary{I, T} values::Vector{T} function HashDictionary{I, T}(inds::HashIndices{I}, values::Vector{T}) where {I, T} + # TODO make sure sizes match, deal with the fact that inds.holes might be nonzero return new(inds, values) end end @@ -164,8 +165,20 @@ end # Factories -Base.empty(::AbstractIndices, ::Type{I}, ::Type{T}) where {I, T} = HashDictionary{I, T}() +function Base.similar(indices::HashIndices{I}, ::Type{T}) where {I, T} + return HashDictionary(indices, Vector{T}(undef, length(indices.values))) +end -function Base.similar(indices::AbstractIndices{I}, ::Type{T}) where {I, T} - return HashDictionary(indices, Vector{T}(undef, length(indices))) +function _distinct(f, ::Type{HashDictionary}, itr) + tmp = iterate(itr) + if tmp === nothing + T = Base.@default_eltype(itr) + I = Core.Compiler.return_type(f, Tuple{T}) + return HashDictionary{I, T}() + end + (x, s) = tmp + i = f(x) + dict = HashDictionary{typeof(i), typeof(x)}() + insert!(dict, i, x) + return __distinct(f, dict, itr, s) end diff --git a/src/HashIndices.jl b/src/HashIndices.jl index 80f0f48..2112345 100644 --- 
a/src/HashIndices.jl +++ b/src/HashIndices.jl @@ -24,6 +24,32 @@ end HashIndices{I}(iter) Construct a `HashIndices` with indices from iterable container `iter`. + +Note that the elements of `iter` must be distinct/unique. Instead, the `distinct` function +can be used for finding the unique elements. + +# Examples + +```julia +julia> HashIndices([1,2,3]) +3-element HashIndices{Int64} + 1 + 2 + 3 + +julia> HashIndices([1,2,3,3]) +ERROR: IndexError("Indices are not unique (inputs at positions 3 and 4) - consider using the distinct function") +Stacktrace: + [1] HashIndices{Int64}(::Array{Int64,1}) at /home/ferris/.julia/dev/Dictionaries/src/HashIndices.jl:92 + [2] HashIndices(::Array{Int64,1}) at /home/ferris/.julia/dev/Dictionaries/src/HashIndices.jl:53 + [3] top-level scope at REPL[12]:1 + +julia> distinct([1,2,3,3]) +3-element HashIndices{Int64} + 1 + 2 + 3 +``` """ function HashIndices(iter) if Base.IteratorEltype(iter) === Base.EltypeUnknown() @@ -40,31 +66,96 @@ function HashIndices{I}(iter) where {I} @inbounds for (i, value) in enumerate(iter) values[i] = value end - return HashIndices{I}(values) else - h = HashIndices{I}() - for i in iter - insert!(h, i) + values = Vector{I}() + @inbounds for value in iter + push!(values, value) end - return h end + return HashIndices{I}(values) end function HashIndices{I}(values::Vector{I}) where {I} - # TODO Incrementally build the hashmap removing duplicates + # The input must have unique elements (the constructor is not to be used in place of `distinct`) + hashes = map(v -> hash(v) & hash_mask, values) + + # Incrementally build the hashmap and throw if duplicates detected + newsize = Base._tablesz(3*length(values) >> 0x01) + bit_mask = newsize - 1 # newsize is a power of two + slots = zeros(Int, newsize) + @inbounds for index in keys(hashes) + full_hash = hashes[index] + trial_slot = reinterpret(Int, full_hash) & bit_mask + @inbounds while true + trial_slot = (trial_slot + 1) + if slots[trial_slot] == 0 + slots[trial_slot] 
= index + break + else + # TODO make this check optional + if isequal(values[index], values[slots[trial_slot]]) + throw(IndexError("Indices are not unique (inputs at positions $(slots[trial_slot]) and $index) - consider using the distinct function")) + end + end + trial_slot = trial_slot & bit_mask + # This is potentially an infinte loop and care must be taken not to overfill the container + end + end + return HashIndices{I}(slots, hashes, values, 0) +end + +Base.convert(::Type{HashIndices}, inds::AbstractIndices{I}) where {I} = convert(HashIndices{I}, inds) + +function Base.convert(::Type{HashIndices{I}}, inds::AbstractIndices) where {I} + # Fast path + if inds isa HashIndices && inds.holes == 0 + # Note: `convert` doesn't have copy semantics + return HashIndices{I}(inds.slots, inds.hashes, convert(Vector{I}, inds.values), 0) + end + + # The input is already unique + values = collect(I, inds) hashes = map(v -> hash(v) & hash_mask, values) - slots = Vector{Int}() - out = HashIndices{I}(slots, hashes, values, 0) + + # Incrementally build the hashmap newsize = Base._tablesz(3*length(values) >> 0x01) - rehash!(out, newsize) - return out + bit_mask = newsize - 1 # newsize is a power of two + slots = zeros(Int, newsize) + @inbounds for index in keys(hashes) + full_hash = hashes[index] + trial_slot = reinterpret(Int, full_hash) & bit_mask + @inbounds while true + trial_slot = (trial_slot + 1) + if slots[trial_slot] == 0 + slots[trial_slot] = index + break + else + # TODO make this check optional + if isequal(values[index], values[slots[trial_slot]]) + throw(IndexError("Indices are not unique (inputs at positions $(slots[trial_slot]) and $index)")) + end + end + trial_slot = trial_slot & bit_mask + # This is potentially an infinte loop and care must be taken not to overfill the container + end + end + return HashIndices{I}(slots, hashes, values, 0) end -function Base.copy(indices::HashIndices{I}) where {I} +""" + copy(inds::AbstractIndices) + copy(inds::AbstractIndices, 
::Type{I}) + +Create a shallow copy of the indices, optionally changing the element type. + +(Note that `copy` on a dictionary does not copy its indices). +""" +function Base.copy(indices::HashIndices, ::Type{I}) where {I} + _copy = I === eltype(indices) ? copy : identity # the constructor will call `convert` if indices.holes == 0 - return HashIndices{I}(copy(indices.slots), copy(indices.hashes), copy(indices.values), 0) + return HashIndices{I}(_copy(indices.slots), _copy(indices.hashes), _copy(indices.values), 0) else - out = HashIndices{I}(Vector{Int}(), copy(indices.hashes), copy(indices.values), indices.holes) + out = HashIndices{I}(Vector{Int}(), _copy(indices.hashes), _copy(indices.values), indices.holes) newsize = Base._tablesz(3*length(indices) >> 0x01) rehash!(out, newsize) end @@ -330,4 +421,38 @@ end # Factories -Base.empty(::AbstractIndices, ::Type{I}) where {I} = HashIndices{I}() +# TODO make this generic... maybe a type-based `empty`? +function _distinct(::Type{HashIndices}, itr) + if Base.IteratorEltype(itr) === Base.HasEltype() + return _distinct(HashIndices{eltype(itr)}, itr) + end + + tmp = iterate(itr) + if tmp === nothing + return HashIndices{Base.@default_eltype(itr)}() + end + (x, s) = tmp + indices = HashIndices{typeof(x)}() + insert!(indices, x) + return __distinct!(indices, itr, s, x) +end + +# An auto-widening constructor for insertable AbstractIndices +function __distinct!(indices::AbstractIndices, itr, s, x_old) + T = eltype(indices) + tmp = iterate(itr, s) + while tmp !== nothing + (x, s) = tmp + if !isequal(x, x_old) # Optimized for repeating elements of `itr`, e.g. 
if `itr` is sorted + if !(x isa T) && promote_type(typeof(x), T) != T + new_indices = copy(indices, promote_type(T, typeof(x))) + set!(new_indices, x) + return __distinct!(new_indices, itr, s, x) + end + set!(indices, x) + x_old = x + end + tmp = iterate(itr, s) + end + return indices +end diff --git a/src/Indices.jl b/src/Indices.jl index 0a7855a..6ae91ae 100644 --- a/src/Indices.jl +++ b/src/Indices.jl @@ -68,7 +68,13 @@ end Base.empty(inds::VectorIndices, ::Type{I}) where {I} = Indices{I, Vector{I}}(Vector{I}()) -Base.copy(inds::VectorIndices) = Indices(copy(inds.inds)) +function Base.copy(inds::VectorIndices, ::Type{I}) where {I} + if I === eltype(inds) + Indices{I}(copy(inds.inds)) + else + Indices{I}(inds.inds) # The constructor will call convert + end +end function Base.filter!(pred, inds::VectorIndices) filter!(pred, inds.inds) diff --git a/src/OldHashDictionary.jl b/src/OldHashDictionary.jl index 0f293bb..ae3389f 100644 --- a/src/OldHashDictionary.jl +++ b/src/OldHashDictionary.jl @@ -132,7 +132,7 @@ function gettoken!(d::OldHashDictionary{T}, key::T) where {T} end end -function Base.copy(d::OldHashDictionary{I, T}) where {I, T} +function Base.copy(d::OldHashDictionary{I, T}, ::Type{I}, ::Type{T}) where {I, T} return OldHashDictionary{I, T}(d.indices, copy(d.values), nothing) end diff --git a/src/OldHashIndices.jl b/src/OldHashIndices.jl index 858fbf3..4708104 100644 --- a/src/OldHashIndices.jl +++ b/src/OldHashIndices.jl @@ -57,7 +57,7 @@ function OldHashIndices{T}(iter) where {T} return h end -function Base.copy(h::OldHashIndices{T}) where {T} +function Base.copy(h::OldHashIndices{T}, ::Type{T}) where {T} return OldHashIndices{T}(copy(h.slots), copy(h.inds), h.ndel, h.count, h.idxfloor, h.maxprobe) end diff --git a/src/foreach.jl b/src/foreach.jl index af6296b..eac751c 100644 --- a/src/foreach.jl +++ b/src/foreach.jl @@ -4,9 +4,12 @@ function Base.foreach(f, d::AbstractDictionary, d2::AbstractDictionary, ds::Abst f(gettokenvalue(d, t), gettokenvalue(d2, 
t), map(x -> @inbounds(gettokenvalue(x, t)), ds)...) end else - @inbounds for i in keys(d) - f(d[i], d2[i], map(x -> @inbounds(x[i]), ds)...) - end + @boundscheck if !isequal(keys(d), keys(d2)) || any(dict -> !isequal(keys(d), keys(dict)), ds) + throw(IndexError("Indices do not match")) + end + @inbounds for xs in zip(d, d2, ds...) + f(xs...) + end end return nothing end @@ -18,9 +21,12 @@ function Base.foreach(f, d::AbstractDictionary, d2::AbstractDictionary) f(gettokenvalue(d, t), gettokenvalue(d2, t)) end else - @inbounds for i in keys(d) - f(d[i], d2[i]) - end + @boundscheck if !isequal(keys(d), keys(d2)) + throw(IndexError("Indices do not match")) + end + @inbounds for (x, x2) in zip(d, d2) + f(x, x2) + end end return nothing end diff --git a/src/insertion.jl b/src/insertion.jl index d9e7f54..ffe9724 100644 --- a/src/insertion.jl +++ b/src/insertion.jl @@ -369,9 +369,20 @@ end ### Indices ("sets") versions of above -function Base.union!(s1::AbstractIndices, s2::AbstractIndices) - for i in s2 - set!(s1, i) +function Base.union!(s1::AbstractIndices, itr) + # Optimized to handle repeated values in `itr` - e.g. if `itr` is already sorted + x = iterate(itr) + if x === nothing + return s1 + end + (i, s) = x + set!(s1, i) + i_old = i + while x !== nothing + (i, s) = x + !isequal(i, i_old) && set!(s1, i) + i_old = i + x = iterate(itr, s) end return s1 end @@ -462,4 +473,6 @@ elements of type `eltype(inds)`. 
""" Base.empty(d::AbstractDictionary) = empty(keys(d), keytype(d), eltype(d)) -Base.empty(d::AbstractDictionary, ::Type{I}) where {I} = empty(keys(d), I) \ No newline at end of file +Base.empty(d::AbstractDictionary, ::Type{I}) where {I} = empty(keys(d), I) + +Base.empty(::AbstractIndices, ::Type{I}, ::Type{T}) where {I, T} = HashDictionary{I, T}() diff --git a/src/map.jl b/src/map.jl index e885c6c..7adf686 100644 --- a/src/map.jl +++ b/src/map.jl @@ -1,16 +1,27 @@ # Make `map!` fast if the inputs and output share tokens -# TODO consider if `map` should respect iteration order or indices?? - function Base.map!(f, out::AbstractDictionary, d::AbstractDictionary, d2::AbstractDictionary, ds::AbstractDictionary...) if sharetokens(out, d, d2, ds...) @inbounds for t in tokens(out) settokenvalue!(out, t, f(gettokenvalue(d, t), gettokenvalue(d2, t), map(x -> @inbounds(gettokenvalue(x, t)), ds)...)) end + elseif istokenizable(out) + @boundscheck if !isequal(keys(out), keys(d)) || !isequal(keys(out), keys(d2)) || any(dict -> !isequal(keys(out), keys(dict)), ds) + throw(IndexError("Indices do not match")) + end + @inbounds for txs in zip(tokens(out), d, d2, ds...) + t = txs[1] + xs = Base.tail(txs) + settokenvalue!(out, t, f(xs...)) + end else - @boundscheck nothing # TODO check that indices match - @inbounds for i in keys(out) - out[i] = f(d[i], d2[i], map(x -> @inbounds(x[i]), ds)...) + @boundscheck if !isequal(keys(out), keys(d)) || !isequal(keys(out), keys(d2)) || any(dict -> !isequal(keys(out), keys(dict)), ds) + throw(IndexError("Indices do not match")) + end + @inbounds for ixs in zip(keys(out), d, d2, ds...) + i = ixs[1] + xs = Base.tail(ixs) + out[i] = f(xs...) 
end end return out @@ -22,8 +33,17 @@ function Base.map!(f, out::AbstractDictionary, d::AbstractDictionary, d2::Abstra @inbounds for t in tokens(out) settokenvalue!(out, t, f(gettokenvalue(d, t), gettokenvalue(d2, t))) end + elseif istokenizable(out) + @boundscheck if !isequal(keys(out), keys(d)) || !isequal(keys(out), keys(d2)) + throw(IndexError("Indices do not match")) + end + @inbounds for (t, x, x2) in zip(tokens(out), d, d2) + settokenvalue!(out, t, f(x, x2)) + end else - @boundscheck nothing # TODO check that indices match + @boundscheck if !isequal(keys(out), keys(d)) || !isequal(keys(out), keys(d2)) + throw(IndexError("Indices do not match")) + end @inbounds for i in keys(out) out[i] = f(d[i], d2[i]) end @@ -36,10 +56,19 @@ function Base.map!(f, out::AbstractDictionary, d::AbstractDictionary) @inbounds for t in tokens(out) settokenvalue!(out, t, f(gettokenvalue(d, t))) end + elseif istokenizable(out) + @boundscheck if !isequal(keys(out), keys(d)) + throw(IndexError("Indices do not match")) + end + @inbounds for (t, x) in zip(tokens(out), d) + settokenvalue!(out, t, f(x)) + end else - @boundscheck nothing # TODO check that indices match - @inbounds for i in keys(out) - out[i] = f(d[i]) + @boundscheck if !isequal(keys(out), keys(d)) + throw(IndexError("Indices do not match")) + end + @inbounds for (i, x) in zip(keys(out), d) + out[i] = f(x) end end return out @@ -60,12 +89,14 @@ end function Base.map(f, d::AbstractDictionary) out = similar(d, Core.Compiler.return_type(f, Tuple{eltype(d)})) - map!(f, out, d) + @inbounds map!(f, out, d) return out end function Base.map(f, d::AbstractDictionary, ds::AbstractDictionary...) out = similar(d, Core.Compiler.return_type(f, Tuple{eltype(d), map(eltype, ds)...})) - map!(f, out, d, ds...) + @inbounds map!(f, out, d, ds...) 
return out end + +# TODO mapreduce (mapfoldl) From cf9d94544788e45423515da6974dbdeab16b4dfd Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Wed, 10 Jun 2020 12:49:25 +1000 Subject: [PATCH 10/20] Many changes --- README.md | 29 ++- {src => contrib}/DenseHashDictionary.jl | 38 ++-- src/Dictionaries.jl | 2 +- src/Dictionary.jl | 14 +- src/HashDictionary.jl | 230 +++++++++++++++++++----- src/MappedDictionary.jl | 2 +- src/OldHashDictionary.jl | 2 +- test/HashDictionary.jl | 14 ++ test/HashIndices.jl | 7 + 9 files changed, 264 insertions(+), 74 deletions(-) rename {src => contrib}/DenseHashDictionary.jl (79%) diff --git a/README.md b/README.md index 7fd1215..b2a19f1 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ In this package we aim to devise a cohesive interface for abstract dictionaries Dictionaries share the common supertype `AbstractDictionary`, and the go-to container in this package is `HashDictionary` - which is a new hash-based implementation that serves as a replacement of Julia's inbuilt `Dict` type (using `hash` and `isequal` for key lookup and comparison). The three main difference to `Dict` are that it preserves the order of elements, it iterates much faster, and it iterates values rather than key-value pairs. +### Constructing dictionaries + You can construct one from a list of indices (or keys) and a list of values. ```julia @@ -29,7 +31,19 @@ julia> dict = HashDictionary(["a", "b", "c"], [1, 2, 3]) julia> dict["a"] 1 ``` -If you prefer, you can use the `dictionary` function to create a dictionary from something that iterates key-value pairs (note: this includes `Dict`s). + +The constructor also accepts any indexable container, preserving the keys and values. 
+```julia +julia> HashDictionary(Dict("a"=>1, "b"=>2, "c"=>3)) +3-element HashDictionary{String,Int64} + "c" │ 3 + "b" │ 2 + "a" │ 1 +``` + +If you prefer, you can use the `dictionary` function to create a dictionary from something +that iterates key-value pairs (either as a `Pair` or a two-tuple, etc), somewhat like a +`Dict` constructor. ```julia julia> dictionary(["a" => 1, "b" => 2, "c" => 3]) @@ -39,6 +53,19 @@ julia> dictionary(["a" => 1, "b" => 2, "c" => 3]) "c" │ 3 ``` +One final way to construct a dictionary is using the `index` function, which accepts a +function that constructs a "key" for each element in the collection. + +```julia +julia> index(first, ["Alice", "Bob", "Charlie"]) +3-element HashDictionary{Char,String} + 'A' │ "Alice" + 'B' │ "Bob" + 'C' │ "Charlie" +``` + +### Accessing dictionaries + The values of `HashDictionary` are mutable, or "settable", and can be modified via `setindex!`. However, just like for `Array`s, new indices (keys) are *never* created or rearranged this way. 
diff --git a/src/DenseHashDictionary.jl b/contrib/DenseHashDictionary.jl similarity index 79% rename from src/DenseHashDictionary.jl rename to contrib/DenseHashDictionary.jl index 23efffa..a270801 100644 --- a/src/DenseHashDictionary.jl +++ b/contrib/DenseHashDictionary.jl @@ -1,3 +1,7 @@ +module DenseHashDictionaries + +import Base: @propagate_inbounds +using Dictionaries export DenseHashIndices, DenseHashDictionary perfect_hash(::Any) = false @@ -41,19 +45,19 @@ end Base.length(indices::DenseHashIndices) = length(indices.values) # Token interface -istokenizable(::DenseHashIndices) = true +Dictionaries.istokenizable(::DenseHashIndices) = true -tokentype(::DenseHashIndices) = Int +Dictionaries.tokentype(::DenseHashIndices) = Int # Duration iteration the token cannot be used for deletion - we do not worry about the slots -@propagate_inbounds function iteratetoken(indices::DenseHashIndices) +@propagate_inbounds function Dictionaries.iteratetoken(indices::DenseHashIndices) if isempty(indices.values) return nothing end return ((0, 1), 1) end -@propagate_inbounds function iteratetoken(indices::DenseHashIndices, index::Int) +@propagate_inbounds function Dictionaries.iteratetoken(indices::DenseHashIndices, index::Int) if index == length(indices.values) return nothing end @@ -62,7 +66,7 @@ end end -function gettoken(indices::DenseHashIndices{I}, i::I) where {I} +function Dictionaries.gettoken(indices::DenseHashIndices{I}, i::I) where {I} full_hash = hash(i) n_slots = length(indices.slots) bit_mask = n_slots - 1 # n_slots is always a power of two @@ -85,14 +89,14 @@ function gettoken(indices::DenseHashIndices{I}, i::I) where {I} end end -@propagate_inbounds function gettokenvalue(indices::DenseHashIndices, (_slot, index)) +@propagate_inbounds function Dictionaries.gettokenvalue(indices::DenseHashIndices, (_slot, index)) return indices.values[index] end # Insertion interface -isinsertable(::DenseHashIndices) = true +Dictionaries.isinsertable(::DenseHashIndices) = true 
-function gettoken!(indices::DenseHashIndices{I}, i::I) where {I} +function Dictionaries.gettoken!(indices::DenseHashIndices{I}, i::I) where {I} full_hash = hash(i) n_slots = length(indices.slots) bit_mask = n_slots - 1 # n_slots is always a power of two @@ -131,7 +135,7 @@ function gettoken!(indices::DenseHashIndices{I}, i::I) where {I} return (false, (trial_slot, n_values + 1)) end -@propagate_inbounds function deletetoken!(indices::DenseHashIndices, (slot, index)) +@propagate_inbounds function Dictionaries.deletetoken!(indices::DenseHashIndices, (slot, index)) indices.slots[slot] = -1 splice!(indices.hashes, index) splice!(indices.values, index) @@ -168,28 +172,28 @@ Base.keys(dict::DenseHashDictionary) = dict.indices # tokens -tokenized(dict::DenseHashDictionary) = dict.values +Dictionaries.tokenized(dict::DenseHashDictionary) = dict.values # values -function istokenassigned(dict::DenseHashDictionary, (_slot, index)) +function Dictionaries.istokenassigned(dict::DenseHashDictionary, (_slot, index)) return isassigned(dict.values, index) end -@propagate_inbounds function gettokenvalue(dict::DenseHashDictionary, (_slot, index)) +@propagate_inbounds function Dictionaries.gettokenvalue(dict::DenseHashDictionary, (_slot, index)) return dict.values[index] end -issettable(::DenseHashDictionary) = true +Dictionaries.issettable(::DenseHashDictionary) = true -@propagate_inbounds function settokenvalue!(dict::DenseHashDictionary{<:Any, T}, (_slot, index), value::T) where {T} +@propagate_inbounds function Dictionaries.settokenvalue!(dict::DenseHashDictionary{<:Any, T}, (_slot, index), value::T) where {T} dict.values[index] = value return dict end # insertion -function gettoken!(dict::DenseHashDictionary{I}, i::I) where {I} +function Dictionaries.gettoken!(dict::DenseHashDictionary{I}, i::I) where {I} (hadtoken, (slot, index)) = gettoken!(keys(dict), i) if !hadtoken resize!(dict.values, length(dict.values) + 1) @@ -197,7 +201,7 @@ function 
gettoken!(dict::DenseHashDictionary{I}, i::I) where {I} return (hadtoken, (slot, index)) end -function deletetoken!(dict::DenseHashDictionary, (slot, index)) +function Dictionaries.deletetoken!(dict::DenseHashDictionary, (slot, index)) deletetoken!(dict.indices, (slot, index)) splice!(dict.values, index) return dict @@ -211,3 +215,5 @@ Base.empty(::DenseHashIndices, ::Type{I}, ::Type{T}) where {I, T} = DenseHashDic function Base.similar(indices::DenseHashIndices{I}, ::Type{T}) where {I, T} return DenseHashDictionary(indices, Vector{T}(undef, length(indices))) end + +end # module \ No newline at end of file diff --git a/src/Dictionaries.jl b/src/Dictionaries.jl index 71dd505..a0b0c64 100644 --- a/src/Dictionaries.jl +++ b/src/Dictionaries.jl @@ -8,7 +8,7 @@ export getindices, setindices! export AbstractDictionary, AbstractIndices, IndexError, Indices, HashIndices, HashDictionary, Dictionary, MappedDictionary, DictionaryView, FilteredDictionary, FilteredIndices, BroadcastedDictionary -export dictionary, distinct, disjoint, filterview +export dictionary, index, distinct, disjoint, filterview export issettable, isinsertable, set!, unset! export istokenizable, tokentype, tokens, tokenized, gettoken, gettokenvalue, istokenassigned, settokenvalue!, gettoken!, deletetoken!, sharetokens diff --git a/src/Dictionary.jl b/src/Dictionary.jl index d0ee71a..6478ec2 100644 --- a/src/Dictionary.jl +++ b/src/Dictionary.jl @@ -38,13 +38,15 @@ undefined/unitialized. Dictionary{I, T}(inds, ::UndefInitializer) where {I, T} = Dictionary{I, T}(inds, Vector{T}(undef, length(inds))) """ - Dictionary(dict::AbstractDictionary) + Dictionary(indexable) -Construct a `Dictionary` copy of `dict` with the same keys and values. +Construct a `Dictionary` from an indexable container `indexable` with the same `keys` and +`values`, equivalent to `Dictionary(keys(indexable), values(indexable))`. Note that +`indexable` may not be copied. 
""" -Dictionary(dict::AbstractDictionary) = Dictionary(keys(dict), dict) -Dictionary{I}(dict::AbstractDictionary) where {I} = Dictionary{I}(keys(dict), dict) -Dictionary{I, T}(dict::AbstractDictionary) where {I, T} = Dictionary{I, T}(keys(dict), dict) +Dictionary(indexable) = Dictionary(keys(indexable), values(indexable)) +Dictionary{I}(indexable) where {I} = Dictionary{I}(keys(indexable), values(indexable)) +Dictionary{I, T}(indexable) where {I, T} = Dictionary{I, T}(keys(indexable), values(indexable)) function Base.keys(d::Dictionary{I}) where {I} @@ -66,7 +68,7 @@ const VectorDictionary{I, T} = Dictionary{I, T, Vector{I}, Vector{T}} # token interface istokenizable(::VectorDictionary) = true -istokenassigned(d::VectorDictionary, t::Int) = isassigned(d.values, t) +istokenassigned(d::VectorDictionary, t) = isassigned(d.values, t) @propagate_inbounds gettokenvalue(d::VectorDictionary{<:Any, T}, t::Int) where {T} = d.values[t]::T # settable interface diff --git a/src/HashDictionary.jl b/src/HashDictionary.jl index 3543301..b003646 100644 --- a/src/HashDictionary.jl +++ b/src/HashDictionary.jl @@ -2,19 +2,89 @@ struct HashDictionary{I, T} <: AbstractDictionary{I, T} indices::HashIndices{I} values::Vector{T} - function HashDictionary{I, T}(inds::HashIndices{I}, values::Vector{T}) where {I, T} - # TODO make sure sizes match, deal with the fact that inds.holes might be nonzero + function HashDictionary{I, T}(inds::HashIndices{I}, values::Vector{T}, ::Nothing) where {I, T} + @assert length(values) == length(inds.values) return new(inds, values) end end +""" + HashDictionary{I,T}(;sizehint = 8) + +Construct an empty hash-based dictionary. `I` and `T` default to `Any` if not specified. A +`sizehint` may be specified to set the initial size of the hash table, which may speed up +subsequent `insert!` operations. 
+ +# Example + +```julia +julia> d = HashDictionary{Int, Int}() +0-element HashDictionary{Int64,Int64} +``` +""" HashDictionary(; sizehint = 8) = HashDictionary{Any, Any}(; sizehint = sizehint) HashDictionary{I}(; sizehint = 8) where {I} = HashDictionary{I, Any}(; sizehint = sizehint) function HashDictionary{I, T}(; sizehint = 8) where {I, T} - HashDictionary{I, T}(HashIndices{I}(; sizehint = sizehint), Vector{T}()) + HashDictionary{I, T}(HashIndices{I}(; sizehint = sizehint), Vector{T}(), nothing) end +""" + HashDictionary(indexable) + HashDictionary{I}(indexable) + HashDictionary{I,T}(indexable) + +Construct a hash-based dictionary from an indexable input `indexable`, equivalent to +`HashDictionary(keys(indexable), values(indexable))`. The input might not be copied. + +Note: to construct a dictionary from `Pair`s use the `dictionary` function. See also the +`index` function. + +# Examples + +```julia +julia> HashDictionary(Dict(:a=>1, :b=>2)) +2-element HashDictionary{Symbol,Int64} + :a │ 1 + :b │ 2 + +julia> HashDictionary(3:-1:1) +3-element HashDictionary{Int64,Int64} + 1 │ 3 + 2 │ 2 + 3 │ 1 +``` +""" +HashDictionary(indexable) = HashDictionary(keys(indexable), values(indexable)) +HashDictionary{I}(indexable) where {I} = HashDictionary{I}(keys(indexable), values(indexable)) +HashDictionary{I, T}(indexable) where {I, T} = HashDictionary{I, T}(keys(indexable), values(indexable)) + +""" + HashDictionary(inds, values) + HashDictionary{I}(inds, values) + HashDictionary{I, T}(inds, values) + +Construct a hash-based dictionary from two iterable inputs `inds` and `values`. The first +value of `inds` will be the index for the first value of `values`. The input might not be +copied. + +Note: the values of `inds` must be distinct. Consider using `dictionary(zip(inds, values))` +if they are not. See also the `index` function. 
+ +# Example + +julia> HashDictionary(["a", "b", "c"], [1, 2, 3]) +3-element HashDictionary{String,Int64} + "a" │ 1 + "b" │ 2 + "c" │ 3 + +julia> HashDictionary{String, Float64}(["a", "b", "c"], [1, 2, 3]) +3-element HashDictionary{String,Float6464} + "a" │ 1.0 + "b" │ 2.0 + "c" │ 3.0 +""" function HashDictionary(inds, values) return HashDictionary(HashIndices(inds), values) end @@ -40,63 +110,140 @@ function HashDictionary{I, T}(inds, values) where {I, T} end function HashDictionary{I, T}(inds::HashIndices{I}, values) where {I, T} + if inds.holes != 0 + inds = copy(inds) + end + iter_size = Base.IteratorSize(values) if iter_size isa Union{Base.HasLength, Base.HasShape} vs = Vector{T}(undef, length(values)) @inbounds for (i, v) in enumerate(values) vs[i] = v end - return HashDictionary{I, T}(inds, vs) + return HashDictionary{I, T}(inds, vs, nothing) else vs = Vector{T}() for v in values push!(vs, v) end - return HashDictionary{I, T}(inds, vs) + return HashDictionary{I, T}(inds, vs, nothing) end end """ dictionary(iter) -Construct a new `AbstractDictionary` from an iterable `iter` of key-value `Pair`s. The -default container type is `HashDictionary`. +Construct a new `AbstractDictionary` from an iterable `iter` of key-value `Pair`s (or other +iterables of two elements, such as a two-tuples). The default container type is +`HashDictionary`. If duplicate keys are detected, the first encountered value is retained. + +See also the `index` function. 
+ +# Examples + +```julia +julia> dictionary(["a"=>1, "b"=>2, "c"=>3]) +3-element HashDictionary{String,Int64} + "a" │ 1 + "b" │ 2 + "c" │ 3 + +julia> dictionary(["a"=>1, "b"=>2, "c"=>3, "a"=>4]) +3-element HashDictionary{String,Int64} + "a" │ 1 + "b" │ 2 + "c" │ 3 + +julia> dictionary(zip(["a","b","c"], [1,2,3])) +3-element HashDictionary{String,Int64} + "a" │ 1 + "b" │ 2 + "c" │ 3 +``` """ function dictionary(iter) - if Base.IteratorEltype(iter) === Base.EltypeUnknown() - # TODO: implement automatic widening from iterators of Base.EltypeUnkown - iter = collect(iter) - end - _dictionary(eltype(iter), iter) + return _dictionary(first, last, HashDictionary, iter) end -dictionary(p1::Pair, p2::Pair...) = dictionary((p1, p2...)) +# An auto-widening HashDictionary constructor +function _dictionary(key, value, ::Type{HashDictionary}, iter) + tmp = iterate(iter) + if tmp === nothing + IT = Base.@default_eltype(iter) + I = Core.Compiler.return_type(first, Tuple{IT}) + T = Core.Compiler.return_type(last, Tuple{IT}) + return HashDictionary{I, T}() + end + (x, s) = tmp + i = key(x) + v = value(x) + dict = HashDictionary{typeof(i), typeof(v)}() + insert!(dict, i, v) + return __dictionary(key, value, dict, iter, s) +end -function _dictionary(::Type{Pair{I, T}}, iter) where {I, T} - iter_size = Base.IteratorSize(iter) - if iter_size isa Union{Base.HasLength, Base.HasShape} - n = length(iter) - inds = Vector{I}(undef, n) - vals = Vector{T}(undef, n) - j = 1 - @inbounds for (i, v) in iter - inds[j] = i - vals[j] = v - j += 1 +# An auto-widening AbstractDictionary constructor +function __dictionary(key, value, dict, iter, s) + I = keytype(dict) + T = eltype(dict) + tmp = iterate(iter, s) + while tmp !== nothing + (x, s) = tmp + i = key(x) + v = value(x) + if !(i isa I) + new_inds = copy(keys(dict), promote_type(I, typeof(i))) + new_dict = similar(new_inds, promote_type(T, typeof(v))) + (hadtoken, token) = gettoken!(new_dict, i) + if !hadtoken + @inbounds settokenvalue!(new_dict, 
token, v) + end + return __dictionary(key, value, new_dict, iter, s) + elseif !(v isa T) + new_dict = copy(dict, promote_type(T, typeof(v))) + (hadtoken, token) = gettoken!(new_dict, i) + if !hadtoken + @inbounds settokenvalue!(new_dict, token, v) + end + return __dictionary(key, value, new_dict, iter, s) end - return HashDictionary{I, T}(inds, vals) - else - inds = Vector{I}() - vals = Vector{T}() - @inbounds for (i, v) in iter - push!(inds, i) - push!(vals, v) - end - return HashDictionary{I, T}(inds, vals) + (hadtoken, token) = gettoken!(dict, i) + if !hadtoken + @inbounds settokenvalue!(dict, token, v) + end + tmp = iterate(iter, s) end + return dict +end + +""" + index(f, iter) + +Return a dictionary associating the values `x` of iterable collection `iter` with the key +`f(x)`. If keys are repeated, only the first is kept. Somewhat similar to `unique(f, iter)` + +See also the `dictionary` function. + +# Examples + +```julia +julia> index(first, ["Alice", "Bob", "Charlie"]) +3-element HashDictionary{Char,String} + 'A' │ "Alice" + 'B' │ "Bob" + 'C' │ "Charlie" + +julia> index(iseven, 1:10) +2-element HashDictionary{Bool,Int64} + false │ 1 + true │ 2 +``` +""" +function index(f, iter) + _dictionary(f, identity, HashDictionary, iter) end -# indices +# indicesi Base.keys(dict::HashDictionary) = dict.indices @@ -166,19 +313,6 @@ end # Factories function Base.similar(indices::HashIndices{I}, ::Type{T}) where {I, T} - return HashDictionary(indices, Vector{T}(undef, length(indices.values))) + return HashDictionary{I, T}(indices, Vector{T}(undef, length(indices.values)), nothing) end -function _distinct(f, ::Type{HashDictionary}, itr) - tmp = iterate(itr) - if tmp === nothing - T = Base.@default_eltype(itr) - I = Core.Compiler.return_type(f, Tuple{T}) - return HashDictionary{I, T}() - end - (x, s) = tmp - i = f(x) - dict = HashDictionary{typeof(i), typeof(x)}() - insert!(dict, i, x) - return __distinct(f, dict, itr, s) -end diff --git a/src/MappedDictionary.jl 
b/src/MappedDictionary.jl index 7df76a1..71ef4a9 100644 --- a/src/MappedDictionary.jl +++ b/src/MappedDictionary.jl @@ -26,7 +26,7 @@ end # TODO FIXME what do about tokens when there is more than one mapped dictioanry? For now, we disable them... istokenizable(d::MappedDictionary) = false function istokenizable(d::MappedDictionary{I, T, <:Any, <:Tuple{AbstractDictionary{<:I}}}) where {I, T} - return istokenizable(d.maps[1]) + return istokenizable(d.dicts[1]) end @propagate_inbounds function gettokenvalue(d::MappedDictionary{I, T, <:Any, <:Tuple{AbstractDictionary{<:I}}}, t) where {I, T} diff --git a/src/OldHashDictionary.jl b/src/OldHashDictionary.jl index ae3389f..dc39c85 100644 --- a/src/OldHashDictionary.jl +++ b/src/OldHashDictionary.jl @@ -156,7 +156,7 @@ function Base.sizehint!(d::OldHashDictionary, sz::Int) return d end -function Base.rehash!(d::OldHashDictionary, newsz::Int = length(d.inds)) +function Base.rehash!(d::OldHashDictionary, newsz::Int = length(d.indices)) _rehash!(d.indices, d.values, newsz) return d end diff --git a/test/HashDictionary.jl b/test/HashDictionary.jl index f1da44d..5d45eba 100644 --- a/test/HashDictionary.jl +++ b/test/HashDictionary.jl @@ -168,4 +168,18 @@ @test h[i] == i+1 end end + + @testset "dictionary" begin + res = HashDictionary(['a','b','c'], [1,2,3]) + @test isequal(dictionary(pairs(res)), res) + @test isequal(dictionary(['a'=>1, 'b'=>2, 'c'=>3]), res) + @test isequal(dictionary(['a'=>1, 'b'=>2, 'c'=>3, 'a'=>4]), res) + @test isequal(dictionary((k,v) for (k,v) in pairs(res)), res) + end + + @testset "index" begin + res = HashDictionary(['A','B','C'], ["Alice","Bob","Charlie"]) + @test isequal(index(first, ["Alice", "Bob", "Charlie"]), res) + @test isequal(index(first, ["Alice", "Bob", "Charlie", "Conner"]), res) + end end \ No newline at end of file diff --git a/test/HashIndices.jl b/test/HashIndices.jl index fec66fa..c99bdc2 100644 --- a/test/HashIndices.jl +++ b/test/HashIndices.jl @@ -94,5 +94,12 @@ end end + @testset 
"distinct" begin + res = HashIndices([1,2,3]) + @test distinct(res) === res + @test isequal(distinct([1,2,3]), res) + @test isequal(distinct([1,2,3,1]), res) + @test isequal(distinct([1,2,3]), res) + end # TODO: token interface end \ No newline at end of file From 0dbbbf5f1b005bd8a9b744dc2bec88872e78e6ec Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Wed, 10 Jun 2020 13:26:44 +1000 Subject: [PATCH 11/20] Fix co-iteration --- README.md | 14 +++++++------- src/HashDictionary.jl | 4 ++-- src/HashIndices.jl | 4 ++++ src/tokens.jl | 3 ++- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b2a19f1..4e81282 100644 --- a/README.md +++ b/README.md @@ -513,7 +513,7 @@ julia> d1 = HashDictionary(1:10_000_000, 10_000_000:-1:1); julia> d2 = d1 .+ 1; julia> @btime map(+, d1, d2); - 23.362 ms (18 allocations: 76.29 MiB) + 25.712 ms (20 allocations: 76.29 MiB) ``` The `copy` below makes `keys(d1) !== keys(d2)`, disabling token co-iteration (requiring @@ -521,7 +521,7 @@ mulitple hash-table lookups per element). ```julia julia> @btime map(+, d1, $(HashDictionary(copy(keys(d2)), d2))); - 1.485 s (18 allocations: 76.29 MiB) + 61.615 ms (20 allocations: 76.29 MiB) ``` For a comparitive baseline benchmark, we can try the same with dense vectors. @@ -532,7 +532,7 @@ julia> v1 = collect(10_000_000:-1:1); julia> v2 = v1 .+ 1; julia> @btime map(+, v1, v2); - 25.449 ms (5 allocations: 76.29 MiB) + 27.587 ms (5 allocations: 76.29 MiB) ``` Here, the vector results are in line with the dictionary co-iteration! @@ -550,7 +550,7 @@ julia> function f(d1, d2) f (generic function with 1 method) julia> @btime f(d1, d2); - 2.793 s (10000090 allocations: 668.42 MiB) + 2.819 s (10000091 allocations: 668.42 MiB) ``` Unfortunately, insertion appears to be the idiomatic way of doing things with `Base.Dict`. 
@@ -569,7 +569,7 @@ julia> function g(d1, d2) g (generic function with 1 method) julia> @btime g(dict1, dict2); - 9.362 s (72 allocations: 541.17 MiB) + 9.507 s (72 allocations: 541.17 MiB) ``` The result is similar with generators, which is possibly the easiest way of dealing with @@ -577,8 +577,8 @@ The result is similar with generators, which is possibly the easiest way of deal ```julia julia> @btime Dict(i => dict1[i] + dict2[i] for i in keys(dict1)); - 13.787 s (89996503 allocations: 2.02 GiB) + 13.046 s (89996503 allocations: 2.02 GiB) ``` -This represents a 590x speedup between the first example with `HashDictionary` to this last +This represents a 500x speedup between the first example with `HashDictionary` to this last example with `Base.Dict`. diff --git a/src/HashDictionary.jl b/src/HashDictionary.jl index b003646..ce86caf 100644 --- a/src/HashDictionary.jl +++ b/src/HashDictionary.jl @@ -3,8 +3,8 @@ struct HashDictionary{I, T} <: AbstractDictionary{I, T} values::Vector{T} function HashDictionary{I, T}(inds::HashIndices{I}, values::Vector{T}, ::Nothing) where {I, T} - @assert length(values) == length(inds.values) - return new(inds, values) + @assert length(values) == length(inds.values) + return new{I,T}(inds, values) end end diff --git a/src/HashIndices.jl b/src/HashIndices.jl index 2112345..bb062bd 100644 --- a/src/HashIndices.jl +++ b/src/HashIndices.jl @@ -456,3 +456,7 @@ function __distinct!(indices::AbstractIndices, itr, s, x_old) end return indices end + +# CAUTION: I have observed Julia 1.4.2 fail to preserve the object identity of HashIndices +# (or perhaps there is a coding error I don't understand that causes it to be recreated) +sharetokens(i1::HashIndices, i2::HashIndices) = i1.slots === i2.slots \ No newline at end of file diff --git a/src/tokens.jl b/src/tokens.jl index 2bdd003..1b6686d 100644 --- a/src/tokens.jl +++ b/src/tokens.jl @@ -222,5 +222,6 @@ performed quickly (e.g. O(1) rather than O(N)). Return `false` otherwise. 
Note: the test may not be precise, this defaults to `tokens(dict1) === tokens(dict2)`. """ -sharetokens(d1, d2) = istokenizable(d1) && istokenizable(d2) && tokens(d1) === tokens(d2) +sharetokens(i1::AbstractIndices, i2::AbstractIndices) = istokenizable(i1) && istokenizable(i2) && i1 === i2 +sharetokens(d1, d2) = sharetokens(keys(d1), keys(d2)) sharetokens(d1, d2, ds...) = sharetokens(d1, d2) && sharetokens(d1, ds...) From 2426c3065b515a74b5102ecedcd74e6117986f0b Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Wed, 10 Jun 2020 15:05:38 +1000 Subject: [PATCH 12/20] Fix a test --- src/AbstractIndices.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AbstractIndices.jl b/src/AbstractIndices.jl index 61d16e5..08db8e6 100644 --- a/src/AbstractIndices.jl +++ b/src/AbstractIndices.jl @@ -115,6 +115,7 @@ julia> distinct([1,2,3,3]) ``` """ distinct(itr) = _distinct(HashIndices, itr) +distinct(inds::AbstractIndices) = inds function _distinct(::Type{T}, itr) where T out = T() From cb905c678e00db144a34da23eb0a34e588ea8bdd Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 12:53:33 +1000 Subject: [PATCH 13/20] Move OldHashIndices/OldHashDictionaries to contrib/ --- benchmark/bench_indices.jl | 177 +++++++-------- ...Dictionary.jl => DenseHashDictionaries.jl} | 0 .../OldHashDictionaries.jl | 204 +++++++++++++++++- src/Dictionaries.jl | 8 +- src/OldHashDictionary.jl | 173 --------------- test/group.jl | 39 ---- 6 files changed, 285 insertions(+), 316 deletions(-) rename contrib/{DenseHashDictionary.jl => DenseHashDictionaries.jl} (100%) rename src/OldHashIndices.jl => contrib/OldHashDictionaries.jl (56%) delete mode 100644 src/OldHashDictionary.jl delete mode 100644 test/group.jl diff --git a/benchmark/bench_indices.jl b/benchmark/bench_indices.jl index 3bcaee2..c168fb8 100644 --- a/benchmark/bench_indices.jl +++ b/benchmark/bench_indices.jl @@ -3,11 +3,12 @@ module BenchHashIndices using BenchmarkTools using Dictionaries using OrderedCollections +# 
include("../contrib/OldHashDictionaries.jl") +# using .OldHashDictionaries const suite = BenchmarkGroup() -#sizes = [(8 .^ (0:8))...] -sizes = [10, 100, 1000, 10_000] #, 10_000, 10_000_000] +sizes = [10, 100, 1000, 10_000] cutoff = 101 function build_vector_by_insertion(n) @@ -50,13 +51,13 @@ function build_hashindices_by_insertion(n) return out end -function build_old_hashindices_by_insertion(n) - out = Dictionaries.OldHashIndices{Int}() - for i in 1:n - insert!(out, i) - end - return out -end +# function build_old_hashindices_by_insertion(n) +# out = OldHashIndices{Int}() +# for i in 1:n +# insert!(out, i) +# end +# return out +# end function empty_by_deletion(set::Vector, n) for i in 1:n @@ -300,53 +301,53 @@ function basic_hash_indices_test(N) return out end -function basic_old_hash_indices_test(N) - h = Dictionaries.OldHashIndices{Int}() - out = true - for i in 1:N - insert!(h, i) - end - for i in 1:N - out &= i in h - end - for i in 1:2:N - delete!(h, i) - end - for i in 1:N - out &= (i in h) == iseven(i) - end - for i in 1:2:N - insert!(h, i) - end - for i in 1:N - out &= i in h - end - for i in 1:N - delete!(h, i) - end - out &= isempty(h) - insert!(h, 7) - out &= 7 in h - for i in 1:N - set!(h, i) - end - for i in 1:N - out &= i in h - end - for i in 1:2:N - delete!(h, i) - end - for i in 1:N - out &= (i in h) == iseven(i) - end - for i in N+1:2N - insert!(h, i) - end - for i in 1:2N - out &= (i in h) == (i > N || iseven(i)) - end - return out -end +# function basic_old_hash_indices_test(N) +# h = OldHashIndices{Int}() +# out = true +# for i in 1:N +# insert!(h, i) +# end +# for i in 1:N +# out &= i in h +# end +# for i in 1:2:N +# delete!(h, i) +# end +# for i in 1:N +# out &= (i in h) == iseven(i) +# end +# for i in 1:2:N +# insert!(h, i) +# end +# for i in 1:N +# out &= i in h +# end +# for i in 1:N +# delete!(h, i) +# end +# out &= isempty(h) +# insert!(h, 7) +# out &= 7 in h +# for i in 1:N +# set!(h, i) +# end +# for i in 1:N +# out &= i in h +# end 
+# for i in 1:2:N +# delete!(h, i) +# end +# for i in 1:N +# out &= (i in h) == iseven(i) +# end +# for i in N+1:2N +# insert!(h, i) +# end +# for i in 1:2N +# out &= (i in h) == (i > N || iseven(i)) +# end +# return out +# end for n in sizes r = 1:n @@ -380,9 +381,9 @@ for n in sizes even_hash_indices = HashIndices(2:2:n) odd_hash_indices = HashIndices(1:2:n) - old_hash_indices = Dictionaries.OldHashIndices(r) - even_old_hash_indices = Dictionaries.OldHashIndices(2:2:n) - odd_old_hash_indices = Dictionaries.OldHashIndices(1:2:n) + # old_hash_indices = OldHashIndices(r) + # even_old_hash_indices = OldHashIndices(2:2:n) + # odd_old_hash_indices = OldHashIndices(1:2:n) suite_n = suite["$n"] = BenchmarkGroup() @@ -392,7 +393,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable OrderedSet($r) n < cutoff && (s["Indices"] = @benchmarkable Indices($r)) s["HashIndices"] = @benchmarkable HashIndices($r) - s["OldHashIndices"] = @benchmarkable HashIndices($r) + #s["OldHashIndices"] = @benchmarkable HashIndices($r) s = suite_n["build by insertion ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector (push!)"] = @benchmarkable build_set_by_insertion($n)) @@ -400,7 +401,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable build_ordered_set_by_insertion($n) n < cutoff && (s["Set"] = @benchmarkable build_indices_by_insertion($n)) s["HashIndices"] = @benchmarkable build_hashindices_by_insertion($n) - s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) + #s["OldHashIndices"] = @benchmarkable build_old_hashindices_by_insertion($n) s = suite_n["copy ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector (pop!)"] = @benchmarkable copy($vec)) @@ -408,7 +409,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable copy($ordered_set) n < cutoff && (s["Indices"] = @benchmarkable copy($indices)) s["HashIndices"] = @benchmarkable copy($hash_indices) - s["OldHashIndices"] = @benchmarkable copy($old_hash_indices) + #s["OldHashIndices"] = @benchmarkable copy($old_hash_indices) s = 
suite_n["copy and empty by deletion ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector (pop!)"] = @benchmarkable empty_by_deletion(copy($vec), $n)) @@ -416,21 +417,21 @@ for n in sizes s["OrderedSet"] = @benchmarkable empty_by_deletion(copy($ordered_set), $n) n < cutoff && (s["Indices"] = @benchmarkable empty_by_deletion(copy($indices), $n)) s["HashIndices"] = @benchmarkable empty_by_deletion(copy($hash_indices), $n) - s["OldHashIndices"] = @benchmarkable empty_by_deletion(copy($old_hash_indices), $n) + #s["OldHashIndices"] = @benchmarkable empty_by_deletion(copy($old_hash_indices), $n) s = suite_n["insertion/deletion tests ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable basic_set_test($n) s["OrderedSet"] = @benchmarkable basic_ordered_set_test($n) n < cutoff && (s["Indices"] = @benchmarkable basic_indices_test($n)) s["HashIndices"] = @benchmarkable basic_hash_indices_test($n) - s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) + #s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) s = suite_n["insertion/deletion tests ($n)"] = BenchmarkGroup() s["Set"] = @benchmarkable basic_set_test($n) s["OrderedSet"] = @benchmarkable basic_ordered_set_test($n) n < cutoff && (s["Indices"] = @benchmarkable basic_indices_test($n)) s["HashIndices"] = @benchmarkable basic_hash_indices_test($n) - s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) + #s["OldHashIndices"] = @benchmarkable basic_old_hash_indices_test($n) s = suite_n["unique/distinct (high uniqueness, unsorted) ($n)"] = BenchmarkGroup() s["Vector (unique)"] = @benchmarkable unique($mostly_unique) @@ -462,7 +463,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable all_in($ordered_set, $n) n < cutoff && (s["Indices"] = @benchmarkable all_in($indices, $n)) s["HashIndices"] = @benchmarkable all_in($hash_indices, $n) - s["OldHashIndices"] = @benchmarkable all_in($old_hash_indices, $n) + #s["OldHashIndices"] = @benchmarkable all_in($old_hash_indices, $n) s = suite_n["not 
in ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable not_in($vec, $n)) @@ -470,7 +471,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable not_in($ordered_set, $n) n < cutoff && (s["Indices"] = @benchmarkable not_in($indices, $n)) s["HashIndices"] = @benchmarkable not_in($hash_indices, $n) - s["OldHashIndices"] = @benchmarkable not_in($old_hash_indices, $n) + #s["OldHashIndices"] = @benchmarkable not_in($old_hash_indices, $n) s = suite_n["count ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable count(iseven, $vec)) @@ -478,7 +479,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable count(iseven, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable count(iseven, $indices)) s["HashIndices"] = @benchmarkable count(iseven, $hash_indices) - s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable count(iseven, $old_hash_indices) s = suite_n["sum ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable sum($vec)) @@ -486,7 +487,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable sum($ordered_set) n < cutoff && (s["Indices"] = @benchmarkable sum($indices)) s["HashIndices"] = @benchmarkable sum($hash_indices) - s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) + #s["OldHashIndices"] = @benchmarkable sum($old_hash_indices) s = suite_n["foreach ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable foreachsum($vec)) @@ -494,7 +495,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable foreachsum($ordered_set) n < cutoff && (s["Indices"] = @benchmarkable foreachsum($indices)) s["HashIndices"] = @benchmarkable foreachsum($hash_indices) - s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) + #s["OldHashIndices"] = @benchmarkable foreachsum($old_hash_indices) s = suite_n["filter-map-reduce via generator ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable sum($(2x for x in vec if isodd(x)))) @@ -502,7 +503,7 @@ for 
n in sizes s["OrderedSet"] = @benchmarkable sum($(2x for x in ordered_set if isodd(x))) n < cutoff && (s["Indices"] = @benchmarkable sum($(2x for x in indices if isodd(x)))) s["HashIndices"] = @benchmarkable sum($(2x for x in hash_indices if isodd(x))) - s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if isodd(x))) + #s["OldHashIndices"] = @benchmarkable sum($(2x for x in old_hash_indices if isodd(x))) s = suite_n["filter (most) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable filter($pred1, $vec)) @@ -510,7 +511,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable filter($pred1, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable filter($pred1, $indices)) s["HashIndices"] = @benchmarkable filter($pred1, $hash_indices) - s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable filter($pred1, $old_hash_indices) s = suite_n["filter (half) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable filter(iseven, $vec)) @@ -518,7 +519,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable filter(iseven, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable filter(iseven, $indices)) s["HashIndices"] = @benchmarkable filter(iseven, $hash_indices) - s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable filter(iseven, $old_hash_indices) s = suite_n["filter (few) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable filter($pred2, $vec)) @@ -526,7 +527,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable filter($pred2, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable filter($pred2, $indices)) s["HashIndices"] = @benchmarkable filter($pred2, $hash_indices) - s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable filter($pred2, $old_hash_indices) s = suite_n["copy and filter! 
(most) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable filter!($pred1, copy($vec))) @@ -534,7 +535,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable filter!($pred1, copy($ordered_set)) n < cutoff && (s["Indices"] = @benchmarkable filter!($pred1, copy($indices))) s["HashIndices"] = @benchmarkable filter!($pred1, copy($hash_indices)) - s["OldHashIndices"] = @benchmarkable filter!($pred1, copy($old_hash_indices)) + #s["OldHashIndices"] = @benchmarkable filter!($pred1, copy($old_hash_indices)) s = suite_n["copy and filter! (half) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable filter!(iseven, copy($vec))) @@ -542,7 +543,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable filter!(iseven, copy($ordered_set)) n < cutoff && (s["Indices"] = @benchmarkable filter!(iseven, copy($indices))) s["HashIndices"] = @benchmarkable filter!(iseven, copy($hash_indices)) - s["OldHashIndices"] = @benchmarkable filter!(iseven, copy($old_hash_indices)) + #s["OldHashIndices"] = @benchmarkable filter!(iseven, copy($old_hash_indices)) s = suite_n["copy and filter! 
(few) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable filter!($pred2, copy($vec))) @@ -550,7 +551,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable filter!($pred2, copy($ordered_set)) n < cutoff && (s["Indices"] = @benchmarkable filter!($pred2, copy($indices))) s["HashIndices"] = @benchmarkable filter!($pred2, copy($hash_indices)) - s["OldHashIndices"] = @benchmarkable filter!($pred2, copy($old_hash_indices)) + #s["OldHashIndices"] = @benchmarkable filter!($pred2, copy($old_hash_indices)) s = suite_n["union ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable union($even_vec, $odd_vec)) @@ -558,7 +559,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable union($even_ordered_set, $odd_ordered_set) n < cutoff && (s["Indices"] = @benchmarkable union($even_indices, $even_indices)) s["HashIndices"] = @benchmarkable union($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) + #s["OldHashIndices"] = @benchmarkable union($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["intersect (empty) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $odd_vec)) @@ -566,7 +567,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $odd_ordered_set) n < cutoff && (s["Indices"] = @benchmarkable intersect($even_indices, $odd_indices)) s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) + #s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["intersect (half) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable intersect($even_vec, $vec)) @@ -574,7 +575,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable intersect($even_ordered_set, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable 
intersect($even_indices, $indices)) s["HashIndices"] = @benchmarkable intersect($even_hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable intersect($even_old_hash_indices, $old_hash_indices) s = suite_n["intersect (whole) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable intersect($vec, $vec)) @@ -582,7 +583,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable intersect($ordered_set, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable intersect($indices, $indices)) s["HashIndices"] = @benchmarkable intersect($hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable intersect($old_hash_indices, $old_hash_indices) s = suite_n["setdiff (whole) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $odd_vec)) @@ -590,7 +591,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $odd_ordered_set) n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $odd_indices)) s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) + #s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["setdiff (half) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable setdiff($even_vec, $vec)) @@ -598,7 +599,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable setdiff($even_ordered_set, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable setdiff($even_indices, $indices)) s["HashIndices"] = @benchmarkable setdiff($even_hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable setdiff($even_old_hash_indices, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable 
setdiff($even_old_hash_indices, $old_hash_indices) s = suite_n["setdiff (empty) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable setdiff($vec, $vec)) @@ -606,7 +607,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable setdiff($ordered_set, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable setdiff($indices, $indices)) s["HashIndices"] = @benchmarkable setdiff($hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable setdiff($old_hash_indices, $old_hash_indices) s = suite_n["symdiff (whole) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $odd_vec)) @@ -614,7 +615,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $odd_ordered_set) n < cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $odd_indices)) s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $odd_old_hash_indices) + #s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $odd_old_hash_indices) s = suite_n["symdiff (left half) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $odd_vec)) @@ -622,7 +623,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $odd_ordered_set) n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $odd_indices)) s["HashIndices"] = @benchmarkable symdiff($hash_indices, $odd_hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) + #s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $odd_old_hash_indices) s = suite_n["symdiff (right half) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable symdiff($even_vec, $vec)) @@ -630,7 +631,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable symdiff($even_ordered_set, $ordered_set) n < 
cutoff && (s["Indices"] = @benchmarkable symdiff($even_indices, $indices)) s["HashIndices"] = @benchmarkable symdiff($even_hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable symdiff($even_old_hash_indices, $old_hash_indices) s = suite_n["symdiff (empty) ($n)"] = BenchmarkGroup() n < cutoff && (s["Vector"] = @benchmarkable symdiff($vec, $vec)) @@ -638,7 +639,7 @@ for n in sizes s["OrderedSet"] = @benchmarkable symdiff($ordered_set, $ordered_set) n < cutoff && (s["Indices"] = @benchmarkable symdiff($indices, $indices)) s["HashIndices"] = @benchmarkable symdiff($hash_indices, $hash_indices) - s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) + #s["OldHashIndices"] = @benchmarkable symdiff($old_hash_indices, $old_hash_indices) end end # module diff --git a/contrib/DenseHashDictionary.jl b/contrib/DenseHashDictionaries.jl similarity index 100% rename from contrib/DenseHashDictionary.jl rename to contrib/DenseHashDictionaries.jl diff --git a/src/OldHashIndices.jl b/contrib/OldHashDictionaries.jl similarity index 56% rename from src/OldHashIndices.jl rename to contrib/OldHashDictionaries.jl index 4708104..b45521e 100644 --- a/src/OldHashIndices.jl +++ b/contrib/OldHashDictionaries.jl @@ -1,3 +1,10 @@ +module OldHashDictionaries + +using Dictionaries +using Base: @propagate_inbounds + +export OldHashIndices, OldHashDictionary + # These can be changed, to trade off better performance for space const global maxallowedprobe = 16 const global maxprobeshift = 6 @@ -67,8 +74,8 @@ Base.length(h::OldHashIndices) = h.count ## Token interface -istokenizable(::OldHashIndices) = true -tokentype(::OldHashIndices) = Int +Dictionaries.istokenizable(::OldHashIndices) = true +Dictionaries.tokentype(::OldHashIndices) = Int @propagate_inbounds isslotempty(h::OldHashIndices, i::Int) = h.slots[i] == 0x0 @propagate_inbounds isslotfilled(h::OldHashIndices, 
i::Int) = h.slots[i] == 0x1 @@ -86,7 +93,7 @@ function skip_deleted(h::OldHashIndices, i) return i end -@propagate_inbounds function iteratetoken(h::OldHashIndices{T}) where {T} +@propagate_inbounds function Dictionaries.iteratetoken(h::OldHashIndices{T}) where {T} idx = skip_deleted(h, h.idxfloor) h.idxfloor = idx # An optimization to skip unnecessary elements when iterating multiple times @@ -97,7 +104,7 @@ end end end -@propagate_inbounds function iteratetoken(h::OldHashIndices{T}, idx::Int) where {T} +@propagate_inbounds function Dictionaries.iteratetoken(h::OldHashIndices{T}, idx::Int) where {T} idx = skip_deleted(h, idx) if idx > length(h.inds) @@ -113,7 +120,7 @@ function hashtoken(key, sz::Int) (((hash(key)%Int) & (sz-1)) + 1)::Int end -function gettoken(h::OldHashIndices{T}, key::T) where {T} +function Dictionaries.gettoken(h::OldHashIndices{T}, key::T) where {T} sz = length(h.inds) iter = 0 maxprobe = h.maxprobe @@ -136,13 +143,13 @@ function gettoken(h::OldHashIndices{T}, key::T) where {T} end # gettokenvalue -@propagate_inbounds function gettokenvalue(h::OldHashIndices, token::Int) +@propagate_inbounds function Dictionaries.gettokenvalue(h::OldHashIndices, token::Int) return h.inds[token] end # insertable interface -isinsertable(::OldHashIndices) = true +Dictionaries.isinsertable(::OldHashIndices) = true function Base.empty!(h::OldHashIndices{T}) where {T} fill!(h.slots, 0x0) # It should be OK to reduce this back to some smaller size. 
@@ -155,7 +162,7 @@ function Base.empty!(h::OldHashIndices{T}) where {T} return h end -function Base.rehash!(h::OldHashIndices, newsz::Int = length(h.inds)) +function rehash!(h::OldHashIndices, newsz::Int = length(h.inds)) _rehash!(h, nothing, newsz) return h end @@ -226,7 +233,7 @@ end -function gettoken!(h::OldHashIndices{T}, key::T) where {T} +function Dictionaries.gettoken!(h::OldHashIndices{T}, key::T) where {T} (token, _) = _gettoken!(h, nothing, key) # This will make sure a slot is available at `token` (or `-token` if it is new) if token < 0 @@ -316,7 +323,7 @@ end end -function deletetoken!(h::OldHashIndices{T}, token::Int) where {T} +function Dictionaries.deletetoken!(h::OldHashIndices{T}, token::Int) where {T} h.slots[token] = 0x2 isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), h.inds, token-1) @@ -330,3 +337,180 @@ Base.filter!(pred, h::OldHashIndices) = Base.unsafe_filter!(pred, h) # The default insertable indices Base.empty(d::OldHashIndices, ::Type{T}) where {T} = OldHashIndices{T}() + + +mutable struct OldHashDictionary{I,T} <: AbstractDictionary{I, T} + indices::OldHashIndices{I} + values::Vector{T} + + OldHashDictionary{I, T}(indices::OldHashIndices{I}, values::Vector{T}, ::Nothing) where {I, T} = new(indices, values) +end + +""" + OldHashDictionary{I, T}() + +Construct an empty `OldHashDictionary` with index type `I` and element type `T`. This type of +dictionary uses hashes for fast lookup and insertion, and is both mutable and insertable. +(See `issettable` and `isinsertable`). 
+""" +function OldHashDictionary{I, T}(; sizehint::Int = 16) where {I, T} + indices = OldHashIndices{I}(; sizehint=sizehint) + OldHashDictionary{I, T}(indices, Vector{T}(undef, length(indices.slots)), nothing) +end +OldHashDictionary{I}() where {I} = OldHashDictionary{I, Any}() +OldHashDictionary() = OldHashDictionary{Any}() + +""" + OldHashDictionary{I, T}(indices, undef::UndefInitializer) + +Construct a `OldHashDictionary` with index type `I` and element type `T`. The container is +initialized with `keys` that match the values of `indices`, but the values are unintialized. +""" +function OldHashDictionary{I, T}(indices, ::UndefInitializer) where {I, T} + return OldHashDictionary{I, T}(OldHashIndices{I}(indices), undef) +end + +function OldHashDictionary{I, T}(h::OldHashIndices{I}, ::UndefInitializer) where {I, T} + return OldHashDictionary{I, T}(h, Vector{T}(undef, length(h.slots)), nothing) +end + +function OldHashDictionary{I, T}(indices::OldHashIndices{I}, values) where {I, T} + vals = Vector{T}(undef, length(indices.slots)) + d = OldHashDictionary{I, T}(indices, vals, nothing) + + @inbounds for (i, v) in zip(tokens(indices), values) + vals[i] = v + end + + return d +end + +""" + OldHashDictionary(indices, values) + OldHashDictionary{I}(indices, values) + OldHashDictionary{I, T}(indices, values) + +Construct a `OldHashDictionary` with indices from `indices` and values from `values`, matched +in iteration order. 
+""" +function OldHashDictionary{I, T}(indices, values) where {I, T} + iter_size = Base.IteratorSize(indices) + if iter_size isa Union{Base.HasLength, Base.HasShape} + d = OldHashDictionary{I, T}(; sizehint = length(indices)*2) + else + d = OldHashDictionary{I, T}() + end + + for (i, v) in zip(indices, values) + insert!(d, i, v) + end + + return d +end +function OldHashDictionary{I}(indices, values) where {I} + if Base.IteratorEltype(values) === Base.EltypeUnknown() + # TODO: implement automatic widening from iterators of Base.EltypeUnkown + values = collect(values) + end + + return OldHashDictionary{I, eltype(values)}(indices, values) +end + +function OldHashDictionary(indices, values) + if Base.IteratorEltype(indices) === Base.EltypeUnknown() + # TODO: implement automatic widening from iterators of Base.EltypeUnkown + indices = collect(indices) + end + + return OldHashDictionary{eltype(indices)}(indices, values) +end + +""" + OldHashDictionary(dict::AbstractDictionary) + OldHashDictionary{I}(dict::AbstractDictionary) + OldHashDictionary{I, T}(dict::AbstractDictionary) + +Construct a copy of `dict` with the same keys and values. +(For copying an `AbstractDict` or other iterable of `Pair`s, see `dictionary`). 
+""" +OldHashDictionary(dict::AbstractDictionary) = OldHashDictionary(keys(dict), dict) +OldHashDictionary{I}(dict::AbstractDictionary) where {I} = OldHashDictionary{I}(keys(dict), dict) +OldHashDictionary{I, T}(dict::AbstractDictionary) where {I, T} = OldHashDictionary{I, T}(keys(dict), dict) + +## Implementation + +Base.keys(d::OldHashDictionary) = d.indices +Dictionaries.isinsertable(d::OldHashDictionary) = true +Dictionaries.issettable(d::OldHashDictionary) = true + +@propagate_inbounds function Dictionaries.gettoken(d::OldHashDictionary{I}, i::I) where {I} + return gettoken(keys(d), i) +end + +@inline function Dictionaries.gettokenvalue(d::OldHashDictionary, token) + return @inbounds d.values[token] +end + +function Dictionaries.istokenassigned(d::OldHashDictionary, token) + return isassigned(d.values, token) +end + +@inline function Dictionaries.settokenvalue!(d::OldHashDictionary{I, T}, token, value::T) where {I, T} + @inbounds d.values[token] = value + return d +end + +function Dictionaries.gettoken!(d::OldHashDictionary{T}, key::T) where {T} + indices = keys(d) + (token, values) = _gettoken!(indices, d.values, key) + if token < 0 + (token, values) = _insert!(indices, values, key, -token) + d.values = values + return (false, token) + else + d.values = values + return (true, token) + end +end + +function Base.copy(d::OldHashDictionary{I, T}, ::Type{I}, ::Type{T}) where {I, T} + return OldHashDictionary{I, T}(d.indices, copy(d.values), nothing) +end + +Dictionaries.tokenized(d::OldHashDictionary) = d.values + +function Base.empty!(d::OldHashDictionary) + empty!(d.indices) + empty!(d.values) + resize!(d.values, length(keys(d).slots)) + return d +end + +function Dictionaries.deletetoken!(d::OldHashDictionary{I, T}, token) where {I, T} + deletetoken!(keys(d), token) + isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), d.values, token-1) + return d +end + +function Base.sizehint!(d::OldHashDictionary, sz::Int) + d.values = _sizehint!(d.indices, d.values, 
sz) + return d +end + +function Base.rehash!(d::OldHashDictionary, newsz::Int = length(d.indices)) + _rehash!(d.indices, d.values, newsz) + return d +end + +Base.filter!(pred, d::OldHashDictionary) = Base.unsafe_filter!(pred, d) + +# For `OldHashIndices` we don't copy the indices, we allow the `keys` to remain identical (`===`) +function Base.similar(indices::OldHashIndices{I}, ::Type{T}) where {I, T} + return OldHashDictionary{I, T}(indices, undef) +end + +function Base.empty(indices::OldHashIndices, ::Type{I}, ::Type{T}) where {I, T} + return OldHashDictionary{I, T}() +end + +end # module \ No newline at end of file diff --git a/src/Dictionaries.jl b/src/Dictionaries.jl index a0b0c64..92c978c 100644 --- a/src/Dictionaries.jl +++ b/src/Dictionaries.jl @@ -32,20 +32,16 @@ include("HashIndices.jl") include("HashDictionary.jl") include("MappedDictionary.jl") -include("OldHashIndices.jl") -include("OldHashDictionary.jl") - end # module # # TODO # # * Improved printing - don't calculate length (beyond some cutoff) if it is `SizeUnknown` and limit=true, fix indentiation problems for wider values -# * `hash` and `isless` -# * TODO: have `delete!` return next element, `deletetoken!` return next token. +# * TODO: have `delete!` return next key, `deletetoken!` return next token. # For these kinds of algorithms, probably need: firstindex, firsttoken, nextind, prevind, # nexttoken, prevtoken, lastindex, lasttoken. # * A surface interface for updates like https://github.com/JuliaLang/julia/pull/31367 -# * Soon we will have the concept of "ordered" indices/sets (sort-based dictionaries and +# * More operations for "ordered" indices/sets (sort-based dictionaries and # B-trees). We can probably formalize an interface around a trait here. Certain operations # like slicing out an interval or performing a sort-merge co-iteration for `merge` become # feasible. 
diff --git a/src/OldHashDictionary.jl b/src/OldHashDictionary.jl deleted file mode 100644 index dc39c85..0000000 --- a/src/OldHashDictionary.jl +++ /dev/null @@ -1,173 +0,0 @@ -mutable struct OldHashDictionary{I,T} <: AbstractDictionary{I, T} - indices::OldHashIndices{I} - values::Vector{T} - - OldHashDictionary{I, T}(indices::OldHashIndices{I}, values::Vector{T}, ::Nothing) where {I, T} = new(indices, values) -end - -""" - OldHashDictionary{I, T}() - -Construct an empty `OldHashDictionary` with index type `I` and element type `T`. This type of -dictionary uses hashes for fast lookup and insertion, and is both mutable and insertable. -(See `issettable` and `isinsertable`). -""" -function OldHashDictionary{I, T}(; sizehint::Int = 16) where {I, T} - indices = OldHashIndices{I}(; sizehint=sizehint) - OldHashDictionary{I, T}(indices, Vector{T}(undef, length(indices.slots)), nothing) -end -OldHashDictionary{I}() where {I} = OldHashDictionary{I, Any}() -OldHashDictionary() = OldHashDictionary{Any}() - -""" - OldHashDictionary{I, T}(indices, undef::UndefInitializer) - -Construct a `OldHashDictionary` with index type `I` and element type `T`. The container is -initialized with `keys` that match the values of `indices`, but the values are unintialized. 
-""" -function OldHashDictionary{I, T}(indices, ::UndefInitializer) where {I, T} - return OldHashDictionary{I, T}(OldHashIndices{I}(indices), undef) -end - -function OldHashDictionary{I, T}(h::OldHashIndices{I}, ::UndefInitializer) where {I, T} - return OldHashDictionary{I, T}(h, Vector{T}(undef, length(h.slots)), nothing) -end - -function OldHashDictionary{I, T}(indices::OldHashIndices{I}, values) where {I, T} - vals = Vector{T}(undef, length(indices.slots)) - d = OldHashDictionary{I, T}(indices, vals, nothing) - - @inbounds for (i, v) in zip(tokens(indices), values) - vals[i] = v - end - - return d -end - -""" - OldHashDictionary(indices, values) - OldHashDictionary{I}(indices, values) - OldHashDictionary{I, T}(indices, values) - -Construct a `OldHashDictionary` with indices from `indices` and values from `values`, matched -in iteration order. -""" -function OldHashDictionary{I, T}(indices, values) where {I, T} - iter_size = Base.IteratorSize(indices) - if iter_size isa Union{Base.HasLength, Base.HasShape} - d = OldHashDictionary{I, T}(; sizehint = length(indices)*2) - else - d = OldHashDictionary{I, T}() - end - - for (i, v) in zip(indices, values) - insert!(d, i, v) - end - - return d -end -function OldHashDictionary{I}(indices, values) where {I} - if Base.IteratorEltype(values) === Base.EltypeUnknown() - # TODO: implement automatic widening from iterators of Base.EltypeUnkown - values = collect(values) - end - - return OldHashDictionary{I, eltype(values)}(indices, values) -end - -function OldHashDictionary(indices, values) - if Base.IteratorEltype(indices) === Base.EltypeUnknown() - # TODO: implement automatic widening from iterators of Base.EltypeUnkown - indices = collect(indices) - end - - return OldHashDictionary{eltype(indices)}(indices, values) -end - -""" - OldHashDictionary(dict::AbstractDictionary) - OldHashDictionary{I}(dict::AbstractDictionary) - OldHashDictionary{I, T}(dict::AbstractDictionary) - -Construct a copy of `dict` with the same keys and 
values. -(For copying an `AbstractDict` or other iterable of `Pair`s, see `dictionary`). -""" -OldHashDictionary(dict::AbstractDictionary) = OldHashDictionary(keys(dict), dict) -OldHashDictionary{I}(dict::AbstractDictionary) where {I} = OldHashDictionary{I}(keys(dict), dict) -OldHashDictionary{I, T}(dict::AbstractDictionary) where {I, T} = OldHashDictionary{I, T}(keys(dict), dict) - -## Implementation - -Base.keys(d::OldHashDictionary) = d.indices -isinsertable(d::OldHashDictionary) = true -issettable(d::OldHashDictionary) = true - -@propagate_inbounds function gettoken(d::OldHashDictionary{I}, i::I) where {I} - return gettoken(keys(d), i) -end - -@inline function gettokenvalue(d::OldHashDictionary, token) - return @inbounds d.values[token] -end - -function istokenassigned(d::OldHashDictionary, token) - return isassigned(d.values, token) -end - -@inline function settokenvalue!(d::OldHashDictionary{I, T}, token, value::T) where {I, T} - @inbounds d.values[token] = value - return d -end - -function gettoken!(d::OldHashDictionary{T}, key::T) where {T} - indices = keys(d) - (token, values) = _gettoken!(indices, d.values, key) - if token < 0 - (token, values) = _insert!(indices, values, key, -token) - d.values = values - return (false, token) - else - d.values = values - return (true, token) - end -end - -function Base.copy(d::OldHashDictionary{I, T}, ::Type{I}, ::Type{T}) where {I, T} - return OldHashDictionary{I, T}(d.indices, copy(d.values), nothing) -end - -tokenized(d::OldHashDictionary) = d.values - -function Base.empty!(d::OldHashDictionary) - empty!(d.indices) - empty!(d.values) - resize!(d.values, length(keys(d).slots)) - return d -end - -function deletetoken!(d::OldHashDictionary{I, T}, token) where {I, T} - deletetoken!(keys(d), token) - isbitstype(T) || ccall(:jl_arrayunset, Cvoid, (Any, UInt), d.values, token-1) - return d -end - -function Base.sizehint!(d::OldHashDictionary, sz::Int) - d.values = _sizehint!(d.indices, d.values, sz) - return d -end - 
-function Base.rehash!(d::OldHashDictionary, newsz::Int = length(d.indices)) - _rehash!(d.indices, d.values, newsz) - return d -end - -Base.filter!(pred, d::OldHashDictionary) = Base.unsafe_filter!(pred, d) - -# For `OldHashIndices` we don't copy the indices, we allow the `keys` to remain identical (`===`) -function Base.similar(indices::OldHashIndices{I}, ::Type{T}) where {I, T} - return OldHashDictionary{I, T}(indices, undef) -end - -function Base.empty(indices::OldHashIndices, ::Type{I}, ::Type{T}) where {I, T} - return OldHashDictionary{I, T}() -end \ No newline at end of file diff --git a/test/group.jl b/test/group.jl deleted file mode 100644 index 77f1e78..0000000 --- a/test/group.jl +++ /dev/null @@ -1,39 +0,0 @@ -@testset "group" begin - @test Dictionaries.group(identity, 11:20) == HashDictionary(11:20, (x->[x]).(11:20)) - @test Dictionaries.group(iseven, 1:10) == dictionary(true => [2,4,6,8,10], false => [1,3,5,7,9]) - - @test Dictionaries.group(iseven, x -> x*2, 1:10) == dictionary(true => [4,8,12,16,20], false => [2,6,10,14,18]) - - @test Dictionaries.group((x,y) -> iseven(x+y), (x,y) -> x, 1:10, [1,3,4,2,5,6,4,2,3,9]) == dictionary(true => [1,4,5,6,8,9], false => [2,3,7,10]) -end - -#@testset "groupunique" begin - #@test Dictionaries.groupunique(identity, 11:20)::HashIndices{Int} == HashIndices(11:20) - #@test Dictionaries.groupunique(identity, 11:20)::HashIndices{Int} == HashIndices(11:20) -#end -#= -@testset "groupinds" begin - @test Dictionaries.groupinds(identity, 11:20) == dictionary(Pair.(11:20, (x->[x]).(1:10))) - @test Dictionaries.groupinds(iseven, 11:20) == dictionary(true => [2,4,6,8,10], false => [1,3,5,7,9]) -end - -@testset "groupview" begin - @test Dictionaries.groupview(identity, 11:20)::Groups == group(identity, 11:20)::dictionary - @test Dictionaries.groupview(iseven, 11:20)::Groups == group(iseven, 11:20)::dictionary -end -=# -@testset "groupreduce" begin - @test Dictionaries.groupreduce(identity, +, 1:10) == dictionary(Pair.(1:10, 
1:10)) - @test Dictionaries.groupreduce(iseven, +, 1:10) == dictionary(true => 30, false => 25) - - @test Dictionaries.groupreduce(iseven, x -> x*2, +, 1:10) == dictionary(true => 60, false => 50) - - @test Dictionaries.groupreduce(iseven, x -> x*2, +, 1:10; init=10) == dictionary(true => 70, false => 60) - - @test Dictionaries.groupreduce((x,y) -> iseven(x+y), (x,y) -> x+y, +, 1:10, 1:10; init=10) == dictionary(true => 120) - @test Dictionaries.groupreduce((x,y) -> iseven(x+y), (x,y) -> x+y, +, 1:10, [1,3,4,2,5,6,4,2,3,9]; init=10) == dictionary(true => 62, false => 52) - - @test Dictionaries.groupcount(iseven, 1:10) == dictionary(true => 5, false => 5) - @test Dictionaries.groupsum(iseven, 1:10) == dictionary(true => 30, false => 25) - @test Dictionaries.groupprod(iseven, 1:10) == dictionary(true => 2*4*6*8*10, false => 1*3*5*7*9) -end \ No newline at end of file From 5bc53c53ed9bfa72001044762da3c38e2c119f41 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 14:54:50 +1000 Subject: [PATCH 14/20] More tests --- LICENSE.md | 2 +- README.md | 2 +- src/AbstractDictionary.jl | 42 ++++++------------------- src/HashDictionary.jl | 8 ++--- src/insertion.jl | 2 +- test/HashDictionary.jl | 55 ++++++++++++++++++++++++++++---- test/HashIndices.jl | 66 +++++++++++++++++++++++++++++++++++++-- test/filter.jl | 22 ++++++++----- 8 files changed, 142 insertions(+), 57 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index ecd7077..68713a4 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ The Dictionaries.jl package is licensed under the MIT "Expat" License: -> Copyright (c) 2018-2019: Andy Ferris. +> Copyright (c) 2018-2020: Andy Ferris. 
> > Permission is hereby granted, free of charge, to any person obtaining a copy > of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 4e81282..4a75cde 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ ![Test Status](https://github.com/andyferris/Dictionaries.jl/workflows/Test/badge.svg) [![Codecov](https://codecov.io/gh/andyferris/Dictionaries.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/andyferris/Dictionaries.jl) -This package is still quite young - new features are being added and some (low-level) interfaces may be tweaked in the future, but things should be stable enough for general usage. Contributions welcome - please submit an issue or PR! +This package is somewhat young - new features are being added and some (low-level) interfaces may be tweaked in the future, but things should be stable enough for general usage. Contributions welcome - please submit an issue or PR! ## Motivation diff --git a/src/AbstractDictionary.jl b/src/AbstractDictionary.jl index 15a0bab..8617c8e 100644 --- a/src/AbstractDictionary.jl +++ b/src/AbstractDictionary.jl @@ -276,38 +276,6 @@ function _distinct(f, ::Type{T}, itr) where T return out end -# An auto-widening AbstractDictionary constructor -function __distinct(f, dict, itr, s) - I = keytype(dict) - T = eltype(dict) - tmp = iterate(itr, s) - while tmp !== nothing - (x, s) = tmp - i = f(x) - if !(i isa I) - new_inds = copy(keys(dict), promote_type(I, typeof(i))) - new_dict = similar(new_inds, promote_type(T, typeof(x))) - (hadtoken, token) = gettoken!(new_dict, i) - if !hadtoken - @inbounds settokenvalue!(new_dict, token, x) - end - return __distinct(f, new_dict, itr, s) - elseif !(x isa T) - new_dict = copy(dict, promote_type(T, typeof(x))) - (hadtoken, token) = gettoken!(new_dict, i) - if !hadtoken - @inbounds settokenvalue!(new_dict, token, x) - end - return __distinct(f, new_dict, itr, s) - end - (hadtoken, token) = gettoken!(dict, i) 
- if !hadtoken - @inbounds settokenvalue!(dict, token, x) - end - tmp = iterate(itr, s) - end - return dict -end ### Settable interface @@ -362,7 +330,7 @@ end similar(d::AbstractDictionary, [T=eltype(d)]) Construct a new `issettable` dictionary with identical `keys` as `d` and an element type of -`T`. The initial values are unitialized/undefined. +`T`. The initial values are uninitialized/undefined. """ Base.similar(d::AbstractDictionary) = similar(keys(d), eltype(d)) Base.similar(d::AbstractDictionary, ::Type{T}) where {T} = similar(keys(d), T) @@ -371,6 +339,14 @@ function Base.similar(indices::AbstractIndices{I}, ::Type{T}) where {I, T} return similar(convert(HashIndices{I}, indices), T) end +function Base.merge(d1::AbstractDictionary, d2::AbstractDictionary) + # Note: need to copy the keys + out = similar(copy(keys(d1)), eltype(d1)) + copyto!(out, d1) + merge!(out, d2) + return out +end + # fill! and fill function Base.fill!(d::AbstractDictionary, value) diff --git a/src/HashDictionary.jl b/src/HashDictionary.jl index ce86caf..3f61c4f 100644 --- a/src/HashDictionary.jl +++ b/src/HashDictionary.jl @@ -293,8 +293,8 @@ end function Base.filter!(pred, dict::HashDictionary) indices = keys(dict) - _filter!(i -> pred(@inbounds dict.values[i]), keys(indices.values), indices.values, indices.hashes, (dict.values,)) - indices.deleted = 0 + _filter!(i -> pred(@inbounds dict.values[i]), indices.values, indices.hashes, (dict.values,)) + indices.holes = 0 newsize = Base._tablesz(3*length(indices.values) >> 0x01) rehash!(indices, newsize, (dict.values,)) return dict @@ -303,8 +303,8 @@ end function Base.filter!(pred, dict::PairDictionary{<:Any, <:Any, <:HashDictionary}) d = dict.d indices = keys(d) - _filter!(i -> pred(@inbounds indices.values[i] => d.values[i]), keys(indices.values), indices.values, indices.hashes, (d.values,)) - indices.deleted = 0 + _filter!(i -> pred(@inbounds indices.values[i] => d.values[i]), indices.values, indices.hashes, (d.values,)) + indices.holes = 
0 newsize = Base._tablesz(3*length(indices.values) >> 0x01) rehash!(indices, newsize, (d.values,)) return dict diff --git a/src/insertion.jl b/src/insertion.jl index ffe9724..ca6f3af 100644 --- a/src/insertion.jl +++ b/src/insertion.jl @@ -475,4 +475,4 @@ Base.empty(d::AbstractDictionary) = empty(keys(d), keytype(d), eltype(d)) Base.empty(d::AbstractDictionary, ::Type{I}) where {I} = empty(keys(d), I) -Base.empty(::AbstractIndices, ::Type{I}, ::Type{T}) where {I, T} = HashDictionary{I, T}() +Base.empty(::AbstractDictionary, ::Type{I}, ::Type{T}) where {I, T} = HashDictionary{I, T}() diff --git a/test/HashDictionary.jl b/test/HashDictionary.jl index 5d45eba..702ebc9 100644 --- a/test/HashDictionary.jl +++ b/test/HashDictionary.jl @@ -31,7 +31,7 @@ @test_throws IndexError d[10] = 11 @test_throws IndexError delete!(d, 10) - insert!(d, 10, 11) + insert!(d, 10.0, 11.0) @test d[10] == 11 @test get(d, 10, 15) == 11 @@ -77,11 +77,11 @@ @test cmp(fill(0, copy(keys(d))), d) == -1 @test cmp(d, fill(0, copy(keys(d)))) == 1 @test_throws IndexError insert!(d, 10, 12) - @test d[10] == 11 - set!(d, 10, 12) + @test d[10.0] == 11 + set!(d, 10.0, 12.0) @test length(d) == 1 @test d[10] == 12 - d[10] = 13 + d[10.0] = 13.0 @test length(d) == 1 @test d[10] == 13 io = IOBuffer(); print(io, d); @test String(take!(io)) == "{10 │ 13}" @@ -90,10 +90,10 @@ @test isequal(d, copy(d)) @test isempty(empty(d)) - delete!(d, 10) + delete!(d, 10.0) @test isequal(d, HashDictionary{Int64, Int64}()) - @test get!(d, 10, 14) == 14 + @test get!(d, 10, 14.0) == 14 @test d[10] == 14 delete!(d, 10) @@ -105,6 +105,29 @@ @test all(in(i, keys(d)) == iseven(i) for i in 2:2:1000) @test isempty(empty!(d)) + @test get!(() -> 15, d, 10) == 15 + @test get!(() -> 16, d, 10) == 15 + + d = HashDictionary([:a, :b], [1, 2]) + d2 = HashDictionary((a=1, b=2)) + @test isequal(d, d2) + d3 = dictionary([:a=>1, :b=>2]) + @test isequal(d, d3) + d4 = dictionary(zip([:a, :b], [1, 2])) + @test isequal(d, d4) + @test !isless(d, d4) + 
@test !isless(d4, d) + @test hash(d) == hash(d4) + + @test isequal(merge(d, d), d) + @test isequal(merge(d, d2), d) + + @test isequal(merge(d, HashDictionary([:c], [3])), HashDictionary([:a, :b, :c], [1, 2, 3])) + @test isequal(merge(d, HashDictionary([:b, :c], [4, 3])), HashDictionary([:a, :b, :c], [1, 4, 3])) + + @test isequal(index(first, ["Alice", "Bob", "Charlie"]), HashDictionary(['A', 'B', 'C'], ["Alice", "Bob", "Charlie"])) + @test isequal(index(first, ["Alice", "Bob", "Charlie", "Conner"]), HashDictionary(['A', 'B', 'C'], ["Alice", "Bob", "Charlie"])) + # TODO token interface @testset "Dict tests from Base" begin @@ -182,4 +205,24 @@ @test isequal(index(first, ["Alice", "Bob", "Charlie"]), res) @test isequal(index(first, ["Alice", "Bob", "Charlie", "Conner"]), res) end + + @testset "Factories" begin + d = HashDictionary(['a','b','c'], [1,2,3]) + @test similar(d) isa HashDictionary{Char, Int} + @test similar(d, Float64) isa HashDictionary{Char, Float64} + @test sharetokens(d, similar(d)) + + @test isempty(empty(d)::HashDictionary{Char, Int}) + @test isempty(empty(d, Float64)::HashIndices{Float64}) + @test isempty(empty(d, String, Float64)::HashDictionary{String, Float64}) + + @test isequal(zeros(d)::HashDictionary{Char, Float64}, HashDictionary(['a','b','c'],[0.0,0.0,0.0])) + @test isequal(zeros(Int64, d)::HashDictionary{Char, Int64}, HashDictionary(['a','b','c'],[0,0,0])) + + @test isequal(ones(d)::HashDictionary{Char, Float64}, HashDictionary(['a','b','c'],[1.0,1.0,1.0])) + @test isequal(ones(Int64, d)::HashDictionary{Char, Int64}, HashDictionary(['a','b','c'],[1,1,1])) + + @test isequal(keys(rand(1:10, d)::HashDictionary{Char, Int}), HashIndices(['a','b','c'])) + @test isequal(keys(randn(d)::HashDictionary{Char, Float64}), HashIndices(['a','b','c'])) + end end \ No newline at end of file diff --git a/test/HashIndices.jl b/test/HashIndices.jl index c99bdc2..de28fa5 100644 --- a/test/HashIndices.jl +++ b/test/HashIndices.jl @@ -16,7 +16,7 @@ io = 
IOBuffer(); show(io, MIME"text/plain"(), h); @test String(take!(io)) == "0-element HashIndices{Int64}" @test_throws IndexError delete!(h, 10) - insert!(h, 10) + insert!(h, 10.0) @test length(h) == 1 @test keys(h) === h @@ -25,7 +25,7 @@ @test h == copy(h) @test !isempty(h) @test isequal(copy(h), h) - @test h[10] == 10 + @test h[10.0] == 10 @test_throws IndexError insert!(h, 10) @test length(set!(h, 10)) == 1 @test_throws IndexError insert!(h, 10) @@ -35,7 +35,7 @@ @test isequal(h, copy(h)) @test isempty(empty(h)) - delete!(h, 10) + delete!(h, 10.0) @test isequal(h, HashIndices{Int64}()) @@ -46,6 +46,39 @@ @test all(in(i, h) == iseven(i) for i in 2:1000) @test isempty(empty!(h)) + # set + @test length(set!(h, 1)) == 1 + @test length(set!(h, 2, 2)) == 2 + @test length(set!(h, 3.0, 3.0)) == 3 + @test_throws ErrorException set!(h, 4, 5) + + @testset "Comparison" begin + i1 = HashIndices([1,2,3]) + i2 = HashIndices([1,2]) + i3 = HashIndices([1,2,3,4]) + i4 = HashIndices([3,2,1]) + + @test isequal(i1, i1) + @test hash(i1) == hash(copy(i1)) + + @test isless(i2, i1) + @test !isless(i1, i2) + @test !isequal(i1, i2) + @test hash(i1) != hash(i2) + + @test isless(i2, i1) + @test !isless(i1, i2) + @test !isequal(i1, i2) + + @test isless(i1, i3) + @test !isless(i3, i1) + @test !isequal(i1, i3) + + @test isless(i1, i4) + @test !isless(i4, i1) + @test !isequal(i1, i4) + end + @testset "Adapated from Dict tests from Base" begin h = HashIndices{Int}() N = 10000 @@ -101,5 +134,32 @@ @test isequal(distinct([1,2,3,1]), res) @test isequal(distinct([1,2,3]), res) end + + @testset "set logic" begin + i1 = HashIndices([1,2]) + i2 = HashIndices([2,3]) + i3 = HashIndices([3,4]) + i4 = HashIndices([2]) + + @test !issetequal(i1, i2) + @test !(i1 ⊆ i2) + @test i4 ⊆ i1 + + @test !disjoint(i1, i2) + @test disjoint(i1, i3) + + @test isequal(union(i1, i2), HashIndices([1,2,3])) + @test isequal(union(i2, i1), HashIndices([2,3,1])) + @test isequal(union(i1, i3), HashIndices([1,2,3,4])) + + @test 
isequal(intersect(i1, i2), HashIndices([2])) + @test isequal(intersect(i1, i3), HashIndices([])) + + @test isequal(setdiff(i1, i2), HashIndices([1])) + @test isequal(setdiff(i1, i3), HashIndices([1, 2])) + + @test isequal(symdiff(i1, i2), HashIndices([1, 3])) + @test isequal(symdiff(i1, i3), HashIndices([1, 2, 3, 4])) + end # TODO: token interface end \ No newline at end of file diff --git a/test/filter.jl b/test/filter.jl index ae70b40..e4b899d 100644 --- a/test/filter.jl +++ b/test/filter.jl @@ -1,17 +1,23 @@ @testset "filter" begin i = HashIndices([1,2,3,4,5]) - @test issetequal(filter(iseven, i)::HashIndices, [2, 4]) - @test issetequal(filter(isodd, i)::HashIndices, [1, 3, 5]) + @test isequal(filter(iseven, i)::HashIndices, HashIndices([2, 4])) + @test isequal(filter(isodd, i)::HashIndices, HashIndices([1, 3, 5])) - @test issetequal(filterview(iseven, i)::Dictionaries.FilteredIndices, [2, 4]) - @test issetequal(filterview(isodd, i)::Dictionaries.FilteredIndices, [1, 3, 5]) + @test isequal(filterview(iseven, i)::Dictionaries.FilteredIndices, HashIndices([2, 4])) + @test isequal(filterview(isodd, i)::Dictionaries.FilteredIndices, HashIndices([1, 3, 5])) + + filter!(iseven, i) + @test isequal(i, HashIndices([2, 4])) d = HashDictionary([1,2,3,4,5], [1,3,2,4,5]) - @test issetequal(pairs(filter(iseven, d)::HashDictionary), [3=>2, 4=>4]) - @test issetequal(pairs(filter(isodd, d)::HashDictionary), [1=>1, 2=>3, 5=>5]) + @test isequal(filter(iseven, d)::HashDictionary, dictionary([3=>2, 4=>4])) + @test isequal(filter(isodd, d)::HashDictionary, dictionary([1=>1, 2=>3, 5=>5])) + + @test isequal(filterview(iseven, d)::Dictionaries.FilteredDictionary, dictionary([3=>2, 4=>4])) + @test isequal(filterview(isodd, d)::Dictionaries.FilteredDictionary, dictionary([1=>1, 2=>3, 5=>5])) - @test issetequal(pairs(filterview(iseven, d)::Dictionaries.FilteredDictionary), [3=>2, 4=>4]) - @test issetequal(pairs(filterview(isodd, d)::Dictionaries.FilteredDictionary), [1=>1, 2=>3, 5=>5]) + 
filter!(iseven, d) + @test isequal(d, HashDictionary([3,4],[2,4])) end \ No newline at end of file From bdc07ed0ec2afde3ea0e22b7f8d0fa8073e7a0bc Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 15:22:57 +1000 Subject: [PATCH 15/20] Forbid IteratorSize of HasShape for Indices Fixes #20 --- src/AbstractIndices.jl | 2 +- src/Indices.jl | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/AbstractIndices.jl b/src/AbstractIndices.jl index 08db8e6..27f5236 100644 --- a/src/AbstractIndices.jl +++ b/src/AbstractIndices.jl @@ -63,7 +63,7 @@ function Base.in(i, indices::AbstractIndices{I}) where I end # Match the default setting from Base - the majority of containers will know their size -Base.IteratorSize(indices::AbstractIndices) = Base.HasLength() +Base.IteratorSize(::AbstractIndices) = Base.HasLength() function Base.length(indices::AbstractIndices) if Base.IteratorSize(indices) isa Base.SizeUnknown diff --git a/src/Indices.jl b/src/Indices.jl index 6ae91ae..0025b5f 100644 --- a/src/Indices.jl +++ b/src/Indices.jl @@ -26,7 +26,13 @@ Indices{I}(iter) where {I} = Indices{I, typeof(iter)}(iter) # There is a corner end Base.in(i::I, inds::Indices{I}) where {I} = in(i, inds.inds) -Base.IteratorSize(i::Indices) = Base.IteratorSize(i.inds) +function Base.IteratorSize(i::Indices) + out = Base.IteratorSize(i.inds) + if out isa Base.HasShape + return Base.HasLength() + end + return out +end Base.length(i::Indices) = length(i.inds) # Specialize for `Vector` elements. 
Satisfy the tokenization and insertion interface From a9a98667e22660ae2bd12ac8f8f5b249d5af7518 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 15:31:32 +1000 Subject: [PATCH 16/20] Fix merge / mergewith Closes #18 --- src/AbstractDictionary.jl | 10 +++++++++ src/insertion.jl | 43 +++++++++++---------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/AbstractDictionary.jl b/src/AbstractDictionary.jl index 8617c8e..ab254cf 100644 --- a/src/AbstractDictionary.jl +++ b/src/AbstractDictionary.jl @@ -347,6 +347,16 @@ function Base.merge(d1::AbstractDictionary, d2::AbstractDictionary) return out end +if isdefined(Base, :mergewith) # Julia 1.5+ + function Base.mergewith(combiner, d1::AbstractDictionary, d2::AbstractDictionary) + # Note: need to copy the keys + out = similar(copy(keys(d1)), eltype(d1)) + copyto!(out, d1) + mergewith!(combiner, out, d2) + return out + end +end + # fill! and fill function Base.fill!(d::AbstractDictionary, value) diff --git a/src/insertion.jl b/src/insertion.jl index ca6f3af..ee7752e 100644 --- a/src/insertion.jl +++ b/src/insertion.jl @@ -323,44 +323,25 @@ function unset!(d::AbstractDictionary{I}, i::I) where {I} end ### Non-scalar insertion/deletion 
- -function Base.merge!(combiner::Callable, d::AbstractDictionary, d2::AbstractDictionary) - for (i, v) in pairs(d2) - (hasindex, token) = gettoken!(d, i) - if hasindex - @inbounds settokenvalue!(d, token, combiner(gettokenvalue(d, token), v)) - else - @inbounds settokenvalue!(d, token, v) - end - end - return d -end - -# TODO `last` is incorrect, it should be `latter(x,y) = y` -function Base.merge!(::typeof(last), d::AbstractDictionary, d2::AbstractDictionary) +function Base.merge!(d::AbstractDictionary, d2::AbstractDictionary) for (i, v) in pairs(d2) set!(d, i, v) end return d end -# TODO `first` is incorrect, it should be `former(x,y) = y` -function Base.merge!(::typeof(first), d::AbstractDictionary, d2::AbstractDictionary) - for (i, v) in pairs(d2) - get!(d, i, v) +if isdefined(Base, :mergewith) # Julia 1.5+ + function Base.mergewith!(combiner::Callable, d::AbstractDictionary, d2::AbstractDictionary) + for (i, v) in pairs(d2) + (hasindex, token) = gettoken!(d, i) + if hasindex + @inbounds settokenvalue!(d, token, combiner(gettokenvalue(d, token), v)) + else + @inbounds settokenvalue!(d, token, v) + end + end + return d end - return d -end - -function Base.merge!(combiner::Callable, d::AbstractDictionary, d2::AbstractDictionary, ds::AbstractDictionary...) - merge!(combiner, merge!(combiner, d, d2), ds...) 
-end - -function Base.merge!(combiner::Callable, d::AbstractIndices, d2::AbstractIndices) - # Hopefully no-one provides a bad combiner - union!(d, d2) end # TODO some kind of exclusive merge (throw on key clash like `insert!`) From f891d95238e3cf8bfcb0cd1625d3655846f68cc3 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 16:06:22 +1000 Subject: [PATCH 17/20] Finalize == semantics Fixes #11 --- src/AbstractDictionary.jl | 76 ++++++++++++++++++++++++++++++--------- src/AbstractIndices.jl | 24 ++++++++++--- src/Dictionaries.jl | 2 +- test/HashDictionary.jl | 17 +++++++++ test/HashIndices.jl | 6 ++++ 5 files changed, 103 insertions(+), 22 deletions(-) diff --git a/src/AbstractDictionary.jl b/src/AbstractDictionary.jl index ab254cf..9117d7d 100644 --- a/src/AbstractDictionary.jl +++ b/src/AbstractDictionary.jl @@ -94,41 +94,85 @@ function Base.isequal(d1::AbstractDictionary, d2::AbstractDictionary) return true end -# `==` doesn't care about the iteration order. Keys must be `isequal` and values `==` +# The indices must be isequal and the values ==, same ordering function Base.:(==)(d1::AbstractDictionary, d2::AbstractDictionary) - if d1 === d2 - return true - end + out = true if sharetokens(d1, d2) @inbounds for t in tokens(d1) - if gettokenvalue(d1, t) != gettokenvalue(d2, t) + out &= gettokenvalue(d1, t) == gettokenvalue(d2, t) + if out === false return false end end - return true + return out end if length(d1) != length(d2) return false end - if istokenizable(d2) - for (i,v) in pairs(d1) - (hastoken, token) = gettoken(d2, i) - if !hastoken || v != gettokenvalue(d2, token) - return false - end + for ((i1,x1), (i2,x2)) in zip(pairs(d1), pairs(d2)) + if !isequal(i1, i2) + return false end - else - for (i,v) in pairs(d1) - if !haskey(d2, i) || v != d2[i] + out &= x1 == x2 # make sure it works for `missing` + if out === false + return false + end + end + + return out +end + +""" + isdictequal(d1, d2) + +Determine if two dictionaries are equivalent. 
Dictionaries `d1` and `d2` are equivalent if +`issetequal(keys(d1), keys(d2))` and for each key `i`, `d1[i] == d2[i]`. + +Example + +```julia +julia> isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['b','a'],[2,1])) +true + +julia> isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['b','a'],[2,3])) +false + +julia> isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['b','a'],[2,missing])) +missing +``` +""" +function isdictequal(d1::AbstractDictionary, d2::AbstractDictionary) + out = true + + if sharetokens(d1, d2) + @inbounds for t in tokens(d1) + out &= gettokenvalue(d1, t) == gettokenvalue(d2, t) + if out === false return false end end + return out end - return true + if length(d1) != length(d2) + return false + end + + for (i,x1) in pairs(d1) + (hastoken, t) = gettoken(d2, i) + if !hastoken + return false + end + out &= x1 == gettokenvalue(d2, t) # make sure it works for `missing` + if out === false + return false + end + end + + return out end # Lexical ordering based on iteration (of pairs - lesser key takes priority over lesser value, as implmeneted in `cmp(::Pair)`) diff --git a/src/AbstractIndices.jl b/src/AbstractIndices.jl index 27f5236..eba4cce 100644 --- a/src/AbstractIndices.jl +++ b/src/AbstractIndices.jl @@ -159,23 +159,37 @@ function Base.isequal(i1::AbstractIndices, i2::AbstractIndices) return true end -# Use `issetequal` semantics +# The indices must be isequal and the values ==, same ordering function Base.:(==)(i1::AbstractIndices, i2::AbstractIndices) + out = true + if sharetokens(i1, i2) - return true + # TODO - can we get rid of this loop for reflexive == element types? 
+ @inbounds for t in tokens(i1) + # make sure it works for `missing` + out &= gettokenvalue(i1, t) == gettokenvalue(i2, t) + if out === false + return false + end + end + return out end if length(i1) != length(i2) return false end - for i in i1 - if !(i in i2) + for (j1, j2) in zip(i1, i2) + if !isequal(j1, j2) + return false + end + out &= j1 == j2 # make sure it works for `missing` + if out === false return false end end - return true + return out end # Lexical ordering based on iteration diff --git a/src/Dictionaries.jl b/src/Dictionaries.jl index 92c978c..bb8117a 100644 --- a/src/Dictionaries.jl +++ b/src/Dictionaries.jl @@ -8,7 +8,7 @@ export getindices, setindices! export AbstractDictionary, AbstractIndices, IndexError, Indices, HashIndices, HashDictionary, Dictionary, MappedDictionary, DictionaryView, FilteredDictionary, FilteredIndices, BroadcastedDictionary -export dictionary, index, distinct, disjoint, filterview +export dictionary, index, distinct, disjoint, isdictequal, filterview export issettable, isinsertable, set!, unset! 
export istokenizable, tokentype, tokens, tokenized, gettoken, gettokenvalue, istokenassigned, settokenvalue!, gettoken!, deletetoken!, sharetokens diff --git a/test/HashDictionary.jl b/test/HashDictionary.jl index 702ebc9..e384c4d 100644 --- a/test/HashDictionary.jl +++ b/test/HashDictionary.jl @@ -109,8 +109,12 @@ @test get!(() -> 16, d, 10) == 15 d = HashDictionary([:a, :b], [1, 2]) + @test isequal(d, d) + @test d == d + @test !isless(d, d) d2 = HashDictionary((a=1, b=2)) @test isequal(d, d2) + @test d == d2 d3 = dictionary([:a=>1, :b=>2]) @test isequal(d, d3) d4 = dictionary(zip([:a, :b], [1, 2])) @@ -119,6 +123,19 @@ @test !isless(d4, d) @test hash(d) == hash(d4) + @test isdictequal(d, copy(d)) + @test isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['b','a'],[2,1])) + @test !isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['b','c'],[2,1])) + @test !isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['a','b','c'],[1,2,3])) + @test !isdictequal(HashDictionary(['a','b'],[1,2]), HashDictionary(['b','a'],[2,3])) + + d5 = HashDictionary(['a','b'],[1,missing]) + @test isdictequal(d5, d5) === missing + @test (d5 == d5) === missing + d6 = HashDictionary(['a','b'],[1,missing]) + @test isdictequal(d5, d6) === missing + @test (d5 == d6) === missing + @test isequal(merge(d, d), d) @test isequal(merge(d, d2), d) diff --git a/test/HashIndices.jl b/test/HashIndices.jl index de28fa5..ad219da 100644 --- a/test/HashIndices.jl +++ b/test/HashIndices.jl @@ -77,6 +77,12 @@ @test isless(i1, i4) @test !isless(i4, i1) @test !isequal(i1, i4) + + i5 = HashIndices([1,2,missing]) + @test isequal(i5, i5) + @test !isless(i5, i5) + @test (i5 == i5) === missing + @test (i5 == HashIndices([1,2,missing])) === missing end @testset "Adapated from Dict tests from Base" begin From 456b90ad41f6ee0607923a04a6b17618678cdc9e Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 16:34:23 +1000 Subject: [PATCH 18/20] Auto-convert settokenvalue! 
--- src/tokens.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tokens.jl b/src/tokens.jl index 1b6686d..d1a5f3c 100644 --- a/src/tokens.jl +++ b/src/tokens.jl @@ -198,7 +198,11 @@ end return isassigned(d, token) end -@propagate_inbounds function settokenvalue!(d::AbstractDictionary, i, value) +@propagate_inbounds function settokenvalue!(d::AbstractDictionary{<:Any,T}, t, value::T) where {T} + return settokenvalue!(d, t, convert(T, value)) +end + +@propagate_inbounds function settokenvalue!(d::AbstractDictionary{<:Any,T}, i, value::T) where {T} if !issettable(d) error("Cannot mutate values of dictionary: $(typeof(d))") end From ba4bd052123076d6cfb763a7fb784c363892b728 Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 16:35:08 +1000 Subject: [PATCH 19/20] Small cleanup --- src/tokens.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokens.jl b/src/tokens.jl index d1a5f3c..50c3beb 100644 --- a/src/tokens.jl +++ b/src/tokens.jl @@ -226,6 +226,6 @@ performed quickly (e.g. O(1) rather than O(N)). Return `false` otherwise. Note: the test may not be precise, this defaults to `tokens(dict1) === tokens(dict2)`. """ -sharetokens(i1::AbstractIndices, i2::AbstractIndices) = istokenizable(i1) && istokenizable(i2) && i1 === i2 +sharetokens(i1::AbstractIndices, i2::AbstractIndices) = istokenizable(i1) && i1 === i2 sharetokens(d1, d2) = sharetokens(keys(d1), keys(d2)) sharetokens(d1, d2, ds...) = sharetokens(d1, d2) && sharetokens(d1, ds...) From 0d34363b6220102271e7a636fd7032e4441462ee Mon Sep 17 00:00:00 2001 From: Andy Ferris Date: Thu, 11 Jun 2020 16:44:41 +1000 Subject: [PATCH 20/20] Fix settokenvalue! 
--- src/tokens.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokens.jl b/src/tokens.jl index 50c3beb..e36c60c 100644 --- a/src/tokens.jl +++ b/src/tokens.jl @@ -198,7 +198,7 @@ end return isassigned(d, token) end -@propagate_inbounds function settokenvalue!(d::AbstractDictionary{<:Any,T}, t, value::T) where {T} +@propagate_inbounds function settokenvalue!(d::AbstractDictionary{<:Any,T}, t, value) where {T} return settokenvalue!(d, t, convert(T, value)) end