From ca2824522415366665656e0db9d11bfe2219d402 Mon Sep 17 00:00:00 2001 From: Johannes Terblanche <6612981+Affie@users.noreply.github.com> Date: Wed, 13 Nov 2024 17:41:50 +0200 Subject: [PATCH] Implement BagOfWords --- src/bagofwords/BagOfWords.jl | 345 +++++++++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 src/bagofwords/BagOfWords.jl diff --git a/src/bagofwords/BagOfWords.jl b/src/bagofwords/BagOfWords.jl new file mode 100644 index 000000000..e6d5ca8e6 --- /dev/null +++ b/src/bagofwords/BagOfWords.jl @@ -0,0 +1,345 @@ +using MetaGraphsNext +using Graphs +using Clustering +using StaticArrays +using TensorCast +using Distances + +# References: +# [SZ 2003]: Sivic and Zisserman, 2003, October. Video Google: A text retrieval approach to object matching in videos. In Proceedings ninth IEEE international conference on computer vision (pp. 1470-1477). IEEE. +# [Wang 2011] Wang, X., Yang, M., Cour, T., Zhu, S., Yu, K., & Han, T. X. (2011, November). Contextual weighting for vocabulary tree based image retrieval. In 2011 International conference on computer vision (pp. 209-216). IEEE. +# [Gálvez-López, 2012] Gálvez-López, D., & Tardos, J. D. (2012). Bags of binary words for fast place recognition in image sequences. IEEE Transactions on robotics, 28(5), 1188-1197. +# [Nister, 2006] Nister, David, and Henrik Stewenius. "Scalable recognition with a vocabulary tree." 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR'06). Vol. 2. Ieee, 2006. 

## =============================================================================
## Vocabulary
## =============================================================================
"""
    Wordnode

Vertex data stored for every node of the vocabulary tree.

# Fields
- `leaveId`: 1-based id of the leaf (visual word); `0` for the root and interior nodes.
- `word`: cluster centre of the node (128-element `Float32` descriptor).
- `count`: number of training descriptors assigned to the node's cluster.
- `level`: depth of the node in the tree; the root is at level `0`.
- `weight`: inverse-document-frequency weight, set for leaves by
  `setVocabularyWeigths!`; `0.0` until then and for non-leaf nodes.
"""
const Wordnode = @NamedTuple{
    leaveId::Int,
    word::SVector{128, Float32},
    count::Int,
    level::Int,
    weight::Float64,
}

"""
    Word

Result of a vocabulary lookup: the id of the leaf that was reached and its weight.
"""
const Word = @NamedTuple{leaveId::Int, weight::Float64}

"""
    zero(::Type{Wordnode})

Return a `Wordnode` with every field set to zero.
"""
function Base.zero(::Type{Wordnode})
    return (
        leaveId = 0,
        word = zeros(SVector{128, Float32}),
        count = 0,
        level = 0,
        weight = 0.0,
    )
end

"""
    add_voc_children!(tree, descriptors, parent=1, level=0; progress)

Cluster the columns of `descriptors` with k-means into `tree.graph_data[:n_children]`
clusters, add one child vertex of `parent` per cluster, and recurse into every cluster
that is not a leaf.

A cluster becomes a leaf when it holds fewer descriptors than `n_children` or when the
new level reaches `tree.graph_data[:n_levels]`. Leaves are numbered consecutively through
`tree.graph_data[:n_leaves]`, which is incremented in place.

# Arguments
- `tree`: vocabulary tree; mutated in place.
- `descriptors::Matrix`: one descriptor per column.
- `parent`: vertex that receives the new children.
- `level`: level of `parent`.

# Keywords
- `progress`: progress meter, advanced once per created node (required).

# Returns
- `nothing`
"""
function add_voc_children!(tree, descriptors::Matrix, parent = 1, level = 0; progress)
    n_children = tree.graph_data[:n_children]
    clusters = Clustering.kmeans(descriptors, n_children)
    level += 1
    for (i, (centre, cnt)) in enumerate(zip(eachcol(clusters.centers), clusters.counts))
        next!(progress)
        idx = nv(tree) + 1

        # only create children if the cluster holds enough descriptors for another
        # k-means split and the last level has not been reached
        isLeaveNode = cnt < n_children || level >= tree.graph_data[:n_levels]

        leaveId = 0
        if isLeaveNode
            tree.graph_data[:n_leaves] += 1
            leaveId = tree.graph_data[:n_leaves]
        end

        tree[idx] = (
            leaveId = leaveId, word = centre, count = cnt, level = level, weight = 0.0,
        )
        add_edge!(tree, parent, idx)

        if !isLeaveNode
            next_idxs = findall(==(i), clusters.assignments)
            add_voc_children!(tree, descriptors[:, next_idxs], idx, level; progress)
        end
    end

    return nothing
end

"""
    countOccurance(voctree, image_descriptors)

Count, for every leaf of `voctree`, in how many entries of `image_descriptors` that leaf
occurs at least once.

# Arguments
- `voctree`: vocabulary tree.
- `image_descriptors`: collection with one set of descriptors per image; each entry is
  passed to `getWords`.

# Returns
- Vector of length `voctree.graph_data[:n_leaves]`; entry `i` is the number of distinct
  images in which leaf `i` was found.
"""
function countOccurance(voctree, image_descriptors)
    # one set of image indices per leaf, so repeated words in an image count once
    images_per_leaf = [Set{Int}() for _ in 1:voctree.graph_data[:n_leaves]]
    # FIXME make thread safe if needed for speed, got undef # Threads.@threads
    for (idx, image_desc) in enumerate(image_descriptors)
        for img_word in getWords(voctree, image_desc)
            push!(images_per_leaf[img_word.leaveId], idx)
        end
    end
    return length.(images_per_leaf)
end

"""
    setVocabularyWeigths!(voctree, all_desc)

Set the inverse document frequency (idf) weight `log(N / n_i)` of every leaf of `voctree`
[SZ 2003, Sec 4], where `N = length(all_desc)` and `n_i` is the number of entries of
`all_desc` in which leaf `i` occurs (see `countOccurance`).

A leaf that is not reached by any descriptor (`n_i == 0`) keeps weight `0.0` rather than
getting the infinite weight `log(N / 0)`, which would turn every score involving that
word into `NaN`.

# Returns
- `nothing`; `voctree` is modified in place.
"""
function setVocabularyWeigths!(voctree, all_desc)
    N = length(all_desc) # how many images used to train the vocabulary
    occs = countOccurance(voctree, all_desc)
    _weights = map(occs) do n_i
        n_i > 0 ? log(N / n_i) : 0.0 # [SZ 2003, Sec 4]
    end
    @showprogress for l in labels(voctree)
        node = voctree[l]
        if node.leaveId > 0
            voctree[l] = merge(node, (weight = _weights[node.leaveId],))
        end
    end
    return nothing
end

"""
    buildVocabulary(image_descriptors::Vector, n_children=9, n_levels=5)

Build a vocabulary tree from a set of descriptors.
`image_descriptors` is a vector of vectors of descriptors, each vector of descriptors is
from one image.

The descriptors of all images are pooled and clustered hierarchically
(`add_voc_children!`); afterwards the idf weights of the leaves are computed from the
per-image descriptors (`setVocabularyWeigths!`).

# Arguments
- `image_descriptors`: one vector of descriptors per image.
- `n_children`: number of k-means clusters per node.
- `n_levels`: maximum depth of the tree.

# Returns
- The vocabulary tree (`MetaGraph`) with `Wordnode` vertex data and the graph data keys
  `:n_children`, `:n_levels` and `:n_leaves`.

# Throws
- `ArgumentError` if `image_descriptors` is empty.
"""
function buildVocabulary(image_descriptors::Vector, n_children = 9, n_levels = 5)
    isempty(image_descriptors) &&
        throw(ArgumentError("image_descriptors must not be empty"))

    descriptors = reduce(vcat, image_descriptors)

    # tree and root vertex
    voctree = MetaGraph(
        DiGraph(),
        Int,
        Wordnode,
        Nothing,
        Dict(
            :n_children => n_children,
            :n_levels => n_levels,
            :n_leaves => 0,
        ),
    )

    voctree[1] = (
        leaveId = 0,
        word = mean(descriptors),
        count = length(descriptors),
        level = 0,
        weight = 0.0,
    )
    @cast desc_mat[j, i] := descriptors[i][j]

    # node count of a full tree; used only to size the progress meter
    n_nodes = sum(n_children^l for l in 0:n_levels)
    progress = Progress(n_nodes; dt = 1.0)

    add_voc_children!(voctree, desc_mat; progress)

    setVocabularyWeigths!(voctree, image_descriptors)

    finish!(progress)
    return voctree
end

## =============================================================================
## Vocabulary Lookup
## =============================================================================
"""
    getWord(tree, lookmeup, nodeIdx=1, level=0, MAX_LEVEL=tree.graph_data[:n_levels]; dist=Distances.Euclidean())

Traverse the vocabulary tree structure to find the closest word to `lookmeup` using a
specified distance metric. At every node the child whose word is closest to `lookmeup` is
followed, until a node without children is reached or `MAX_LEVEL` is exceeded.

# Arguments
- `tree`: The vocabulary tree structure containing nodes with words.
- `lookmeup`: The features to look up in the tree.
- `nodeIdx`: The node index to start from (default is 1).
- `level`: The level of the start node (default is 0).
- `MAX_LEVEL`: The maximum level to traverse in the tree (default is
  `tree.graph_data[:n_levels]`).
- `dist`: The distance metric to use for comparison (default is `Distances.Euclidean()`).

# Returns
- A named tuple containing:
    - `leaveId`: The ID of the node at which the traversal stopped.
    - `weight`: The weight associated with that node.
"""
function getWord(
    tree,
    lookmeup,
    nodeIdx = 1,
    level = 0,
    MAX_LEVEL = tree.graph_data[:n_levels];
    dist = Distances.Euclidean(),
)
    node = nodeIdx
    while true
        level += 1
        children = outneighbors(tree, node)
        # @debug level node children
        (level <= MAX_LEVEL && !isempty(children)) || break
        dists = map(children) do i
            dist(tree[i].word, lookmeup)
        end
        node = children[argmin(dists)]
    end
    (; leaveId, weight) = tree[node]
    return (leaveId = leaveId, weight = weight)
end

"""
    getWords(tree, lookupvec, nodeIdx=1, level=0, MAX_LEVEL=tree.graph_data[:n_levels]; dist=Distances.Euclidean())

Given a vocabulary tree and a vector of lookup features, this function computes the
corresponding words for each value in the lookup vector. The lookups run in a
`Threads.@threads` loop.

# Arguments
- `tree`: The tree structure containing the vocabulary.
- `lookupvec`: A vector of values for which words need to be found.
- `nodeIdx`, `level`, `MAX_LEVEL`: Forwarded to `getWord`.
- `dist`: The distance metric to use (default is `Distances.Euclidean()`).

# Returns
- `words`: A `Vector{Word}` with the word corresponding to each value in the lookup
  vector.
"""
function getWords(
    tree,
    lookupvec,
    nodeIdx = 1,
    level = 0,
    MAX_LEVEL = tree.graph_data[:n_levels];
    dist = Distances.Euclidean(),
)
    words = Vector{Word}(undef, length(lookupvec))
    Threads.@threads for idx in eachindex(lookupvec)
        words[idx] = getWord(tree, lookupvec[idx], nodeIdx, level, MAX_LEVEL; dist)
    end
    return words
end

"""
    getWords(tree::MetaGraph, lookupsift::Vector{SIFTDescriptor}, args...; kwargs...)

Look up the words for SIFT descriptors by passing the `value` property of every
descriptor to the generic `getWords` method. Extra positional and keyword arguments
(e.g. `dist`) are forwarded unchanged.
"""
function getWords(tree::MetaGraph, lookupsift::Vector{SIFTDescriptor}, args...; kwargs...)
    return getWords(tree, getproperty.(lookupsift, :value), args...; kwargs...)
end

"""
    getBowvector(voctree, image_words)

Compute the Bag of Words (BoW) vector for a given image.

# Arguments
- `voctree`: The vocabulary tree used to generate the words.
- `image_words`: The words extracted from the image; every element destructures into a
  leaf id and a weight (e.g. the output of `getWords`).

# Returns
- A sparse vector of length `voctree[][:n_leaves]` representing the Term
  Frequency-Inverse Document Frequency (TF-IDF) of the image words. It is all zeros when
  `image_words` is empty.

"""
function getBowvector(voctree, image_words)
    n_leaves = voctree[][:n_leaves]
    tfvec = spzeros(n_leaves)
    n_d = length(image_words) # total number of words in image d
    # an image without words has an all-zero BoW vector; avoid dividing by zero
    n_d == 0 && return tfvec

    idfvec = spzeros(n_leaves) # IDF weights
    # TODO can improve, bit inefficient, but easy
    for (i, w) in image_words
        # calculate n_id number of occurrences of word i in image d
        tfvec[i] += 1
        idfvec[i] = w
    end
    # n_id/n_d*log(N/n_i), # [SZ 2003, Sec 4]
    return (tfvec / n_d) .* idfvec # TF_IDF bowvec
end

# Shared implementation of the normalised p-norm similarity scores.
function _score_pnorm(v1, v2, p)
    return 1 - 0.5 * norm(v1 / norm(v1, p) .- v2 / norm(v2, p), p)
end

"""
    score_L1(v1, v2)

Compute the L1 score between two vectors `v1` and `v2`.

The L1 score is calculated as
`1 - 0.5 * norm(v1 / norm(v1, 1) .- v2 / norm(v2, 1), 1)`, which measures the similarity
between the two vectors. A vector whose norm is zero leads to a non-finite result.

# Arguments
- `v1::AbstractVector`: The first input vector.
- `v2::AbstractVector`: The second input vector.

# Returns
- `Float64`: The L1 score between the two input vectors.

# References
- [Nister, 2006]
- [Gálvez-López, 2012]
"""
function score_L1(v1, v2)
    # [Gálvez-López, 2012] eq2 #TODO can optimize if needed with [Nister, 2006] eq 5
    return _score_pnorm(v1, v2, 1)
end

"""
    score_L2(v1, v2)

Compute the L2 score `1 - 0.5 * norm(v1 / norm(v1) .- v2 / norm(v2))` between two
vectors `v1` and `v2`. A vector whose norm is zero leads to a non-finite result.
"""
function score_L2(v1, v2)
    # [Nister, 2006] eq 6
    # return 2.0 - sqrt(1.0 - dot(v1, v2))
    return _score_pnorm(v1, v2, 2)
end

"""
    score_norm(p=2)

Return a scoring function `(v1, v2) -> score` that uses the `p`-norm both to normalise
the vectors and to measure their difference, i.e.
`1 - 0.5 * norm(v1 / norm(v1, p) .- v2 / norm(v2, p), p)`.
"""
function score_norm(p = 2)
    return (v1, v2) -> _score_pnorm(v1, v2, p)
end

## =============================================================================
## Image DB
## =============================================================================
#
"""
    createImageInverseIndex(voctree, image_descriptors)

Create an image inverse index using Term Frequency-Inverse Document Frequency (TF-IDF)
weighting.

# Arguments
- `voctree`: A vocabulary tree structure containing the graph data and other relevant
  information.
- `image_descriptors`: A collection of image descriptors, where each descriptor is a pair
  consisting of an image identifier and its corresponding feature descriptors.

# Returns
- `image_index`: A sparse matrix where each column corresponds to an image and each row
  corresponds to a word in the vocabulary. The values are the TF-IDF weights.
- `image_ids`: A vector of image identifiers corresponding to the columns of the
  `image_index`.
"""
function createImageInverseIndex(voctree, image_descriptors)
    # creation is a bit slower this way, but should be easier to create faster lookups
    image_index = spzeros(voctree.graph_data[:n_leaves], length(image_descriptors))
    @showprogress for (l, image_desc) in enumerate(image_descriptors)
        img_words = getWords(voctree, image_desc.second)
        bow_vec = getBowvector(voctree, img_words)
        for (i, bv) in zip(findnz(bow_vec)...)
            image_index[i, l] = bv
        end
    end
    return image_index, first.(image_descriptors)
end

"""
    createImageInverseIndex_idf(voctree, image_descriptors)

Create an inverse index that lists, for every word of the vocabulary, the images in which
it occurs, together with the forward index from image to words.

# Arguments
- `voctree`: The vocabulary tree.
- `image_descriptors`: A collection of pairs `image_id => descriptors`; the `value`
  property of every descriptor is looked up in the vocabulary.

# Returns
- `image_index`: A vector with one entry per leaf; entry `i` holds an
  `(image_id, weight)` tuple for every occurrence of word `i` (an image is listed once
  per occurrence).
- `word_index`: A `Dict` mapping each image identifier to the leaf ids of its words.
"""
function createImageInverseIndex_idf(voctree, image_descriptors)
    image_index = [Tuple{Symbol, Float64}[] for _ in 1:voctree.graph_data[:n_leaves]]
    word_index = Dict{Symbol, Vector{Int64}}()
    for image_desc in image_descriptors
        img_words = getWords(voctree, getproperty.(image_desc.second, :value))
        for img_word in img_words
            push!(image_index[img_word.leaveId], (image_desc.first, img_word.weight))
            # the do-block only allocates a new vector when the image is first seen
            leave_ids = get!(word_index, image_desc.first) do
                Int[]
            end
            push!(leave_ids, img_word.leaveId)
        end
    end
    return image_index, word_index
end


## =============================================================================
## Lookup
## =============================================================================

"""
    findkImages_BF_binary(image_inverse_index, image_word; k=10)

Finds the top `k` images that match the given `image_word` using a brute-force search
using binary weights: every entry of the inverse index that is hit by a query word adds
one vote to the corresponding image.

# Arguments
- `image_inverse_index`: Per-word list of `(image_id, weight)` entries, as returned by
  `createImageInverseIndex_idf`.
- `image_word`: The words of the query image; the first element of each word is its
  leaf id.
- `k`: The maximum number of matches to return. Defaults to 10.

# Returns
- `Array`: An array of pairs where each pair consists of an image identifier and its
  corresponding score, sorted by score in descending order. Fewer than `k` pairs are
  returned when fewer images share a word with the query.
"""
function findkImages_BF_binary(image_inverse_index, image_word; k = 10)
    worddict = OrderedDict{Symbol, Float64}()
    # Threads.@threads
    for words in image_word
        for p in image_inverse_index[words[1]]
            worddict[p[1]] = get(worddict, p[1], 0.0) + 1
        end
    end
    sort!(worddict; byvalue = true, rev = true)
    # `first` does not throw when there are fewer than k candidates
    return first(collect(pairs(worddict)), k)
end

"""
    findkImages_BF(image_index, image_labels, bowvec, score=score_L1; k=10)

Finds the top `k` images that best match the given bag-of-words vector (`bowvec`) using a
brute-force approach.

# Arguments
- `image_index::Matrix{Float64}`: A matrix where each column represents the bag-of-words
  vector of an image in the DB.
- `image_labels::Vector{Symbol}`: A vector containing the labels of the images in the DB.
- `bowvec::Vector{Float64}`: The bag-of-words vector of the query image.
- `score::Function`: A function to compute the similarity score between two bag-of-words
  vectors. Defaults to `score_L1`.
- `k::Int`: The number of top matches to return. Defaults to 10.

# Returns
- `Vector{Pair{Symbol, Float64}}`: A vector of pairs where each pair consists of an image
  label and its corresponding similarity score, sorted in descending order of similarity.
  At most `k` matches are returned (fewer when the DB holds fewer than `k` images).
"""
function findkImages_BF(image_index, image_labels, bowvec, score = score_L1; k = 10)
    matches = Vector{Pair{Symbol, Float64}}(undef, size(image_index, 2))
    Threads.@threads for i in eachindex(matches)
        vdb = image_index[:, i]
        matches[i] = image_labels[i] => score(vdb, bowvec)
    end
    sort!(matches; by = last, rev = true)
    # `first` does not throw when the DB holds fewer than k images
    return first(matches, k)
end