Hamming distance starting node labels (#47)

Refactor and add original method to find initial node labels from paper
SDS-EPFL · Nov 28, 2023 · f6c7982 · f6c7982 · dufourc1 · Nov 28, 2023
1 parent 8cb8036
commit f6c7982
Show file tree

Hide file tree

Showing 16 changed files with 492 additions and 107 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,14 +1,16 @@
 name = "NetworkHistogram"
 uuid = "7806f430-7229-459c-b2e6-df35e8e4eb5d"
 authors = ["Charles Dufour", "Jake Grainger"]
-version = "0.5.0"
+version = "0.5.1"
 
 [deps]
+ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d"
 Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
+Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -17,7 +19,6 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
 ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"
 
 [compat]
-julia = "1.8"
 Arpack = "0.5.4"
 BenchmarkTools = "1.3.2"
 CodecZstd = "0.7.2"
@@ -27,7 +28,7 @@ ProgressMeter = "1.7.2"
 StatsBase = "0.33.21"
 TranscodingStreams = "0.9.11"
 ValueHistories = "0.5.4"
-
+julia = "1.8"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl
@@ -1,11 +1,12 @@
 module NetworkHistogram
 
-using ValueHistories, StatsBase, Random, LinearAlgebra
+using ValueHistories, StatsBase, Random, LinearAlgebra, Kronecker
 
 using Arpack: eigs
+using ArnoldiMethod: partialschur, partialeigen, SR, LR
 
 export graphhist, PreviousBestValue, Strict, RandomNodeSwap
-export OrderedStart, RandomStart, EigenStart
+export OrderedStart, RandomStart, EigenStart, DistStart
 
 include("group_numbering.jl")
 include("assignment.jl")
@@ -15,6 +16,7 @@ include("config_rules/starting_assignment_rule.jl")
 include("config_rules/swap_rule.jl")
 include("config_rules/accept_rule.jl")
 include("config_rules/stop_rule.jl")
+include("config_rules/bandwidth_selection_rule.jl")
 
 include("optimize.jl")
 include("histogram.jl")

diff --git a/src/assignment.jl b/src/assignment.jl
@@ -1,16 +1,17 @@
-mutable struct Assignment{T}
+mutable struct Assignment{T, M}
     const group_size::GroupSize{T}
 
     const node_labels::Vector{Int}
     const counts::Matrix{Int}
-    const realized::Matrix{Float64}
-    const estimated_theta::Matrix{Float64}
+    const realized::Array{Float64, M}
+    const estimated_theta::Array{Float64, M}
+    const number_layers::Int
 
     likelihood::Float64
 
     function Assignment(A, node_labels, group_size::GroupSize{T}) where {T}
+        M = ndims(A)
         number_groups = length(group_size)
-        estimated_theta = zeros(Float64, size(A, 1), number_groups)
 
         counts = zeros(Int64, number_groups, number_groups)
         realized = zeros(Int64, number_groups, number_groups)
@@ -33,15 +34,84 @@ mutable struct Assignment{T}
         likelihood = compute_log_likelihood(number_groups, estimated_theta, counts,
             size(A, 1))
 
-        new{T}(group_size,
+        new{T, M}(group_size,
             node_labels,
             counts,
             realized,
             estimated_theta,
+            1,
+            likelihood)
+    end
+
+    function Assignment(A::Array{I, 3},
+            node_labels,
+            group_size::GroupSize{T}) where {I, T}
+        M = ndims(A)
+        number_groups = length(group_size)
+
+        counts = zeros(Int64, number_groups, number_groups)
+        realized = zeros(Int64, number_groups, number_groups, 2^size(A, 3))
+
+        A_updated = zeros(Int64, size(A, 1), size(A, 2))
+        for i in 1:size(A, 1)
+            for j in (i + 1):size(A, 2)
+                A_updated[i, j] = _binary_to_index(A[i, j, :])
+                A_updated[j, i] = A_updated[i, j]
+            end
+        end
+
+        @inbounds @simd for m in 1:size(realized, 3)
+            for k in 1:number_groups
+                for l in k:number_groups
+                    realized[k, l, m] = sum(A_updated[node_labels .== k,
+                        node_labels .== l] .== m)
+                    realized[l, k, m] = realized[k, l, m]
+                    counts[k, l] = group_size[k] * group_size[l]
+                    counts[l, k] = counts[k, l]
+                end
+            end
+        end
+
+        @inbounds @simd for m in 1:size(realized, 3)
+            for k in 1:number_groups
+                counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2
+                realized[k, k, m] = sum(A_updated[node_labels .== k, node_labels .== k] .==
+                                        m) ÷ 2
+            end
+        end
+        estimated_theta = realized ./ counts
+        likelihood = compute_multivariate_log_likelihood(number_groups,
+            estimated_theta,
+            realized)
+
+        new{T, M}(group_size,
+            node_labels,
+            counts,
+            realized,
+            estimated_theta,
+            size(A, 3),
             likelihood)
     end
 end
 
+function _binary_to_index(binary_vector::Vector{Int})
+    total = 1
+    for i in 1:length(binary_vector)
+        total += binary_vector[i] * 2^(i - 1)
+    end
+    return total
+end
+
+function _index_to_binary(index::Int, M)
+    binary_vector = zeros(Int, M)
+    index -= 1
+    for i in 1:M
+        binary_vector[i] = index % 2
+        index = index ÷ 2
+    end
+    return binary_vector
+end
+
 """
     compute_log_likelihood(number_groups, estimated_theta, counts, number_nodes)
 
@@ -66,6 +136,22 @@ function compute_log_likelihood(number_groups, estimated_theta, counts, number_n
     return loglik
 end
 
+function compute_multivariate_log_likelihood(number_groups, estimated_theta, realized)
+    loglik = 0.0
+    @inbounds @simd for i in 1:number_groups
+        for j in i:number_groups
+            for m in 1:size(realized, 3)
+                if realized[i, j, m] != 0
+                    θ = estimated_theta[i, j, m]
+                    θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 1 - 1e-14 : θ)
+                    loglik += log(θ_c) * realized[i, j, m]
+                end
+            end
+        end
+    end
+    return loglik
+end
+
 """
     compute_log_likelihood(assignment::Assignment)
 
@@ -82,13 +168,24 @@ where ``\\hat{\\theta}_{ab}`` is the estimated probability of an edge between co
     \\hat{\\theta}_{ab} = \\frac{\\sum\\limits_{i<j} A_{ij} \\mathbb{1}(z_i = a, z_j = b) }{\\sum\\limits_{i<j} \\mathbb{1}(z_i = a, z_j = b)}.
 ```
 """
-function compute_log_likelihood(assignment::Assignment)
+function compute_log_likelihood(assignment::Assignment{T, 2}) where {T}
     compute_log_likelihood(length(assignment.group_size),
         assignment.estimated_theta,
         assignment.counts,
         sum(assignment.group_size))
 end
 
+function compute_log_likelihood(assignment::Assignment)
+    compute_multivariate_log_likelihood(length(assignment.group_size),
+        assignment.estimated_theta,
+        assignment.realized)
+end
+
+"""
+    deepcopy!(a::Assignment, b::Assignment)
+
+Deep copy the attributes of `b` to `a`.
+"""
 function deepcopy!(a::Assignment, b::Assignment)
     a.node_labels .= b.node_labels
     a.counts .= b.counts

diff --git a/src/config_rules/accept_rule.jl b/src/config_rules/accept_rule.jl
@@ -15,8 +15,8 @@ Return the updated `current` assignment based on the `accept_rule`.
 accept_reject_update!
 
 function accept_reject_update!(history::GraphOptimizationHistory, iteration::Int,
-    proposal::Assignment,
-    current::Assignment, accept_rule::Strict)
+        proposal::Assignment,
+        current::Assignment, ::Strict)
     if proposal.likelihood > current.likelihood
         deepcopy!(current, proposal)
     end

diff --git a/src/config_rules/bandwidth_selection_rule.jl b/src/config_rules/bandwidth_selection_rule.jl
@@ -0,0 +1,61 @@
+
+function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T}
+    h = oracle_bandwidth(A, type, alpha, c)
+    return max(2, min(size(A)[1], round(Int, h)))
+end
+
+function select_bandwidth(A::Array{T, 3}; type = "degs", alpha = 1, c = 1)::Int where {T}
+    hs = [select_bandwidth(A[:, :, i]; type, alpha, c) for i in 1:size(A, 3)]
+    @warn "Naive bandwidth selection for multilayer graph histogram: using minimum over layers"
+    h = max(2, min(size(A, 1), round(Int, minimum(hs))))
+    return h
+end
+
+"""
+    oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8))
+
+Oracle bandwidth selection for graph histogram, using
+
+```math
+\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} \\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}},
+```
+
+where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the empirical edge density, and  ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``.
+"""
+function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8))
+    if type ∉ ["eigs", "degs"]
+        error("Invalid input type $(type)")
+    end
+
+    if alpha != 1
+        error("Currently only supports alpha = 1")
+    end
+
+    n = size(A, 1)
+    midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int,
+        (n ÷ 2 + c * sqrt(n))))
+    rhoHat_inv = inv(sum(A) / (n * (n - 1)))
+
+    # Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat);
+    if type == "eigs"
+        eig_res = eigs(A, nev = 1, which = :LM)
+        u = eig_res.vectors
+        mult = eig_res.values[1]
+    elseif type == "degs"
+        u = sum(A, dims = 2)
+        mult = (u' * A * u) / (sum(u .^ 2))^2
+    else
+        error("Invalid input type $(type)")
+    end
+
+    # Calculation bandwidth
+    u = sort(u, dims = 1)
+    uMid = u[midPt]
+    lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid
+
+    h = (2^(alpha + 1) * alpha * mult^2 *
+         (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 *
+         rhoHat_inv)^(-1 / (2 * (alpha + 1)))
+    #estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2
+    return h[1]
+end
diff --git a/src/config_rules/starting_assignment_rule.jl b/src/config_rules/starting_assignment_rule.jl
@@ -2,6 +2,7 @@ abstract type StartingAssignment end
 struct OrderedStart <: StartingAssignment end
 struct RandomStart <: StartingAssignment end
 struct EigenStart <: StartingAssignment end
+struct DistStart <: StartingAssignment end
 
 """
     initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment)
@@ -13,6 +14,7 @@ node labels and a `GroupSize` object.
 - `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`.
 - `RandomStart()`: Randomly assign nodes to groups.
 - `EigenStart()`: Assign nodes to groups based on the second eigenvector of the normalized Laplacian.
+- `DistStart()`: Assign nodes to groups based on the Hamming distance between rows of `A`.
 """
 initialize_node_labels
 
@@ -47,3 +49,9 @@ function initialize_node_labels(A, h, ::EigenStart)
     end
     return node_labels, group_size
 end
+
+function initialize_node_labels(A, h, ::DistStart)
+    group_size = GroupSize(size(A, 1), h)
+    node_labels = spectral_clustering(A, h)
+    return node_labels, group_size
+end
diff --git a/src/group_numbering.jl b/src/group_numbering.jl
@@ -19,11 +19,11 @@ struct GroupSize{T} <: AbstractVector{Int}
         else
             remainder_group = number_nodes - number_groups * standard_group
             if remainder_group == 1
-                @warn "h has to be changes as only one node in remainder group"
+                @warn "h has to be changed, only one node in remainder group"
                 standard_group -= 1
                 remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them
                 if standard_group == 1
-                    error("Standard group size now 1, please choose a new h value.")
+                    error("Standard group size now 1, please choose a new value for h.")
                 end
             end
             new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1)

diff --git a/src/histogram.jl b/src/histogram.jl
@@ -1,10 +1,11 @@
-struct GraphHist{T}
-    θ::Matrix{T}
+struct GraphHist{T, M}
+    θ::Array{T, M}
     node_labels::Vector{Int}
-    function GraphHist(a::Assignment)
+    num_layers::Int
+    function GraphHist(a::Assignment{T, M}) where {T, M}
         θ = a.estimated_theta
         node_labels = a.node_labels
-        new{typeof(θ[1])}(θ, node_labels)
+        new{typeof(θ[1]), M}(θ, node_labels, a.number_layers)
     end
 end
 
@@ -23,3 +24,20 @@ Contains the estimated network histogram and the node labels.
 blockmodel approximation." Proceedings of the National Academy of Sciences 111.41 (2014): 14722-14727.
 """
 GraphHist
+
+function get_moment_representation(g::GraphHist{T, 2}) where {T}
+    return g.θ
+end
+
+function get_moment_representation(g::GraphHist{T, 3}) where {T}
+    moments = zeros(size(g.θ, 1), size(g.θ, 2), 2^g.num_layers - 1)
+    transition = collect(kronecker([1 1; 0 1], g.num_layers))
+    for i in 1:size(g.θ, 1)
+        for j in 1:size(g.θ, 2)
+            moments[i, j, :] .= (transition * g.θ[i, j, :])[2:end]
+        end
+    end
+    indices_for_moments = [findall(x -> x == 1, _index_to_binary(e, g.num_layers))
+                           for e in 2:size(g.θ, 3)]
+    return moments, indices_for_moments
+end