Skip to content

Commit

Permalink
Hamming distance starting node labels (#47)
Browse files Browse the repository at this point in the history
Refactor and add original method to find initial node labels from paper
  • Loading branch information
dufourc1 authored Nov 28, 2023
1 parent 8cb8036 commit f6c7982
Show file tree
Hide file tree
Showing 16 changed files with 492 additions and 107 deletions.
7 changes: 4 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
name = "NetworkHistogram"
uuid = "7806f430-7229-459c-b2e6-df35e8e4eb5d"
authors = ["Charles Dufour", "Jake Grainger"]
version = "0.5.0"
version = "0.5.1"

[deps]
ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d"
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -17,7 +19,6 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"

[compat]
julia = "1.8"
Arpack = "0.5.4"
BenchmarkTools = "1.3.2"
CodecZstd = "0.7.2"
Expand All @@ -27,7 +28,7 @@ ProgressMeter = "1.7.2"
StatsBase = "0.33.21"
TranscodingStreams = "0.9.11"
ValueHistories = "0.5.4"

julia = "1.8"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
6 changes: 4 additions & 2 deletions src/NetworkHistogram.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
module NetworkHistogram

using ValueHistories, StatsBase, Random, LinearAlgebra
using ValueHistories, StatsBase, Random, LinearAlgebra, Kronecker

using Arpack: eigs
using ArnoldiMethod: partialschur, partialeigen, SR, LR

export graphhist, PreviousBestValue, Strict, RandomNodeSwap
export OrderedStart, RandomStart, EigenStart
export OrderedStart, RandomStart, EigenStart, DistStart

include("group_numbering.jl")
include("assignment.jl")
Expand All @@ -15,6 +16,7 @@ include("config_rules/starting_assignment_rule.jl")
include("config_rules/swap_rule.jl")
include("config_rules/accept_rule.jl")
include("config_rules/stop_rule.jl")
include("config_rules/bandwidth_selection_rule.jl")

include("optimize.jl")
include("histogram.jl")
Expand Down
109 changes: 103 additions & 6 deletions src/assignment.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
mutable struct Assignment{T}
mutable struct Assignment{T, M}
const group_size::GroupSize{T}

const node_labels::Vector{Int}
const counts::Matrix{Int}
const realized::Matrix{Float64}
const estimated_theta::Matrix{Float64}
const realized::Array{Float64, M}
const estimated_theta::Array{Float64, M}
const number_layers::Int

likelihood::Float64

function Assignment(A, node_labels, group_size::GroupSize{T}) where {T}
M = ndims(A)
number_groups = length(group_size)
estimated_theta = zeros(Float64, size(A, 1), number_groups)

counts = zeros(Int64, number_groups, number_groups)
realized = zeros(Int64, number_groups, number_groups)
Expand All @@ -33,15 +34,84 @@ mutable struct Assignment{T}
likelihood = compute_log_likelihood(number_groups, estimated_theta, counts,
size(A, 1))

new{T}(group_size,
new{T, M}(group_size,
node_labels,
counts,
realized,
estimated_theta,
1,
likelihood)
end

function Assignment(A::Array{I, 3},
node_labels,
group_size::GroupSize{T}) where {I, T}
M = ndims(A)
number_groups = length(group_size)

counts = zeros(Int64, number_groups, number_groups)
realized = zeros(Int64, number_groups, number_groups, 2^size(A, 3))

A_updated = zeros(Int64, size(A, 1), size(A, 2))
for i in 1:size(A, 1)
for j in (i + 1):size(A, 2)
A_updated[i, j] = _binary_to_index(A[i, j, :])
A_updated[j, i] = A_updated[i, j]
end
end

@inbounds @simd for m in 1:size(realized, 3)
for k in 1:number_groups
for l in k:number_groups
realized[k, l, m] = sum(A_updated[node_labels .== k,
node_labels .== l] .== m)
realized[l, k, m] = realized[k, l, m]
counts[k, l] = group_size[k] * group_size[l]
counts[l, k] = counts[k, l]
end
end
end

@inbounds @simd for m in 1:size(realized, 3)
for k in 1:number_groups
counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2
realized[k, k, m] = sum(A_updated[node_labels .== k, node_labels .== k] .==
m) ÷ 2
end
end
estimated_theta = realized ./ counts
likelihood = compute_multivariate_log_likelihood(number_groups,
estimated_theta,
realized)

new{T, M}(group_size,
node_labels,
counts,
realized,
estimated_theta,
size(A, 3),
likelihood)
end
end

function _binary_to_index(binary_vector::Vector{Int})
total = 1
for i in 1:length(binary_vector)
total += binary_vector[i] * 2^(i - 1)
end
return total
end

function _index_to_binary(index::Int, M)
binary_vector = zeros(Int, M)
index -= 1
for i in 1:M
binary_vector[i] = index % 2
index = index ÷ 2
end
return binary_vector
end

"""
compute_log_likelihood(number_groups, estimated_theta, counts, number_nodes)
Expand All @@ -66,6 +136,22 @@ function compute_log_likelihood(number_groups, estimated_theta, counts, number_n
return loglik
end

function compute_multivariate_log_likelihood(number_groups, estimated_theta, realized)
loglik = 0.0
@inbounds @simd for i in 1:number_groups
for j in i:number_groups
for m in 1:size(realized, 3)
if realized[i, j, m] != 0
θ = estimated_theta[i, j, m]
θ_c = θ <= 0 ? 1e-14 :>= 1 ? 1 - 1e-14 : θ)
loglik += log(θ_c) * realized[i, j, m]
end
end
end
end
return loglik
end

"""
compute_log_likelihood(assignment::Assignment)
Expand All @@ -82,13 +168,24 @@ where ``\\hat{\\theta}_{ab}`` is the estimated probability of an edge between co
\\hat{\\theta}_{ab} = \\frac{\\sum\\limits_{i<j} A_{ij} \\mathbb{1}(z_i = a, z_j = b) }{\\sum\\limits_{i<j} \\mathbb{1}(z_i = a, z_j = b)}.
```
"""
function compute_log_likelihood(assignment::Assignment)
function compute_log_likelihood(assignment::Assignment{T, 2}) where {T}
compute_log_likelihood(length(assignment.group_size),
assignment.estimated_theta,
assignment.counts,
sum(assignment.group_size))
end

function compute_log_likelihood(assignment::Assignment)
compute_multivariate_log_likelihood(length(assignment.group_size),
assignment.estimated_theta,
assignment.realized)
end

"""
deepcopy!(a::Assignment, b::Assignment)
Deep copy the attributes of `b` to `a`.
"""
function deepcopy!(a::Assignment, b::Assignment)
a.node_labels .= b.node_labels
a.counts .= b.counts
Expand Down
4 changes: 2 additions & 2 deletions src/config_rules/accept_rule.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ Return the updated `current` assignment based on the `accept_rule`.
accept_reject_update!

function accept_reject_update!(history::GraphOptimizationHistory, iteration::Int,
proposal::Assignment,
current::Assignment, accept_rule::Strict)
proposal::Assignment,
current::Assignment, ::Strict)
if proposal.likelihood > current.likelihood
deepcopy!(current, proposal)
end
Expand Down
61 changes: 61 additions & 0 deletions src/config_rules/bandwidth_selection_rule.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@

function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T}
h = oracle_bandwidth(A, type, alpha, c)
return max(2, min(size(A)[1], round(Int, h)))
end

function select_bandwidth(A::Array{T, 3}; type = "degs", alpha = 1, c = 1)::Int where {T}
hs = [select_bandwidth(A[:, :, i]; type, alpha, c) for i in 1:size(A, 3)]
@warn "Naive bandwidth selection for multilayer graph histogram: using minimum over layers"
h = max(2, min(size(A, 1), round(Int, minimum(hs))))
return h
end

"""
oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8))
Oracle bandwidth selection for graph histogram, using
```math
\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} \\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}},
```
where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``.
"""
function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8))
if type ["eigs", "degs"]
error("Invalid input type $(type)")
end

if alpha != 1
error("Currently only supports alpha = 1")
end

n = size(A, 1)
midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int,
(n ÷ 2 + c * sqrt(n))))
rhoHat_inv = inv(sum(A) / (n * (n - 1)))

# Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat);
if type == "eigs"
eig_res = eigs(A, nev = 1, which = :LM)
u = eig_res.vectors
mult = eig_res.values[1]
elseif type == "degs"
u = sum(A, dims = 2)
mult = (u' * A * u) / (sum(u .^ 2))^2
else
error("Invalid input type $(type)")
end

# Calculation bandwidth
u = sort(u, dims = 1)
uMid = u[midPt]
lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid

h = (2^(alpha + 1) * alpha * mult^2 *
(lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 *
rhoHat_inv)^(-1 / (2 * (alpha + 1)))
#estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2
return h[1]
end
8 changes: 8 additions & 0 deletions src/config_rules/starting_assignment_rule.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ abstract type StartingAssignment end
struct OrderedStart <: StartingAssignment end
struct RandomStart <: StartingAssignment end
struct EigenStart <: StartingAssignment end
struct DistStart <: StartingAssignment end

"""
initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment)
Expand All @@ -13,6 +14,7 @@ node labels and a `GroupSize` object.
- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`.
- `RandomStart()`: Randomly assign nodes to groups.
- `EigenStart()`: Assign nodes to groups based on the second eigenvector of the normalized Laplacian.
- `DistStart()`: Assign nodes to groups based on the Hamming distance between rows of `A`.
"""
initialize_node_labels

Expand Down Expand Up @@ -47,3 +49,9 @@ function initialize_node_labels(A, h, ::EigenStart)
end
return node_labels, group_size
end

function initialize_node_labels(A, h, ::DistStart)
group_size = GroupSize(size(A, 1), h)
node_labels = spectral_clustering(A, h)
return node_labels, group_size
end
4 changes: 2 additions & 2 deletions src/group_numbering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ struct GroupSize{T} <: AbstractVector{Int}
else
remainder_group = number_nodes - number_groups * standard_group
if remainder_group == 1
@warn "h has to be changes as only one node in remainder group"
@warn "h has to be changed, only one node in remainder group"
standard_group -= 1
remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them
if standard_group == 1
error("Standard group size now 1, please choose a new h value.")
error("Standard group size now 1, please choose a new value for h.")
end
end
new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1)
Expand Down
26 changes: 22 additions & 4 deletions src/histogram.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
struct GraphHist{T}
θ::Matrix{T}
struct GraphHist{T, M}
θ::Array{T, M}
node_labels::Vector{Int}
function GraphHist(a::Assignment)
num_layers::Int
function GraphHist(a::Assignment{T, M}) where {T, M}
θ = a.estimated_theta
node_labels = a.node_labels
new{typeof(θ[1])}(θ, node_labels)
new{typeof(θ[1]), M}(θ, node_labels, a.number_layers)
end
end

Expand All @@ -23,3 +24,20 @@ Contains the estimated network histogram and the node labels.
blockmodel approximation." Proceedings of the National Academy of Sciences 111.41 (2014): 14722-14727.
"""
GraphHist

function get_moment_representation(g::GraphHist{T, 2}) where {T}
return g.θ
end

function get_moment_representation(g::GraphHist{T, 3}) where {T}
moments = zeros(size(g.θ, 1), size(g.θ, 2), 2^g.num_layers - 1)
transition = collect(kronecker([1 1; 0 1], g.num_layers))
for i in 1:size(g.θ, 1)
for j in 1:size(g.θ, 2)
moments[i, j, :] .= (transition * g.θ[i, j, :])[2:end]
end
end
indices_for_moments = [findall(x -> x == 1, _index_to_binary(e, g.num_layers))
for e in 2:size(g.θ, 3)]
return moments, indices_for_moments
end
Loading

2 comments on commit f6c7982

@dufourc1
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/96057

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.5.1 -m "<description of version>" f6c798223e6920725787095cb6b2de86a0f916ed
git push origin v0.5.1

Please sign in to comment.