Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hamming distance starting point #47

Merged
merged 13 commits into from
Nov 28, 2023
7 changes: 4 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
name = "NetworkHistogram"
uuid = "7806f430-7229-459c-b2e6-df35e8e4eb5d"
authors = ["Charles Dufour", "Jake Grainger"]
version = "0.5.0"
version = "0.5.1"

[deps]
ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d"
Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8"
Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand All @@ -17,7 +19,6 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7"

[compat]
julia = "1.8"
Arpack = "0.5.4"
BenchmarkTools = "1.3.2"
CodecZstd = "0.7.2"
Expand All @@ -27,7 +28,7 @@ ProgressMeter = "1.7.2"
StatsBase = "0.33.21"
TranscodingStreams = "0.9.11"
ValueHistories = "0.5.4"

julia = "1.8"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
6 changes: 4 additions & 2 deletions src/NetworkHistogram.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
module NetworkHistogram

using ValueHistories, StatsBase, Random, LinearAlgebra
using ValueHistories, StatsBase, Random, LinearAlgebra, Kronecker

using Arpack: eigs
using ArnoldiMethod: partialschur, partialeigen, SR, LR

export graphhist, PreviousBestValue, Strict, RandomNodeSwap
export OrderedStart, RandomStart, EigenStart
export OrderedStart, RandomStart, EigenStart, DistStart

include("group_numbering.jl")
include("assignment.jl")
Expand All @@ -15,6 +16,7 @@ include("config_rules/starting_assignment_rule.jl")
include("config_rules/swap_rule.jl")
include("config_rules/accept_rule.jl")
include("config_rules/stop_rule.jl")
include("config_rules/bandwidth_selection_rule.jl")

include("optimize.jl")
include("histogram.jl")
Expand Down
109 changes: 103 additions & 6 deletions src/assignment.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
mutable struct Assignment{T}
mutable struct Assignment{T, M}
const group_size::GroupSize{T}

const node_labels::Vector{Int}
const counts::Matrix{Int}
const realized::Matrix{Float64}
const estimated_theta::Matrix{Float64}
const realized::Array{Float64, M}
const estimated_theta::Array{Float64, M}
const number_layers::Int

likelihood::Float64

function Assignment(A, node_labels, group_size::GroupSize{T}) where {T}
M = ndims(A)
number_groups = length(group_size)
estimated_theta = zeros(Float64, size(A, 1), number_groups)

counts = zeros(Int64, number_groups, number_groups)
realized = zeros(Int64, number_groups, number_groups)
Expand All @@ -33,15 +34,84 @@
likelihood = compute_log_likelihood(number_groups, estimated_theta, counts,
size(A, 1))

new{T}(group_size,
new{T, M}(group_size,
node_labels,
counts,
realized,
estimated_theta,
1,
likelihood)
end

function Assignment(A::Array{I, 3},
node_labels,
group_size::GroupSize{T}) where {I, T}
M = ndims(A)
number_groups = length(group_size)

counts = zeros(Int64, number_groups, number_groups)
realized = zeros(Int64, number_groups, number_groups, 2^size(A, 3))

A_updated = zeros(Int64, size(A, 1), size(A, 2))
for i in 1:size(A, 1)
for j in (i + 1):size(A, 2)
A_updated[i, j] = _binary_to_index(A[i, j, :])
A_updated[j, i] = A_updated[i, j]
end
end

@inbounds @simd for m in 1:size(realized, 3)
for k in 1:number_groups
for l in k:number_groups
realized[k, l, m] = sum(A_updated[node_labels .== k,
node_labels .== l] .== m)
realized[l, k, m] = realized[k, l, m]
counts[k, l] = group_size[k] * group_size[l]
counts[l, k] = counts[k, l]
end
end
end

Check warning on line 73 in src/assignment.jl

View check run for this annotation

Codecov / codecov/patch

src/assignment.jl#L73

Added line #L73 was not covered by tests

@inbounds @simd for m in 1:size(realized, 3)
for k in 1:number_groups
counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2
realized[k, k, m] = sum(A_updated[node_labels .== k, node_labels .== k] .==
m) ÷ 2
end
end

Check warning on line 81 in src/assignment.jl

View check run for this annotation

Codecov / codecov/patch

src/assignment.jl#L81

Added line #L81 was not covered by tests
estimated_theta = realized ./ counts
likelihood = compute_multivariate_log_likelihood(number_groups,
estimated_theta,
realized)

new{T, M}(group_size,
node_labels,
counts,
realized,
estimated_theta,
size(A, 3),
likelihood)
end
end

function _binary_to_index(binary_vector::Vector{Int})
total = 1
for i in 1:length(binary_vector)
total += binary_vector[i] * 2^(i - 1)
end
return total
end

function _index_to_binary(index::Int, M)
binary_vector = zeros(Int, M)
index -= 1
for i in 1:M
binary_vector[i] = index % 2
index = index ÷ 2
end
return binary_vector

Check warning on line 112 in src/assignment.jl

View check run for this annotation

Codecov / codecov/patch

src/assignment.jl#L105-L112

Added lines #L105 - L112 were not covered by tests
end

"""
compute_log_likelihood(number_groups, estimated_theta, counts, number_nodes)

Expand All @@ -66,6 +136,22 @@
return loglik
end

function compute_multivariate_log_likelihood(number_groups, estimated_theta, realized)
loglik = 0.0
@inbounds @simd for i in 1:number_groups
for j in i:number_groups
for m in 1:size(realized, 3)
if realized[i, j, m] != 0
θ = estimated_theta[i, j, m]
θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 1 - 1e-14 : θ)
loglik += log(θ_c) * realized[i, j, m]
end
end
end
end

Check warning on line 151 in src/assignment.jl

View check run for this annotation

Codecov / codecov/patch

src/assignment.jl#L151

Added line #L151 was not covered by tests
return loglik
end

"""
compute_log_likelihood(assignment::Assignment)

Expand All @@ -82,13 +168,24 @@
\\hat{\\theta}_{ab} = \\frac{\\sum\\limits_{i<j} A_{ij} \\mathbb{1}(z_i = a, z_j = b) }{\\sum\\limits_{i<j} \\mathbb{1}(z_i = a, z_j = b)}.
```
"""
function compute_log_likelihood(assignment::Assignment)
function compute_log_likelihood(assignment::Assignment{T, 2}) where {T}
compute_log_likelihood(length(assignment.group_size),
assignment.estimated_theta,
assignment.counts,
sum(assignment.group_size))
end

function compute_log_likelihood(assignment::Assignment)
compute_multivariate_log_likelihood(length(assignment.group_size),
assignment.estimated_theta,
assignment.realized)
end

"""
deepcopy!(a::Assignment, b::Assignment)

Deep copy the attributes of `b` to `a`.
"""
function deepcopy!(a::Assignment, b::Assignment)
a.node_labels .= b.node_labels
a.counts .= b.counts
Expand Down
4 changes: 2 additions & 2 deletions src/config_rules/accept_rule.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ Return the updated `current` assignment based on the `accept_rule`.
accept_reject_update!

function accept_reject_update!(history::GraphOptimizationHistory, iteration::Int,
proposal::Assignment,
current::Assignment, accept_rule::Strict)
proposal::Assignment,
current::Assignment, ::Strict)
if proposal.likelihood > current.likelihood
deepcopy!(current, proposal)
end
Expand Down
61 changes: 61 additions & 0 deletions src/config_rules/bandwidth_selection_rule.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@

function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T}
h = oracle_bandwidth(A, type, alpha, c)
return max(2, min(size(A)[1], round(Int, h)))
end

function select_bandwidth(A::Array{T, 3}; type = "degs", alpha = 1, c = 1)::Int where {T}
hs = [select_bandwidth(A[:, :, i]; type, alpha, c) for i in 1:size(A, 3)]
@warn "Naive bandwidth selection for multilayer graph histogram: using minimum over layers"
h = max(2, min(size(A, 1), round(Int, minimum(hs))))
return h
end

"""
oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8))

Oracle bandwidth selection for graph histogram, using

```math
\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} \\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}},
```

where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``.
"""
function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8))
if type ∉ ["eigs", "degs"]
error("Invalid input type $(type)")

Check warning on line 27 in src/config_rules/bandwidth_selection_rule.jl

View check run for this annotation

Codecov / codecov/patch

src/config_rules/bandwidth_selection_rule.jl#L27

Added line #L27 was not covered by tests
end

if alpha != 1
error("Currently only supports alpha = 1")

Check warning on line 31 in src/config_rules/bandwidth_selection_rule.jl

View check run for this annotation

Codecov / codecov/patch

src/config_rules/bandwidth_selection_rule.jl#L31

Added line #L31 was not covered by tests
end

n = size(A, 1)
midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int,
(n ÷ 2 + c * sqrt(n))))
rhoHat_inv = inv(sum(A) / (n * (n - 1)))

# Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat);
if type == "eigs"
eig_res = eigs(A, nev = 1, which = :LM)
u = eig_res.vectors
mult = eig_res.values[1]

Check warning on line 43 in src/config_rules/bandwidth_selection_rule.jl

View check run for this annotation

Codecov / codecov/patch

src/config_rules/bandwidth_selection_rule.jl#L41-L43

Added lines #L41 - L43 were not covered by tests
elseif type == "degs"
u = sum(A, dims = 2)
mult = (u' * A * u) / (sum(u .^ 2))^2
else
error("Invalid input type $(type)")

Check warning on line 48 in src/config_rules/bandwidth_selection_rule.jl

View check run for this annotation

Codecov / codecov/patch

src/config_rules/bandwidth_selection_rule.jl#L48

Added line #L48 was not covered by tests
end

# Calculation bandwidth
u = sort(u, dims = 1)
uMid = u[midPt]
lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid

h = (2^(alpha + 1) * alpha * mult^2 *
(lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 *
rhoHat_inv)^(-1 / (2 * (alpha + 1)))
#estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2
return h[1]
end
8 changes: 8 additions & 0 deletions src/config_rules/starting_assignment_rule.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ abstract type StartingAssignment end
struct OrderedStart <: StartingAssignment end
struct RandomStart <: StartingAssignment end
struct EigenStart <: StartingAssignment end
struct DistStart <: StartingAssignment end

"""
initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment)
Expand All @@ -13,6 +14,7 @@ node labels and a `GroupSize` object.
- `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`.
- `RandomStart()`: Randomly assign nodes to groups.
- `EigenStart()`: Assign nodes to groups based on the second eigenvector of the normalized Laplacian.
- `DistStart()`: Assign nodes to groups based on the Hamming distance between rows of `A`.
"""
initialize_node_labels

Expand Down Expand Up @@ -47,3 +49,9 @@ function initialize_node_labels(A, h, ::EigenStart)
end
return node_labels, group_size
end

function initialize_node_labels(A, h, ::DistStart)
group_size = GroupSize(size(A, 1), h)
node_labels = spectral_clustering(A, h)
return node_labels, group_size
end
4 changes: 2 additions & 2 deletions src/group_numbering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
else
remainder_group = number_nodes - number_groups * standard_group
if remainder_group == 1
@warn "h has to be changes as only one node in remainder group"
@warn "h has to be changed, only one node in remainder group"
standard_group -= 1
remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them
if standard_group == 1
error("Standard group size now 1, please choose a new h value.")
error("Standard group size now 1, please choose a new value for h.")

Check warning on line 26 in src/group_numbering.jl

View check run for this annotation

Codecov / codecov/patch

src/group_numbering.jl#L26

Added line #L26 was not covered by tests
end
end
new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1)
Expand Down
26 changes: 22 additions & 4 deletions src/histogram.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
struct GraphHist{T}
θ::Matrix{T}
struct GraphHist{T, M}
θ::Array{T, M}
node_labels::Vector{Int}
function GraphHist(a::Assignment)
num_layers::Int
function GraphHist(a::Assignment{T, M}) where {T, M}
θ = a.estimated_theta
node_labels = a.node_labels
new{typeof(θ[1])}(θ, node_labels)
new{typeof(θ[1]), M}(θ, node_labels, a.number_layers)
end
end

Expand All @@ -23,3 +24,20 @@
blockmodel approximation." Proceedings of the National Academy of Sciences 111.41 (2014): 14722-14727.
"""
GraphHist

function get_moment_representation(g::GraphHist{T, 2}) where {T}
return g.θ

Check warning on line 29 in src/histogram.jl

View check run for this annotation

Codecov / codecov/patch

src/histogram.jl#L28-L29

Added lines #L28 - L29 were not covered by tests
end

function get_moment_representation(g::GraphHist{T, 3}) where {T}
moments = zeros(size(g.θ, 1), size(g.θ, 2), 2^g.num_layers - 1)
transition = collect(kronecker([1 1; 0 1], g.num_layers))
for i in 1:size(g.θ, 1)
for j in 1:size(g.θ, 2)
moments[i, j, :] .= (transition * g.θ[i, j, :])[2:end]
end
end
indices_for_moments = [findall(x -> x == 1, _index_to_binary(e, g.num_layers))

Check warning on line 40 in src/histogram.jl

View check run for this annotation

Codecov / codecov/patch

src/histogram.jl#L32-L40

Added lines #L32 - L40 were not covered by tests
for e in 2:size(g.θ, 3)]
return moments, indices_for_moments

Check warning on line 42 in src/histogram.jl

View check run for this annotation

Codecov / codecov/patch

src/histogram.jl#L42

Added line #L42 was not covered by tests
end
Loading
Loading