Skip to content

Commit

Permalink
Merge pull request #70 from CarloLucibello/cl/negative
Browse files Browse the repository at this point in the history
improvements to link prediction
  • Loading branch information
CarloLucibello authored Nov 5, 2021
2 parents 683c8b7 + 51e7894 commit d5c7a80
Show file tree
Hide file tree
Showing 14 changed files with 272 additions and 53 deletions.
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ makedocs(;
"Graphs" => "gnngraph.md",
"Message Passing" => "messagepassing.md",
"Model Building" => "models.md",
"Datasets" => "datasets.md",
"API Reference" =>
[
"GNNGraph" => "api/gnngraph.md",
Expand Down
4 changes: 4 additions & 0 deletions docs/src/datasets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Datasets

GNN.jl doesn't come with its own datasets, but leverages those available in the Julia (and non-Julia) ecosystem. In particular, the [examples in the GNN.jl repository](https://github.com/CarloLucibello/GraphNeuralNetworks.jl/tree/master/examples) make use of the [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) package. There you will find common graph datasets such as Cora, PubMed, and Citeseer.
Also MLDatasets gives access to the [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) repository and its numerous datasets.
41 changes: 22 additions & 19 deletions examples/link_prediction_pubmed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,23 +49,20 @@ function train(; kws...)
### LOAD DATA
data = Cora.dataset()
# data = PubMed.dataset()
g = GNNGraph(data.adjacency_list) |> device
g = GNNGraph(data.adjacency_list)
@info g
@show is_bidirected(g)
@show has_self_loops(g)
@show has_multi_edges(g)
@show mean(degree(g))
isbidir = is_bidirected(g)

g = g |> device
X = data.node_features |> device

#### SPLIT INTO NEGATIVE AND POSITIVE SAMPLES
s, t = edge_index(g)
eids = randperm(g.num_edges)
test_size = round(Int, g.num_edges * 0.1)

test_pos_s, test_pos_t = s[eids[1:test_size]], t[eids[1:test_size]]
test_pos_g = GNNGraph(test_pos_s, test_pos_t, num_nodes=g.num_nodes)

train_pos_s, train_pos_t = s[eids[test_size+1:end]], t[eids[test_size+1:end]]
train_pos_g = GNNGraph(train_pos_s, train_pos_t, num_nodes=g.num_nodes)

test_neg_g = negative_sample(g, num_neg_edges=test_size)

train_pos_g, test_pos_g = rand_edge_split(g, 0.9)
test_neg_g = negative_sample(g, num_neg_edges=test_pos_g.num_edges, bidirected=isbidir)

### DEFINE MODEL #########
nin, nhidden = size(X,1), args.nhidden
Expand All @@ -82,24 +79,30 @@ function train(; kws...)

### LOSS FUNCTION ############

function loss(pos_g, neg_g = nothing)
function loss(pos_g, neg_g = nothing; with_accuracy=false)
h = model(X)
if neg_g === nothing
# We sample a negative graph at each training step
neg_g = negative_sample(pos_g)
neg_g = negative_sample(pos_g, bidirected=isbidir)
end
pos_score = pred(pos_g, h)
neg_score = pred(neg_g, h)
scores = [pos_score; neg_score]
labels = [fill!(similar(pos_score), 1); fill!(similar(neg_score), 0)]
return logitbinarycrossentropy(scores, labels)
l = logitbinarycrossentropy(scores, labels)
if with_accuracy
acc = 0.5 * mean(pos_score .>= 0) + 0.5 * mean(neg_score .< 0)
return l, acc
else
return l
end
end

### LOGGING FUNCTION
function report(epoch)
train_loss = loss(train_pos_g)
test_loss = loss(test_pos_g, test_neg_g)
println("Epoch: $epoch Train: $(train_loss) Test: $(test_loss)")
train_loss, train_acc = loss(train_pos_g, with_accuracy=true)
test_loss, test_acc = loss(test_pos_g, test_neg_g, with_accuracy=true)
println("Epoch: $epoch $((; train_loss, train_acc)) $((; test_loss, test_acc))")
end

### TRAINING
Expand Down
24 changes: 15 additions & 9 deletions src/GNNGraphs/GNNGraphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,29 @@ export GNNGraph,
graph_features

include("query.jl")
export edge_index,
adjacency_list,
normalized_laplacian,
scaled_laplacian,
export adjacency_list,
edge_index,
graph_indicator,
has_multi_edges,
is_bidirected,
normalized_laplacian,
scaled_laplacian,
# from Graphs
adjacency_matrix,
degree,
outneighbors,
inneighbors
has_self_loops,
inneighbors,
outneighbors

include("transform.jl")
export add_nodes,
add_edges,
add_self_loops,
remove_self_loops,
remove_multi_edges,
add_self_loops,
getgraph,
negative_sample,
rand_edge_split,
remove_self_loops,
remove_multi_edges,
# from Flux
batch,
unbatch,
Expand All @@ -51,6 +54,9 @@ export add_nodes,
include("generate.jl")
export rand_graph

include("operators.jl")
# Base.intersect

include("convert.jl")
include("utils.jl")

Expand Down
7 changes: 4 additions & 3 deletions src/GNNGraphs/convert.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ function to_coo(coo::COO_T; dir=:out, num_nodes=nothing)
num_nodes = isnothing(num_nodes) ? max(maximum(s), maximum(t)) : num_nodes
@assert isnothing(val) || length(val) == length(s)
@assert length(s) == length(t)
@assert min(minimum(s), minimum(t)) >= 1
@assert max(maximum(s), maximum(t)) <= num_nodes

if !isempty(s)
@assert min(minimum(s), minimum(t)) >= 1
@assert max(maximum(s), maximum(t)) <= num_nodes
end
num_edges = length(s)
return coo, num_nodes, num_edges
end
Expand Down
5 changes: 5 additions & 0 deletions src/GNNGraphs/gnngraph.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ function GNNGraph(data;
ndata, edata, gdata)
end

# Edgeless-graph constructor: builds a graph with `n` nodes and no edges.
# The empty source/target vectors use the same integer type as `n`, and all
# remaining keyword arguments are forwarded to the COO constructor.
function GNNGraph(n::T; graph_type=:coo, kws...) where {T<:Integer}
    src = Vector{T}()
    dst = Vector{T}()
    return GNNGraph(src, dst; graph_type=graph_type, num_nodes=n, kws...)
end

# COO convenience constructors
GNNGraph(s::AbstractVector, t::AbstractVector, v = nothing; kws...) = GNNGraph((s, t, v); kws...)
GNNGraph((s, t)::NTuple{2}; kws...) = GNNGraph((s, t, nothing); kws...)
Expand Down
13 changes: 13 additions & 0 deletions src/GNNGraphs/operators.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Graph operators taking two or more graphs as arguments.

# Intersection of two graphs defined on the same node set: the result holds
# the edges whose integer encoding appears in both `g1` and `g2`.
# Both graphs must have the same number of nodes and the same graph type.
function Base.intersect(g1::GNNGraph, g2::GNNGraph)
    @assert g1.num_nodes == g2.num_nodes
    @assert graph_type_symbol(g1) == graph_type_symbol(g2)
    num_nodes = g1.num_nodes
    graph_type = graph_type_symbol(g1)

    # Map every edge (s, t) to a single integer id so that the edge sets
    # can be intersected as plain integer collections.
    s1, t1 = edge_index(g1)
    s2, t2 = edge_index(g2)
    ids1 = first(edge_encoding(s1, t1, num_nodes))
    ids2 = first(edge_encoding(s2, t2, num_nodes))
    common = intersect(ids1, ids2)

    # Back from integer ids to source/target vectors.
    s, t = edge_decoding(common, num_nodes)
    return GNNGraph(s, t; num_nodes=num_nodes, graph_type=graph_type)
end
37 changes: 33 additions & 4 deletions src/GNNGraphs/query.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ end

Graphs.has_edge(g::GNNGraph{<:ADJMAT_T}, i::Integer, j::Integer) = g.graph[i,j] != 0

# Symbol naming the storage backend of a graph, selected by dispatch on the
# `GNNGraph` type parameter. The value is suitable for the `graph_type`
# keyword of the `GNNGraph` constructors.
graph_type_symbol(::GNNGraph{<:COO_T}) = :coo
graph_type_symbol(::GNNGraph{<:SPARSE_T}) = :sparse
graph_type_symbol(::GNNGraph{<:ADJMAT_T}) = :dense

Graphs.nv(g::GNNGraph) = g.num_nodes
Graphs.ne(g::GNNGraph) = g.num_edges
Graphs.has_vertex(g::GNNGraph, i::Int) = 1 <= i <= g.num_nodes
Expand Down Expand Up @@ -243,10 +247,35 @@ function is_bidirected(g::GNNGraph)
all((s1 .== s2) .& (t1 .== t2))
end

@non_differentiable normalized_laplacian(x...)
@non_differentiable normalized_adjacency(x...)
@non_differentiable scaled_laplacian(x...)
@non_differentiable adjacency_matrix(x...)
"""
has_self_loops(g::GNNGraph)
Return `true` if `g` has any self loops.
"""
function Graphs.has_self_loops(g::GNNGraph)
    src, dst = edge_index(g)
    # An edge is a self-loop when its source and target coincide.
    loop_mask = src .== dst
    return any(loop_mask)
end

"""
has_multi_edges(g::GNNGraph)
Return `true` if `g` has any multiple edges.
"""
function has_multi_edges(g::GNNGraph)
    s, t = edge_index(g)
    # `edge_encoding` returns `(idx, maxid)`; only the per-edge ids are needed.
    # Without destructuring, the comparison below would be done on the
    # 2-tuple itself and would always evaluate to `false`.
    idxs, _ = edge_encoding(s, t, g.num_nodes)
    # Two edges with the same (source, target) pair share the same id, so the
    # graph has multi-edges iff deduplication shrinks the id collection.
    return length(union(idxs)) < length(idxs)
end


@non_differentiable adjacency_list(x...)
@non_differentiable adjacency_matrix(x...)
@non_differentiable degree(x...)
@non_differentiable graph_indicator(x...)
@non_differentiable has_multi_edges(x...)
@non_differentiable Graphs.has_self_loops(x...)
@non_differentiable is_bidirected(x...)
@non_differentiable normalized_adjacency(x...)
@non_differentiable normalized_laplacian(x...)
@non_differentiable scaled_laplacian(x...)
66 changes: 48 additions & 18 deletions src/GNNGraphs/transform.jl
Original file line number Diff line number Diff line change
Expand Up @@ -321,13 +321,21 @@ function getgraph(g::GNNGraph, i::AbstractVector{Int}; nmap=false)
end

"""
negative_sample(g::GNNGraph; num_neg_edges=g.num_edges)
negative_sample(g::GNNGraph;
num_neg_edges = g.num_edges,
bidirected = is_bidirected(g))
Return a graph containing random negative edges (i.e. non-edges) from graph `g` as edges.
If `bidirected=true`, the output graph will be bidirected and there will be no
leakage from the origin graph.
See also [`is_bidirected`](@ref).
"""
function negative_sample(g::GNNGraph;
max_trials=3,
num_neg_edges=g.num_edges)
num_neg_edges=g.num_edges,
bidirected = is_bidirected(g))

@assert g.num_graphs == 1
# Consider self-loops as positive edges
Expand All @@ -344,8 +352,12 @@ function negative_sample(g::GNNGraph;
device = Flux.cpu
end
idx_pos, maxid = edge_encoding(s, t, n)

pneg = 1 - g.num_edges / maxid # prob of selecting negative edge
if bidirected
num_neg_edges = num_neg_edges ÷ 2
pneg = 1 - g.num_edges / 2maxid # prob of selecting negative edge
else
pneg = 1 - g.num_edges / 2maxid # prob of selecting negative edge
end
# pneg * sample_prob * maxid == num_neg_edges
sample_prob = min(1, num_neg_edges / (pneg * maxid) * 1.1)
idx_neg = Int[]
Expand All @@ -359,26 +371,44 @@ function negative_sample(g::GNNGraph;
end
end
s_neg, t_neg = edge_decoding(idx_neg, n)
if bidirected
s_neg, t_neg = [s_neg; t_neg], [t_neg; s_neg]
end
return GNNGraph(s_neg, t_neg, num_nodes=n) |> device
end

# Encode each directed edge (s, t) of an `n`-node graph as one integer
# in 1:n^2. Returns the encoded ids together with the largest possible id.
function edge_encoding(s, t, n)
    maxid = n^2
    idx = @. (s - 1) * n + t
    return idx, maxid
end
"""
rand_edge_split(g::GNNGraph, frac) -> g1, g2
Randomly partition the edges in `g` to form two graphs, `g1`
and `g2`. Both will have the same number of nodes as `g`.
`g1` will contain a fraction `frac` of the original edges,
while `g2` will contain the rest.
Useful for train/test splits in link prediction tasks.
"""
function rand_edge_split(g::GNNGraph, frac)
    # TODO add bidirected version
    s, t = edge_index(g)
    # Random permutation of the edge ids: the first `size1` go to `g1`,
    # the remaining ones to `g2`.
    eids = randperm(g.num_edges)
    size1 = round(Int, g.num_edges * frac)

    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
    g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)

    s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
    g2 = GNNGraph(s2, t2, num_nodes=g.num_nodes)

    return g1, g2
end


# """
# Transform vector of cartesian indexes into a tuple of vectors containing integers.
# """
Expand Down
56 changes: 56 additions & 0 deletions src/GNNGraphs/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,59 @@ end
ones_like(x::AbstractArray, T=eltype(x), sz=size(x)) = fill!(similar(x, T, sz), 1)
ones_like(x::SparseMatrixCSC, T=eltype(x), sz=size(x)) = ones(T, sz)
ones_like(x::CUMAT_T, T=eltype(x), sz=size(x)) = CUDA.ones(T, sz)


# Encode each edge (s, t) of an `n`-node graph as a single integer id.
# Returns `(idx, maxid)` where `idx` holds one id per edge and `maxid` is the
# largest id the encoding can produce:
#   - directed=true:  ids lie in 1:n^2
#   - directed=false: (i, j) and (j, i) share one id, ids lie in 1:n*(n+1)÷2
# Self-loops are allowed in both cases.
function edge_encoding(s, t, n; directed=true)
    if directed
        # directed edges and self-loops allowed
        idx = (s .- 1) .* n .+ t
        maxid = n^2
    else
        # Undirected edges and self-loops allowed
        maxid = n * (n + 1) ÷ 2

        # Order each pair so that i <= j. Broadcasting min/max avoids mutating
        # copies of the inputs, so immutable containers (e.g. ranges) work too.
        lo = min.(s, t)
        hi = max.(s, t)

        # With i = lo and j = hi:
        # idx = ∑_{i',i'<i} ∑_{j',j'>=i'}^n 1 + ∑_{j',i<=j'<=j} 1
        #     = ∑_{i',i'<i} (n - i' + 1) + (j - i + 1)
        #     = (i - 1)*(2*(n+1)-i)÷2 + (j - i + 1)
        idx = @. (lo-1)*(2*(n+1)-lo)÷2 + (hi-lo+1)
    end
    return idx, maxid
end

# Inverse of `edge_encoding`: turn integer edge ids back into `(s, t)`
# source/target collections for a graph with `n` nodes.
#   - directed=true:  ids are expected in 1:n^2
#   - directed=false: ids are expected in 1:n*(n+1)÷2 and the result has s <= t
function edge_decoding(idx, n; directed=true)
    if directed
        # idx = (s - 1) * n + t  with 1 <= t <= n
        s = (idx .- 1) .÷ n .+ 1
        t = (idx .- 1) .% n .+ 1
    else
        # The encoding is idx = (i - 1)*(2*(n+1)-i)÷2 + (j - i + 1), i <= j.
        # The last id of row i (j = n) is i*(2n + 1 - i)/2, so i is the
        # smallest integer with i*(2n + 1 - i)/2 >= idx, i.e.
        #   i = ceil(n + 1/2 - sqrt((n + 1/2)^2 - 2*idx))
        s = @. ceil(Int, -sqrt((n + 1/2)^2 - 2*idx) + n + 1/2)
        # Solve the encoding formula for j once i is known.
        t = @. idx - (s-1)*(2*(n+1)-s)÷2 - 1 + s
    end
    return s, t
end

@non_differentiable edge_encoding(x...)
@non_differentiable edge_decoding(x...)

6 changes: 6 additions & 0 deletions test/GNNGraphs/operators.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
@testset "Operators" begin
@testset "intersect" begin
g = rand_graph(10, 20, graph_type=GRAPH_T)
@test intersect(g, g).num_edges == 20
end
end
Loading

0 comments on commit d5c7a80

Please sign in to comment.