From f96f58e25e848066abbebaeb9d5f6d0f9034e676 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Fri, 5 Nov 2021 10:50:17 +0100 Subject: [PATCH 1/3] wip on rand_split_edge --- examples/link_prediction_pubmed.jl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl index 1fcf0a4a0..eb1533ab8 100644 --- a/examples/link_prediction_pubmed.jl +++ b/examples/link_prediction_pubmed.jl @@ -28,7 +28,7 @@ struct DotPredictor end function (::DotPredictor)(g, x) z = apply_edges((xi, xj, e) -> sum(xi .* xj, dims=1), g, xi=x, xj=x) - # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with buit-in methods + # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with built-in method return vec(z) end @@ -50,6 +50,8 @@ function train(; kws...) data = Cora.dataset() # data = PubMed.dataset() g = GNNGraph(data.adjacency_list) + + # Print some info @info g @show is_bidirected(g) @show has_self_loops(g) @@ -57,11 +59,14 @@ function train(; kws...) 
@show mean(degree(g)) isbidir = is_bidirected(g) + # Move to device g = g |> device X = data.node_features |> device - #### SPLIT INTO NEGATIVE AND POSITIVE SAMPLES - train_pos_g, test_pos_g = rand_edge_split(g, 0.9) + #### TRAIN/TEST splits + # With bidirected graph, we make sure that an edge and its reverse + # are in the same split + train_pos_g, test_pos_g = rand_edge_split(g, 0.9, bidirected=isbidir) test_neg_g = negative_sample(g, num_neg_edges=test_pos_g.num_edges, bidirected=isbidir) ### DEFINE MODEL ######### From 446aba55cbd838889549b70203d9a2537661acab Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Fri, 5 Nov 2021 10:51:11 +0100 Subject: [PATCH 2/3] wip on rand_split_edge --- src/GNNGraphs/transform.jl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl index a1b80d603..837bcd129 100644 --- a/src/GNNGraphs/transform.jl +++ b/src/GNNGraphs/transform.jl @@ -378,19 +378,24 @@ function negative_sample(g::GNNGraph; end """ - rand_edge_split(g::GNNGraph, frac) -> g1, g2 + rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g)) -> g1, g2 Randomly partition the edges in `g` to from two graphs, `g1` and `g2`. Both will have the same number of nodes as `g`. `g1` will contain a fraction `frac` of the original edges, while `g2` wil contain the rest. + +If `bidirected = true` makes sure that an edge and its reverse go into the same split. + Useful for train/test splits in link prediction tasks. 
""" -function rand_edge_split(g::GNNGraph, frac) - # TODO add bidirected version +function rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g)) s, t = edge_index(g) - eids = randperm(g.num_edges) - size1 = round(Int, g.num_edges * frac) + idx, idmax = edge_encoding(s, t, g.num_nodes, directed=!bidirected) + uidx = union(idx) # So that multi-edges (and reverse edges in the bidir case) go in the same split + nu = length(uidx) + eids = randperm(nu) + size1 = round(Int, nu * frac) s1, t1 = s[eids[1:size1]], t[eids[1:size1]] g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes) From a3cd35b0770e8d5b1136b7e62be38d906d3e1ba7 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 6 Nov 2021 10:19:57 +0100 Subject: [PATCH 3/3] rand edge split --- examples/link_prediction_pubmed.jl | 6 ++--- src/GNNGraphs/GNNGraphs.jl | 2 +- src/GNNGraphs/transform.jl | 37 ++++++++++++++++-------------- test/GNNGraphs/transform.jl | 31 +++++++++++++++++++++++++ 4 files changed, 54 insertions(+), 22 deletions(-) diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl index eb1533ab8..c27af951d 100644 --- a/examples/link_prediction_pubmed.jl +++ b/examples/link_prediction_pubmed.jl @@ -6,10 +6,9 @@ using Flux using Flux: onecold, onehotbatch using Flux.Losses: logitbinarycrossentropy using GraphNeuralNetworks -using MLDatasets: PubMed, Cora +using MLDatasets: PubMed using Statistics, Random, LinearAlgebra using CUDA -# using MLJBase: AreaUnderCurve CUDA.allowscalar(false) # arguments for the `train` function @@ -47,8 +46,7 @@ function train(; kws...) 
end ### LOAD DATA - data = Cora.dataset() - # data = PubMed.dataset() + data = PubMed.dataset() g = GNNGraph(data.adjacency_list) # Print some info diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl index b1bd5cecf..800abe627 100644 --- a/src/GNNGraphs/GNNGraphs.jl +++ b/src/GNNGraphs/GNNGraphs.jl @@ -4,7 +4,7 @@ using SparseArrays using Functors: @functor using CUDA import Graphs -using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree +using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree, has_self_loops import Flux using Flux: batch import NNlib diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl index 837bcd129..e6910760b 100644 --- a/src/GNNGraphs/transform.jl +++ b/src/GNNGraphs/transform.jl @@ -386,30 +386,33 @@ and `g2`. Both will have the same number of nodes as `g`. while `g2` wil contain the rest. If `bidirected = true` makes sure that an edge and its reverse go into the same split. +This option is supported only for bidirected graphs with no self-loops +and multi-edges. -Useful for train/test splits in link prediction tasks. +`rand_edge_split` is typically used to create train/test splits in link prediction tasks. """ function rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g)) s, t = edge_index(g) - idx, idmax = edge_encoding(s, t, g.num_nodes, directed=!bidirected) - uidx = union(idx) # So that multi-edges (and reverse edges in the bidir case) go in the same split - nu = length(uidx) - eids = randperm(nu) - size1 = round(Int, nu * frac) + ne = bidirected ? 
g.num_edges ÷ 2 : g.num_edges + eids = randperm(ne) + size1 = round(Int, ne * frac) - s1, t1 = s[eids[1:size1]], t[eids[1:size1]] - g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes) - - s, t = edge_index(g) - eids = randperm(g.num_edges) - size1 = round(Int, g.num_edges * frac) - - s1, t1 = s[eids[1:size1]], t[eids[1:size1]] + if !bidirected + s1, t1 = s[eids[1:size1]], t[eids[1:size1]] + s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]] + else + @assert is_bidirected(g) + @assert !has_self_loops(g) + @assert !has_multi_edges(g) + mask = s .< t + s, t = s[mask], t[mask] + s1, t1 = s[eids[1:size1]], t[eids[1:size1]] + s1, t1 = [s1; t1], [t1; s1] + s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]] + s2, t2 = [s2; t2], [t2; s2] + end g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes) - - s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]] g2 = GNNGraph(s2, t2, num_nodes=g.num_nodes) - return g1, g2 end diff --git a/test/GNNGraphs/transform.jl b/test/GNNGraphs/transform.jl index f9fc669ad..569e833ed 100644 --- a/test/GNNGraphs/transform.jl +++ b/test/GNNGraphs/transform.jl @@ -140,4 +140,35 @@ @test intersect(g, gneg).num_edges == 0 end end + + @testset "rand_edge_split" begin + if GRAPH_T == :coo + n, m = 100,300 + + g = rand_graph(n, m, bidirected=true, graph_type=GRAPH_T) + # check bidirected=is_bidirected(g) default + g1, g2 = rand_edge_split(g, 0.9) + @test is_bidirected(g1) + @test is_bidirected(g2) + @test intersect(g1, g2).num_edges == 0 + @test g1.num_edges + g2.num_edges == g.num_edges + @test g2.num_edges < 50 + + g = rand_graph(n, m, bidirected=false, graph_type=GRAPH_T) + # check bidirected=is_bidirected(g) default + g1, g2 = rand_edge_split(g, 0.9) + @test !is_bidirected(g1) + @test !is_bidirected(g2) + @test intersect(g1, g2).num_edges == 0 + @test g1.num_edges + g2.num_edges == g.num_edges + @test g2.num_edges < 50 + + g1, g2 = rand_edge_split(g, 0.9, bidirected=false) + @test !is_bidirected(g1) + @test !is_bidirected(g2) + @test intersect(g1, g2).num_edges 
== 0 + @test g1.num_edges + g2.num_edges == g.num_edges + @test g2.num_edges < 50 + end + end end