diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl
index 1fcf0a4a0..c27af951d 100644
--- a/examples/link_prediction_pubmed.jl
+++ b/examples/link_prediction_pubmed.jl
@@ -6,10 +6,9 @@ using Flux
 using Flux: onecold, onehotbatch
 using Flux.Losses: logitbinarycrossentropy
 using GraphNeuralNetworks
-using MLDatasets: PubMed, Cora
+using MLDatasets: PubMed
 using Statistics, Random, LinearAlgebra
 using CUDA
-# using MLJBase: AreaUnderCurve
 CUDA.allowscalar(false)
 
 # arguments for the `train` function
@@ -28,7 +27,7 @@ struct DotPredictor end
 
 function (::DotPredictor)(g, x)
     z = apply_edges((xi, xj, e) -> sum(xi .* xj, dims=1), g, xi=x, xj=x)
-    # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with buit-in methods
+    # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with built-in method
     return vec(z)
 end
 
@@ -47,9 +46,10 @@ function train(; kws...)
     end
 
     ### LOAD DATA
-    data = Cora.dataset()
-    # data = PubMed.dataset()
+    data = PubMed.dataset()
     g = GNNGraph(data.adjacency_list)
+
+    # Print some info
     @info g
     @show is_bidirected(g)
     @show has_self_loops(g)
@@ -57,11 +57,14 @@ function train(; kws...)
     @show mean(degree(g))
     isbidir = is_bidirected(g)
 
+    # Move to device
     g = g |> device
     X = data.node_features |> device
 
-    #### SPLIT INTO NEGATIVE AND POSITIVE SAMPLES
-    train_pos_g, test_pos_g = rand_edge_split(g, 0.9)
+    #### TRAIN/TEST splits
+    # With bidirected graph, we make sure that an edge and its reverse
+    # are in the same split
+    train_pos_g, test_pos_g = rand_edge_split(g, 0.9, bidirected=isbidir)
     test_neg_g = negative_sample(g, num_neg_edges=test_pos_g.num_edges, bidirected=isbidir)
 
     ### DEFINE MODEL #########
diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
index b1bd5cecf..800abe627 100644
--- a/src/GNNGraphs/GNNGraphs.jl
+++ b/src/GNNGraphs/GNNGraphs.jl
@@ -4,7 +4,7 @@ using SparseArrays
 using Functors: @functor
 using CUDA
 import Graphs
-using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree
+using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree, has_self_loops
 import Flux
 using Flux: batch
 import NNlib
diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl
index a1b80d603..e6910760b 100644
--- a/src/GNNGraphs/transform.jl
+++ b/src/GNNGraphs/transform.jl
@@ -378,33 +378,43 @@ function negative_sample(g::GNNGraph;
 end
 
 """
-    rand_edge_split(g::GNNGraph, frac) -> g1, g2
+    rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g)) -> g1, g2
 
-Randomly partition the edges in `g` to from two graphs, `g1`
+Randomly partition the edges in `g` to form two graphs, `g1`
 and `g2`. Both will have the same number of nodes as `g`.
 `g1` will contain a fraction `frac` of the original edges,
-while `g2` wil contain the rest.
+while `g2` will contain the rest.
 
-Useful for train/test splits in link prediction tasks.
-"""
-function rand_edge_split(g::GNNGraph, frac)
-    # TODO add bidirected version
-    s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    size1 = round(Int, g.num_edges * frac)
-
-    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
-    g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
+If `bidirected = true`, makes sure that an edge and its reverse go into the same split.
+This option is supported only for bidirected graphs with no self-loops
+and no multi-edges.
+
+`rand_edge_split` is typically used to create train/test splits in link prediction tasks.
+"""
+function rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g))
     s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    size1 = round(Int, g.num_edges * frac)
+    ne = bidirected ? g.num_edges ÷ 2 : g.num_edges
+    eids = randperm(ne)
+    size1 = round(Int, ne * frac)
 
-    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+    if !bidirected
+        s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+        s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+    else
+        is_bidirected(g) || throw(ArgumentError("`bidirected=true` requires a bidirected graph"))
+        has_self_loops(g) && throw(ArgumentError("`bidirected=true` does not support self-loops"))
+        has_multi_edges(g) && throw(ArgumentError("`bidirected=true` does not support multi-edges"))
+        # Keep one representative (s < t) of each edge/reverse pair, split the
+        # representatives, then add the reverse of each one back to its own split.
+        mask = s .< t
+        s, t = s[mask], t[mask]
+        s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+        s1, t1 = [s1; t1], [t1; s1]
+        s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+        s2, t2 = [s2; t2], [t2; s2]
+    end
     g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
-
-    s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
     g2 = GNNGraph(s2, t2, num_nodes=g.num_nodes)
-
     return g1, g2
 end
 
diff --git a/test/GNNGraphs/transform.jl b/test/GNNGraphs/transform.jl
index f9fc669ad..569e833ed 100644
--- a/test/GNNGraphs/transform.jl
+++ b/test/GNNGraphs/transform.jl
@@ -140,4 +140,38 @@
             @test intersect(g, gneg).num_edges == 0
         end
     end
+
+    @testset "rand_edge_split" begin
+        if GRAPH_T == :coo
+            n, m = 100,300
+
+            g = rand_graph(n, m, bidirected=true, graph_type=GRAPH_T)
+            # check bidirected=is_bidirected(g) default
+            g1, g2 = rand_edge_split(g, 0.9)
+            @test is_bidirected(g1)
+            @test is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            g = rand_graph(n, m, bidirected=false, graph_type=GRAPH_T)
+            # check bidirected=is_bidirected(g) default
+            g1, g2 = rand_edge_split(g, 0.9)
+            @test !is_bidirected(g1)
+            @test !is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            g1, g2 = rand_edge_split(g, 0.9, bidirected=false)
+            @test !is_bidirected(g1)
+            @test !is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            # `bidirected=true` is rejected on a graph that is not bidirected
+            @test_throws ArgumentError rand_edge_split(g, 0.9, bidirected=true)
+        end
+    end
 end