JuliaGraphs · CarloLucibello · Nov 6, 2021 · Nov 5, 2021 · Nov 5, 2021 · Nov 6, 2021
diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl
@@ -6,10 +6,9 @@ using Flux
 using Flux: onecold, onehotbatch
 using Flux.Losses: logitbinarycrossentropy
 using GraphNeuralNetworks
-using MLDatasets: PubMed, Cora
+using MLDatasets: PubMed
 using Statistics, Random, LinearAlgebra
 using CUDA
-# using MLJBase: AreaUnderCurve
 CUDA.allowscalar(false)
 
 # arguments for the `train` function 
@@ -28,7 +27,7 @@ struct DotPredictor end
 
 function (::DotPredictor)(g, x)
     z = apply_edges((xi, xj, e) -> sum(xi .* xj, dims=1), g, xi=x, xj=x)
-    # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with  buit-in methods
+    # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with built-in method
     return vec(z)
 end
 
@@ -47,21 +46,25 @@ function train(; kws...)
     end
 
     ### LOAD DATA
-    data = Cora.dataset()
-    # data = PubMed.dataset()
+    data = PubMed.dataset()
     g = GNNGraph(data.adjacency_list)
+
+    # Print some info
     @info g
     @show is_bidirected(g)
     @show has_self_loops(g)
     @show has_multi_edges(g)
     @show mean(degree(g))
     isbidir = is_bidirected(g)  
 
+    # Move to device
     g = g |> device
     X = data.node_features |> device
 
-    #### SPLIT INTO NEGATIVE AND POSITIVE SAMPLES
-    train_pos_g, test_pos_g = rand_edge_split(g, 0.9)
+    #### TRAIN/TEST splits
+    # With a bidirected graph, we make sure that an edge and its reverse
+    # are in the same split.
+    train_pos_g, test_pos_g = rand_edge_split(g, 0.9, bidirected=isbidir)
     test_neg_g = negative_sample(g, num_neg_edges=test_pos_g.num_edges, bidirected=isbidir)
 
     ### DEFINE MODEL #########

diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
@@ -4,7 +4,7 @@ using SparseArrays
 using Functors: @functor
 using CUDA 
 import Graphs
-using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree
+using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree, has_self_loops
 import Flux
 using Flux: batch
 import NNlib

diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl
@@ -378,33 +378,41 @@ function negative_sample(g::GNNGraph;
 end
 
 """
-    rand_edge_split(g::GNNGraph, frac) -> g1, g2
+    rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g)) -> g1, g2
 
 Randomly partition the edges in `g` to from two graphs, `g1`
 and `g2`. Both will have the same number of nodes as `g`.
 `g1` will contain a fraction `frac` of the original edges, 
 while `g2` wil contain the rest.
-Useful for train/test splits in link prediction tasks.
-"""
-function rand_edge_split(g::GNNGraph, frac)
-    # TODO add bidirected version
-    s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    size1 = round(Int, g.num_edges * frac)
-
-    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
-    g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
 
+If `bidirected = true`, an edge and its reverse are guaranteed to go into the same split.
+This option is supported only for bidirected graphs with no self-loops
+or multi-edges.
+
+`rand_edge_split` is typically used to create train/test splits in link prediction tasks.
+"""
+function rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g))
     s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    size1 = round(Int, g.num_edges * frac)
+    ne = bidirected ? g.num_edges ÷ 2 : g.num_edges
+    eids = randperm(ne)
+    size1 = round(Int, ne * frac)
 
-    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+    if !bidirected
+        s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+        s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+    else
+        @assert is_bidirected(g)
+        @assert !has_self_loops(g)
+        @assert !has_multi_edges(g)
+        mask = s .< t
+        s, t = s[mask], t[mask]
+        s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+        s1, t1 = [s1; t1], [t1; s1]
+        s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+        s2, t2 = [s2; t2], [t2; s2]
+    end
     g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
-
-    s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
     g2 = GNNGraph(s2, t2, num_nodes=g.num_nodes)
-
     return g1, g2
 end
 

diff --git a/test/GNNGraphs/transform.jl b/test/GNNGraphs/transform.jl
@@ -140,4 +140,35 @@
             @test intersect(g, gneg).num_edges == 0
         end
     end
+
+    @testset "rand_edge_split" begin
+        if GRAPH_T == :coo
+            n, m = 100,300
+
+            g = rand_graph(n, m, bidirected=true, graph_type=GRAPH_T)
+            # check bidirected=is_bidirected(g) default
+            g1, g2 = rand_edge_split(g, 0.9)
+            @test is_bidirected(g1)
+            @test is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            g = rand_graph(n, m, bidirected=false, graph_type=GRAPH_T)
+            # check bidirected=is_bidirected(g) default
+            g1, g2 = rand_edge_split(g, 0.9)
+            @test !is_bidirected(g1)
+            @test !is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            g1, g2 = rand_edge_split(g, 0.9, bidirected=false)
+            @test !is_bidirected(g1)
+            @test !is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+        end
+    end
 end