From f96f58e25e848066abbebaeb9d5f6d0f9034e676 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Fri, 5 Nov 2021 10:50:17 +0100
Subject: [PATCH 1/3] wip on rand_split_edge

---
 examples/link_prediction_pubmed.jl | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl
index 1fcf0a4a0..eb1533ab8 100644
--- a/examples/link_prediction_pubmed.jl
+++ b/examples/link_prediction_pubmed.jl
@@ -28,7 +28,7 @@ struct DotPredictor end
 
 function (::DotPredictor)(g, x)
     z = apply_edges((xi, xj, e) -> sum(xi .* xj, dims=1), g, xi=x, xj=x)
-    # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with  buit-in methods
+    # z = apply_edges(xi_dot_xj, g, xi=x, xj=x) # Same with built-in method
     return vec(z)
 end
 
@@ -50,6 +50,8 @@ function train(; kws...)
     data = Cora.dataset()
     # data = PubMed.dataset()
     g = GNNGraph(data.adjacency_list)
+
+    # Print some info
     @info g
     @show is_bidirected(g)
     @show has_self_loops(g)
@@ -57,11 +59,14 @@ function train(; kws...)
     @show mean(degree(g))
     isbidir = is_bidirected(g)  
 
+    # Move to device
     g = g |> device
     X = data.node_features |> device
     
-    #### SPLIT INTO NEGATIVE AND POSITIVE SAMPLES
-    train_pos_g, test_pos_g = rand_edge_split(g, 0.9)
+    #### TRAIN/TEST splits
+    # With a bidirected graph, we make sure that an edge and its reverse
+    # end up in the same split.
+    train_pos_g, test_pos_g = rand_edge_split(g, 0.9, bidirected=isbidir)
     test_neg_g = negative_sample(g, num_neg_edges=test_pos_g.num_edges, bidirected=isbidir)
 
     ### DEFINE MODEL #########

From 446aba55cbd838889549b70203d9a2537661acab Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Fri, 5 Nov 2021 10:51:11 +0100
Subject: [PATCH 2/3] wip on rand_split_edge

---
 src/GNNGraphs/transform.jl | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl
index a1b80d603..837bcd129 100644
--- a/src/GNNGraphs/transform.jl
+++ b/src/GNNGraphs/transform.jl
@@ -378,19 +378,24 @@ function negative_sample(g::GNNGraph;
 end
 
 """
-    rand_edge_split(g::GNNGraph, frac) -> g1, g2
+    rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g)) -> g1, g2
 
 Randomly partition the edges in `g` to from two graphs, `g1`
 and `g2`. Both will have the same number of nodes as `g`.
 `g1` will contain a fraction `frac` of the original edges, 
 while `g2` wil contain the rest.
+
+If `bidirected = true` makes sure that an edge and its reverse go into the same split.
+
 Useful for train/test splits in link prediction tasks.
 """
-function rand_edge_split(g::GNNGraph, frac)
-    # TODO add bidirected version
+function rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g))
     s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    size1 = round(Int, g.num_edges * frac)
+    idx, idmax = edge_encoding(s, t, g.num_nodes, directed=!bidirected)
+    uidx = union(idx) # So that multi-edges (and reverse edges in the bidir case) go in the same split
+    nu = length(uidx)
+    eids = randperm(nu)
+    size1 = round(Int, nu * frac)
     
     s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
     g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)

From a3cd35b0770e8d5b1136b7e62be38d906d3e1ba7 Mon Sep 17 00:00:00 2001
From: Carlo Lucibello <carlo.lucibello@gmail.com>
Date: Sat, 6 Nov 2021 10:19:57 +0100
Subject: [PATCH 3/3] rand edge split

---
 examples/link_prediction_pubmed.jl |  6 ++---
 src/GNNGraphs/GNNGraphs.jl         |  2 +-
 src/GNNGraphs/transform.jl         | 37 ++++++++++++++++--------------
 test/GNNGraphs/transform.jl        | 31 +++++++++++++++++++++++++
 4 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl
index eb1533ab8..c27af951d 100644
--- a/examples/link_prediction_pubmed.jl
+++ b/examples/link_prediction_pubmed.jl
@@ -6,10 +6,9 @@ using Flux
 using Flux: onecold, onehotbatch
 using Flux.Losses: logitbinarycrossentropy
 using GraphNeuralNetworks
-using MLDatasets: PubMed, Cora
+using MLDatasets: PubMed
 using Statistics, Random, LinearAlgebra
 using CUDA
-# using MLJBase: AreaUnderCurve
 CUDA.allowscalar(false)
 
 # arguments for the `train` function 
@@ -47,8 +46,7 @@ function train(; kws...)
     end
 
     ### LOAD DATA
-    data = Cora.dataset()
-    # data = PubMed.dataset()
+    data = PubMed.dataset()
     g = GNNGraph(data.adjacency_list)
 
     # Print some info
diff --git a/src/GNNGraphs/GNNGraphs.jl b/src/GNNGraphs/GNNGraphs.jl
index b1bd5cecf..800abe627 100644
--- a/src/GNNGraphs/GNNGraphs.jl
+++ b/src/GNNGraphs/GNNGraphs.jl
@@ -4,7 +4,7 @@ using SparseArrays
 using Functors: @functor
 using CUDA 
 import Graphs
-using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree
+using Graphs: AbstractGraph, outneighbors, inneighbors, adjacency_matrix, degree, has_self_loops
 import Flux
 using Flux: batch
 import NNlib
diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl
index 837bcd129..e6910760b 100644
--- a/src/GNNGraphs/transform.jl
+++ b/src/GNNGraphs/transform.jl
@@ -386,30 +386,33 @@ and `g2`. Both will have the same number of nodes as `g`.
 while `g2` wil contain the rest.
 
 If `bidirected = true` makes sure that an edge and its reverse go into the same split.
+This option is supported only for bidirected graphs with no self-loops
+and no multi-edges.
 
-Useful for train/test splits in link prediction tasks.
+`rand_edge_split` is typically used to create train/test splits in link prediction tasks.
 """
 function rand_edge_split(g::GNNGraph, frac; bidirected=is_bidirected(g))
     s, t = edge_index(g)
-    idx, idmax = edge_encoding(s, t, g.num_nodes, directed=!bidirected)
-    uidx = union(idx) # So that multi-edges (and reverse edges in the bidir case) go in the same split
-    nu = length(uidx)
-    eids = randperm(nu)
-    size1 = round(Int, nu * frac)
+    ne = bidirected ? g.num_edges ÷ 2 : g.num_edges
+    eids = randperm(ne)
+    size1 = round(Int, ne * frac)
     
-    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
-    g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
-
-    s, t = edge_index(g)
-    eids = randperm(g.num_edges)
-    size1 = round(Int, g.num_edges * frac)
-    
-    s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+    if !bidirected
+        s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+        s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+    else
+        @assert is_bidirected(g)
+        @assert !has_self_loops(g)
+        @assert !has_multi_edges(g)
+        mask = s .< t
+        s, t = s[mask], t[mask]
+        s1, t1 = s[eids[1:size1]], t[eids[1:size1]]
+        s1, t1 = [s1; t1], [t1; s1]
+        s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
+        s2, t2 = [s2; t2], [t2; s2]
+    end
     g1 = GNNGraph(s1, t1, num_nodes=g.num_nodes)
-
-    s2, t2 = s[eids[size1+1:end]], t[eids[size1+1:end]]
     g2 = GNNGraph(s2, t2, num_nodes=g.num_nodes)
-
     return g1, g2
 end
 
diff --git a/test/GNNGraphs/transform.jl b/test/GNNGraphs/transform.jl
index f9fc669ad..569e833ed 100644
--- a/test/GNNGraphs/transform.jl
+++ b/test/GNNGraphs/transform.jl
@@ -140,4 +140,35 @@
             @test intersect(g, gneg).num_edges == 0
         end
     end
+
+    @testset "rand_edge_split" begin
+        if GRAPH_T == :coo
+            n, m = 100,300
+
+            g = rand_graph(n, m, bidirected=true, graph_type=GRAPH_T)
+            # check bidirected=is_bidirected(g) default
+            g1, g2 = rand_edge_split(g, 0.9)
+            @test is_bidirected(g1)
+            @test is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            g = rand_graph(n, m, bidirected=false, graph_type=GRAPH_T)
+            # check bidirected=is_bidirected(g) default
+            g1, g2 = rand_edge_split(g, 0.9)
+            @test !is_bidirected(g1)
+            @test !is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+
+            g1, g2 = rand_edge_split(g, 0.9, bidirected=false)
+            @test !is_bidirected(g1)
+            @test !is_bidirected(g2)
+            @test intersect(g1, g2).num_edges == 0
+            @test g1.num_edges + g2.num_edges == g.num_edges
+            @test g2.num_edges < 50
+        end
+    end
 end