Commit

support for MLDatasets v0.6 (#164)
* mldatasets new release

* cleanup

* neuralode example

* docs

* fix node test

* cleanup

* cleanup

* julia 1.6 compat

* more tests

* cleanup
CarloLucibello authored May 7, 2022
1 parent eb43bc9 commit 57d84ef
Showing 16 changed files with 217 additions and 89 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -13,6 +13,7 @@ Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
9 changes: 7 additions & 2 deletions docs/src/datasets.md
@@ -1,4 +1,9 @@
# Datasets

GraphNeuralNetworks.jl doesn't come with its own datasets, but leverages those available in the Julia (and non-Julia) ecosystem. In particular, the [examples in the GraphNeuralNetworks.jl repository](https://github.com/CarloLucibello/GraphNeuralNetworks.jl/tree/master/examples) make use of the [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) package. There you will find common graph datasets such as Cora, PubMed, and Citeseer.
Also MLDatasets gives access to the [TUDataset](https://chrsmrrs.github.io/datasets/docs/datasets/) repository and its numerous datasets.
GraphNeuralNetworks.jl doesn't come with its own datasets, but leverages those available in the Julia (and non-Julia) ecosystem. In particular, the [examples in the GraphNeuralNetworks.jl repository](https://github.com/CarloLucibello/GraphNeuralNetworks.jl/tree/master/examples) make use of the [MLDatasets.jl](https://github.com/JuliaML/MLDatasets.jl) package. There you will find common graph datasets such as Cora, PubMed, Citeseer, TUDataset, and [many others](https://juliaml.github.io/MLDatasets.jl/dev/datasets/graphs/).

GraphNeuralNetworks.jl provides the [`mldataset2gnngraph`](@ref) method for interfacing with MLDatasets.jl.

```@docs
mldataset2gnngraph
```
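For context, here is a minimal usage sketch of the converter documented above. It mirrors the pattern used by the updated examples in this commit; `Cora` and the `ndata` field names are taken from those examples, and MLDatasets v0.6 is assumed.

```julia
# Minimal sketch: load an MLDatasets.jl dataset and convert it to a GNNGraph.
using GraphNeuralNetworks
using MLDatasets: Cora

dataset = Cora()                    # single-graph citation dataset
g = mldataset2gnngraph(dataset)     # returns a GNNGraph with node data attached

X = g.ndata.features                # node feature matrix
y = g.ndata.targets                 # integer class labels
(; train_mask, val_mask, test_mask) = g.ndata   # standard split masks
```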
11 changes: 7 additions & 4 deletions examples/Project.toml
@@ -3,13 +3,16 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DiffEqFlux = "aae7a2af-3d4f-5e19-a356-7da93b79d9d0"
DifferentialEquations = "0c46a032-eb83-5123-abaf-570d42b7fbaa"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
GeometricFlux = "7e08b658-56d3-11e9-2997-919d5b31e4ea"
GraphNeuralNetworks = "cffab07f-9bc2-4db1-8861-388f63bf7694"
GraphSignals = "3ebe565e-a4b5-49c6-aed2-300248c3a9c1"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"

[extras]
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
[compat]
DiffEqFlux = "1.45"
Flux = "0.13"
Graphs = "1"
GraphNeuralNetworks = "0.4"
MLDatasets = "0.6"
julia = "1.7"
49 changes: 21 additions & 28 deletions examples/graph_classification_tudataset.jl
@@ -7,17 +7,18 @@ using Flux.Data: DataLoader
using GraphNeuralNetworks
using MLDatasets: TUDataset
using Statistics, Random
using MLUtils
using CUDA
CUDA.allowscalar(false)

function eval_loss_accuracy(model, data_loader, device)
loss = 0.
acc = 0.
ntot = 0
for g in data_loader
g = g |> device
n = g.num_graphs
y = g.gdata.y
for (graphs, y) in data_loader
g = Flux.batch(graphs) |> device
y = y |> device
n = length(y)
ŷ = model(g, g.ndata.x) |> vec
loss += logitbinarycrossentropy(ŷ, y) * n
acc += mean((ŷ .> 0) .== y) * n
@@ -28,26 +29,19 @@ end

function getdataset()
tudata = TUDataset("MUTAG")

x = Array{Float32}(onehotbatch(tudata.node_labels, 0:6))
y = (1 .+ Array{Float32}(tudata.graph_labels)) ./ 2
display(tudata)
graphs = mldataset2gnngraph(tudata)
oh(x) = Float32.(onehotbatch(x, 0:6))
graphs = [GNNGraph(g, ndata=oh(g.ndata.targets)) for g in graphs]
y = (1 .+ Float32.(tudata.graph_data.targets)) ./ 2
@assert all(∈([0,1]), y) # binary classification

## The dataset also has edge features but we won't be using them
# e = Array{Float32}(onehotbatch(data.edge_labels, sort(unique(data.edge_labels))))

gall = GNNGraph(tudata.source, tudata.target,
num_nodes=tudata.num_nodes,
graph_indicator=tudata.graph_indicator,
ndata=(; x), gdata=(; y))

return [getgraph(gall, i) for i=1:gall.num_graphs]
return graphs, y
end

# arguments for the `train` function
Base.@kwdef mutable struct Args
η = 1f-3 # learning rate
batchsize = 64 # batch size (number of graphs in each batch)
batchsize = 32 # batch size (number of graphs in each batch)
epochs = 200 # number of epochs
seed = 17 # set seed > 0 for reproducibility
usecuda = true # if true use cuda (if available)
@@ -71,19 +65,18 @@ function train(; kws...)
# LOAD DATA
NUM_TRAIN = 150

data = getdataset()
shuffle!(data)
dataset = getdataset()
train_data, test_data = splitobs(dataset, at=NUM_TRAIN/numobs(dataset), shuffle=true)

train_loader = DataLoader(data[1:NUM_TRAIN], batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader(data[NUM_TRAIN+1:end], batchsize=args.batchsize, shuffle=false)
train_loader = DataLoader(train_data, batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader(test_data, batchsize=args.batchsize, shuffle=false)

# DEFINE MODEL

nin = size(data[1].ndata.x, 1)
nin = size(dataset[1][1].ndata.x, 1)
nhidden = args.nhidden

model = GNNChain(GraphConv(nin => nhidden, relu),
Dropout(0.5),
GraphConv(nhidden => nhidden, relu),
GlobalPool(mean),
Dense(nhidden, 1)) |> device
@@ -103,15 +96,15 @@

report(0)
for epoch in 1:args.epochs
for g in train_loader
g = g |> device
for (graphs, y) in train_loader
g = Flux.batch(graphs) |> device
y = y |> device
gs = Flux.gradient(ps) do
ŷ = model(g, g.ndata.x) |> vec
logitbinarycrossentropy(ŷ, g.gdata.y)
logitbinarycrossentropy(ŷ, y)
end
Flux.Optimise.update!(opt, ps, gs)
end

epoch % args.infotime == 0 && report(epoch)
end
end
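For reference, a minimal sketch of the data pipeline this example now uses: `getdataset` returns a `(graphs, labels)` tuple, `splitobs` partitions it, and each mini-batch of graphs is merged into a single `GNNGraph` with `Flux.batch`. The `model` call is elided and the split ratio is arbitrary; this is a sketch of the pattern, not a drop-in script.

```julia
# Sketch of the batching pattern above; `getdataset` is defined in this file.
using Flux, GraphNeuralNetworks, MLUtils
using Flux.Data: DataLoader

graphs, y = getdataset()                      # Vector{GNNGraph}, Vector{Float32}
train_data, test_data = splitobs((graphs, y), at=0.8, shuffle=true)
train_loader = DataLoader(train_data, batchsize=32, shuffle=true)

for (gs, ys) in train_loader
    g = Flux.batch(gs)    # one GNNGraph whose components are the graphs in gs
    # ŷ = model(g, g.ndata.x) |> vec
    # loss = logitbinarycrossentropy(ŷ, ys)
end
```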
11 changes: 5 additions & 6 deletions examples/link_prediction_pubmed.jl
@@ -46,11 +46,10 @@ function train(; kws...)
end

### LOAD DATA
data = PubMed.dataset()
g = GNNGraph(data.adjacency_list)

g = mldataset2gnngraph(PubMed())

# Print some info
@info g
display(g)
@show is_bidirected(g)
@show has_self_loops(g)
@show has_multi_edges(g)
@@ -59,7 +58,7 @@ function train(; kws...)

# Move to device
g = g |> device
X = data.node_features |> device
X = g.ndata.features

#### TRAIN/TEST splits
# With bidirected graph, we make sure that an edge and its reverse
@@ -117,4 +116,4 @@
end
end

# train()
train()
39 changes: 22 additions & 17 deletions examples/neural_ode_cora.jl
@@ -2,6 +2,7 @@
using GraphNeuralNetworks, DiffEqFlux, DifferentialEquations
using Flux: onehotbatch, onecold
using Flux.Losses: logitcrossentropy
using Flux
using Statistics: mean
using MLDatasets: Cora
using CUDA
@@ -11,27 +12,24 @@ using CUDA
device = CUDA.functional() ? gpu : cpu

# LOAD DATA
data = Cora.dataset()
g = GNNGraph(data.adjacency_list) |> device
X = data.node_features |> device
y = onehotbatch(data.node_labels, 1:data.num_classes) |> device
train_ids = data.train_indices |> device
val_ids = data.val_indices |> device
test_ids = data.test_indices |> device
ytrain = y[:, train_ids]
dataset = Cora()
classes = dataset.metadata["classes"]
g = mldataset2gnngraph(dataset) |> device
X = g.ndata.features
y = onehotbatch(g.ndata.targets |> cpu, classes) |> device # remove when https://github.com/FluxML/Flux.jl/pull/1959 tagged
(; train_mask, val_mask, test_mask) = g.ndata
ytrain = y[:,train_mask]


# Model and Data Configuration
nin = size(X, 1)
nhidden = 16
nout = data.num_classes
nout = length(classes)
epochs = 40

# Define the Neural GDE
diffeqsol_to_array(x) = reshape(device(x), size(x)[1:2])

# GCNConv(nhidden => nhidden, graph=g),

node_chain = GNNChain(GCNConv(nhidden => nhidden, relu),
GCNConv(nhidden => nhidden, relu)) |> device

@@ -40,14 +38,10 @@ node = NeuralODE(WithGraph(node_chain, g),
reltol = 1e-3, abstol = 1e-3, save_start = false) |> device

model = GNNChain(GCNConv(nin => nhidden, relu),
Dropout(0.5),
node,
diffeqsol_to_array,
Dense(nhidden, nout)) |> device

# Loss
loss(x, y) = logitcrossentropy(model(g, x), y)
accuracy(x, y) = mean(onecold(model(g, x)) .== onecold(y))

# # Training
# ## Model Parameters
@@ -56,9 +50,20 @@ ps = Flux.params(model);
# ## Optimizer
opt = ADAM(0.01)


function eval_loss_accuracy(X, y, mask)
ŷ = model(g, X)
l = logitcrossentropy(ŷ[:,mask], y[:,mask])
acc = mean(onecold(ŷ[:,mask]) .== onecold(y[:,mask]))
return (loss = round(l, digits=4), acc = round(acc*100, digits=2))
end

# ## Training Loop
for epoch in 1:epochs
gs = gradient(() -> loss(X, y), ps)
gs = gradient(ps) do
ŷ = model(g, X)
logitcrossentropy(ŷ[:,train_mask], ytrain)
end
Flux.Optimise.update!(opt, ps, gs)
@show(accuracy(X, y))
@show eval_loss_accuracy(X, y, train_mask)
end
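A note on the `WithGraph` wrapper used with `NeuralODE` above: it closes over a fixed graph so that the two-argument call `model(g, x)` becomes the one-argument callable the ODE solver expects. A short sketch of the equivalence, assuming `g`, `X`, and `node_chain` as defined in this example:

```julia
# WithGraph(model, g)(x) computes model(g, x); NeuralODE needs a plain
# x -> f(x) callable for its vector field, hence the wrapper.
f = WithGraph(node_chain, g)
h = f(X)    # same result as node_chain(g, X)
```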
33 changes: 16 additions & 17 deletions examples/node_classification_cora.jl
@@ -9,10 +9,10 @@ using Statistics, Random
using CUDA
CUDA.allowscalar(false)

function eval_loss_accuracy(X, y, ids, model, g)
function eval_loss_accuracy(X, y, mask, model, g)
ŷ = model(g, X)
l = logitcrossentropy(ŷ[:,ids], y[:,ids])
acc = mean(onecold(ŷ[:,ids]) .== onecold(y[:,ids]))
l = logitcrossentropy(ŷ[:,mask], y[:,mask])
acc = mean(onecold(ŷ[:,mask]) .== onecold(y[:,mask]))
return (loss = round(l, digits=4), acc = round(acc*100, digits=2))
end

Expand Down Expand Up @@ -41,32 +41,30 @@ function train(; kws...)
end

# LOAD DATA
data = Cora.dataset()
g = GNNGraph(data.adjacency_list) |> device
X = data.node_features |> device
y = onehotbatch(data.node_labels, 1:data.num_classes) |> device
train_ids = data.train_indices |> device
val_ids = data.val_indices |> device
test_ids = data.test_indices |> device
ytrain = y[:,train_ids]
dataset = Cora()
classes = dataset.metadata["classes"]
g = mldataset2gnngraph(dataset) |> device
X = g.ndata.features
y = onehotbatch(g.ndata.targets |> cpu, classes) |> device # remove when https://github.com/FluxML/Flux.jl/pull/1959 tagged
(; train_mask, val_mask, test_mask) = g.ndata
ytrain = y[:,train_mask]

nin, nhidden, nout = size(X,1), args.nhidden, data.num_classes
nin, nhidden, nout = size(X,1), args.nhidden, length(classes)

## DEFINE MODEL
model = GNNChain(GCNConv(nin => nhidden, relu),
Dropout(0.5),
GCNConv(nhidden => nhidden, relu),
Dense(nhidden, nout)) |> device

ps = Flux.params(model)
opt = ADAM(args.η)

@info g
display(g)

## LOGGING FUNCTION
function report(epoch)
train = eval_loss_accuracy(X, y, train_ids, model, g)
test = eval_loss_accuracy(X, y, test_ids, model, g)
train = eval_loss_accuracy(X, y, train_mask, model, g)
test = eval_loss_accuracy(X, y, test_mask, model, g)
println("Epoch: $epoch Train: $(train) Test: $(test)")
end

@@ -75,7 +73,7 @@ function train(; kws...)
for epoch in 1:args.epochs
gs = Flux.gradient(ps) do
ŷ = model(g, X)
logitcrossentropy(ŷ[:,train_ids], ytrain)
logitcrossentropy(ŷ[:,train_mask], ytrain)
end

Flux.Optimise.update!(opt, ps, gs)
Expand All @@ -85,3 +83,4 @@ function train(; kws...)
end

train()

60 changes: 60 additions & 0 deletions perf/neural_ode_mnist.jl
@@ -0,0 +1,60 @@
# Load the packages
using GraphNeuralNetworks, DiffEqFlux, DifferentialEquations
using Flux: onehotbatch, onecold
using Flux.Losses: logitcrossentropy
using Flux
using Statistics: mean
using MLDatasets
using CUDA
# CUDA.allowscalar(false) # Some scalar indexing is still done by DiffEqFlux

# device = cpu # `gpu` not working yet
device = CUDA.functional() ? gpu : cpu

# LOAD DATA
X, y = MNIST(:train)[:]
y = onehotbatch(y, 0:9)


# Define the Neural GDE
diffeqsol_to_array(x) = reshape(device(x), size(x)[1:2])

nin, nhidden, nout = 28*28, 100, 10
epochs = 10

node_chain = Chain(Dense(nhidden => nhidden, tanh),
Dense(nhidden => nhidden)) |> device

node = NeuralODE(node_chain,
(0.f0, 1.f0), Tsit5(), save_everystep=false,
reltol=1e-3, abstol=1e-3, save_start=false) |> device

model = Chain(Flux.flatten,
Dense(nin => nhidden, relu),
node,
diffeqsol_to_array,
Dense(nhidden, nout)) |> device

# # Training
# ## Model Parameters
ps = Flux.params(model);

# ## Optimizer
opt = ADAM(0.01)

function eval_loss_accuracy(X, y)
ŷ = model(X)
l = logitcrossentropy(ŷ, y)
acc = mean(onecold(ŷ) .== onecold(y))
return (loss = round(l, digits=4), acc = round(acc*100, digits=2))
end

# ## Training Loop
for epoch in 1:epochs
gs = gradient(ps) do
ŷ = model(X)
logitcrossentropy(ŷ, y)
end
Flux.Optimise.update!(opt, ps, gs)
@show eval_loss_accuracy(X, y)
end
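The new script trains full-batch on the entire MNIST training split. A mini-batched variant is a small change with Flux's `DataLoader`; the sketch below reuses the definitions above, with an arbitrary batch size, and is untested against this exact setup.

```julia
# Hypothetical mini-batched variant of the training loop above.
using Flux.Data: DataLoader

loader = DataLoader((X, y), batchsize=128, shuffle=true)
for epoch in 1:epochs
    for (x, yb) in loader
        x, yb = x |> device, yb |> device   # move the batch to the device
        gs = gradient(() -> logitcrossentropy(model(x), yb), ps)
        Flux.Optimise.update!(opt, ps, gs)
    end
    @show eval_loss_accuracy(X, y)
end
```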