From 987fd039182186b618a4dc31432955bcaabe352e Mon Sep 17 00:00:00 2001
From: Matthew LeMay
Date: Mon, 25 Jul 2022 13:28:17 -0400
Subject: [PATCH] Replace deprecated Flux.ADAM with Flux.Adam (#203)

* replace deprecated Flux.ADAM with Flux.Adam

* update Flux compat to 0.13.4
---
 Project.toml                                     | 2 +-
 docs/src/index.md                                | 2 +-
 docs/src/tutorials/gnn_intro_pluto.jl            | 4 ++--
 docs/src/tutorials/graph_classification_pluto.jl | 2 +-
 examples/graph_classification_tudataset.jl       | 2 +-
 examples/link_prediction_pubmed.jl               | 2 +-
 examples/neural_ode_cora.jl                      | 2 +-
 examples/node_classification_cora.jl             | 2 +-
 perf/neural_ode_mnist.jl                         | 2 +-
 perf/node_classification_cora_geometricflux.jl   | 2 +-
 test/examples/node_classification_cora.jl        | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Project.toml b/Project.toml
index 91ecb24f2..60543ee10 100644
--- a/Project.toml
+++ b/Project.toml
@@ -29,7 +29,7 @@ Adapt = "3"
 CUDA = "3.3"
 ChainRulesCore = "1"
 DataStructures = "0.18"
-Flux = "0.13"
+Flux = "0.13.4"
 Functors = "0.2, 0.3"
 Graphs = "1.4"
 KrylovKit = "0.5"

diff --git a/docs/src/index.md b/docs/src/index.md
index 3da002eba..5a7d96e71 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -54,7 +54,7 @@ model = GNNChain(GCNConv(16 => 64),
                  Dense(64, 1)) |> device
 
 ps = Flux.params(model)
-opt = ADAM(1f-4)
+opt = Adam(1f-4)
 ```
 
 ### Training

diff --git a/docs/src/tutorials/gnn_intro_pluto.jl b/docs/src/tutorials/gnn_intro_pluto.jl
index ce81d1f78..6b903deac 100644
--- a/docs/src/tutorials/gnn_intro_pluto.jl
+++ b/docs/src/tutorials/gnn_intro_pluto.jl
@@ -266,7 +266,7 @@ Since everything in our model is differentiable and parameterized, we can add so
 Here, we make use of a semi-supervised or transductive learning procedure: We simply train against one node per class, but are allowed to make use of the complete input graph data.
 
 Training our model is very similar to any other Flux model.
-In addition to defining our network architecture, we define a loss criterion (here, `logitcrossentropy` and initialize a stochastic gradient optimizer (here, `ADAM`).
+In addition to defining our network architecture, we define a loss criterion (here, `logitcrossentropy` and initialize a stochastic gradient optimizer (here, `Adam`).
 After that, we perform multiple rounds of optimization, where each round consists of a forward and backward pass to compute the gradients of our model parameters w.r.t. to the loss derived from the forward pass.
 
 If you are not new to Flux, this scheme should appear familar to you.
@@ -285,7 +285,7 @@ Let us now start training and see how our node embeddings evolve over time (best
 begin
     model = GCN(num_features, num_classes)
     ps = Flux.params(model)
-    opt = ADAM(1e-2)
+    opt = Adam(1e-2)
     epochs = 2000
 
     emb = h

diff --git a/docs/src/tutorials/graph_classification_pluto.jl b/docs/src/tutorials/graph_classification_pluto.jl
index a54a19372..ed80e2810 100644
--- a/docs/src/tutorials/graph_classification_pluto.jl
+++ b/docs/src/tutorials/graph_classification_pluto.jl
@@ -202,7 +202,7 @@ function train!(model; epochs=200, η=1e-2, infotime=10)
     device = Flux.cpu
     model = model |> device
     ps = Flux.params(model)
-    opt = ADAM(1e-3)
+    opt = Adam(1e-3)
 
 
     function report(epoch)

diff --git a/examples/graph_classification_tudataset.jl b/examples/graph_classification_tudataset.jl
index a724165c4..c2c5c68ce 100644
--- a/examples/graph_classification_tudataset.jl
+++ b/examples/graph_classification_tudataset.jl
@@ -82,7 +82,7 @@ function train(; kws...)
                   Dense(nhidden, 1)) |> device
 
     ps = Flux.params(model)
-    opt = ADAM(args.η)
+    opt = Adam(args.η)
 
     # LOGGING FUNCTION

diff --git a/examples/link_prediction_pubmed.jl b/examples/link_prediction_pubmed.jl
index d7c5d88ea..5693c651b 100644
--- a/examples/link_prediction_pubmed.jl
+++ b/examples/link_prediction_pubmed.jl
@@ -77,7 +77,7 @@ function train(; kws...)
     pred = DotPredictor()
 
     ps = Flux.params(model)
-    opt = ADAM(args.η)
+    opt = Adam(args.η)
 
     ### LOSS FUNCTION ############

diff --git a/examples/neural_ode_cora.jl b/examples/neural_ode_cora.jl
index f5a340fc6..f90c9e4b9 100644
--- a/examples/neural_ode_cora.jl
+++ b/examples/neural_ode_cora.jl
@@ -48,7 +48,7 @@ model = GNNChain(GCNConv(nin => nhidden, relu),
 ps = Flux.params(model);
 
 # ## Optimizer
-opt = ADAM(0.01)
+opt = Adam(0.01)
 
 function eval_loss_accuracy(X, y, mask)

diff --git a/examples/node_classification_cora.jl b/examples/node_classification_cora.jl
index 3746b02ba..185f40a05 100644
--- a/examples/node_classification_cora.jl
+++ b/examples/node_classification_cora.jl
@@ -57,7 +57,7 @@ function train(; kws...)
                   Dense(nhidden, nout)) |> device
 
     ps = Flux.params(model)
-    opt = ADAM(args.η)
+    opt = Adam(args.η)
 
     display(g)

diff --git a/perf/neural_ode_mnist.jl b/perf/neural_ode_mnist.jl
index 0ddc8cded..6c41f2420 100644
--- a/perf/neural_ode_mnist.jl
+++ b/perf/neural_ode_mnist.jl
@@ -40,7 +40,7 @@ model = Chain(Flux.flatten,
 ps = Flux.params(model);
 
 # ## Optimizer
-opt = ADAM(0.01)
+opt = Adam(0.01)
 
 function eval_loss_accuracy(X, y)
     ŷ = model(X)

diff --git a/perf/node_classification_cora_geometricflux.jl b/perf/node_classification_cora_geometricflux.jl
index 4188bb579..e3d39e9d5 100644
--- a/perf/node_classification_cora_geometricflux.jl
+++ b/perf/node_classification_cora_geometricflux.jl
@@ -59,7 +59,7 @@ function train(; kws...)
                   Dense(nhidden, nout)) |> device
 
     ps = Flux.params(model)
-    opt = ADAM(args.η)
+    opt = Adam(args.η)
 
     @info g

diff --git a/test/examples/node_classification_cora.jl b/test/examples/node_classification_cora.jl
index 7056e3d21..0334c8ef5 100644
--- a/test/examples/node_classification_cora.jl
+++ b/test/examples/node_classification_cora.jl
@@ -53,7 +53,7 @@ function train(Layer; verbose=false, kws...)
                   Dense(nhidden, nout)) |> device
 
     ps = Flux.params(model)
-    opt = ADAM(args.η)
+    opt = Adam(args.η)
 
     ## TRAINING
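
For downstream users the rename is mechanical: `Adam` takes the same constructor arguments as the deprecated `ADAM`. A minimal sketch of updating a training script, assuming Flux >= 0.13.4 and the implicit-parameters API used throughout these examples (the model, data, and learning rate below are illustrative only):

```julia
using Flux

# Old spelling, deprecated as of Flux 0.13.4:
# opt = ADAM(1e-3)

# New spelling; constructor arguments are unchanged.
opt = Adam(1e-3)

# The surrounding training loop stays exactly as before.
model = Dense(16 => 1)
ps = Flux.params(model)
loss(x, y) = Flux.Losses.mse(model(x), y)
data = [(rand(Float32, 16, 8), rand(Float32, 1, 8))]
Flux.train!(loss, ps, data, opt)
```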