diff --git a/docs/src/api/conv.md b/docs/src/api/conv.md
index c6b2f533b..c3151ac74 100644
--- a/docs/src/api/conv.md
+++ b/docs/src/api/conv.md
@@ -4,9 +4,8 @@ CurrentModule = GraphNeuralNetworks

 # Convolutional Layers

-Many different types of graphs convolutional layers have been proposed in the literature.
-Choosing the right layer for your application can be a matter of trial and error.
-Some of the most commonly used layers are the [`GCNConv`](@ref) and the [`GATv2Conv`](@ref) layers. Multiple graph convolutional layers are stacked to create a graph neural network model
+Many different types of graph convolutional layers have been proposed in the literature. Choosing the right layer for your application can involve a lot of exploration.
+Some of the most commonly used layers are the [`GCNConv`](@ref) and the [`GATv2Conv`](@ref). Multiple graph convolutional layers are typically stacked together to create a graph neural network model
 (see [`GNNChain`](@ref)).

 The table below lists all graph convolutional layers implemented in the *GraphNeuralNetworks.jl*. It also highlights the presence of some additional capabilities with respect to basic message passing:
diff --git a/src/GNNGraphs/transform.jl b/src/GNNGraphs/transform.jl
index 626dac47d..5792055a7 100644
--- a/src/GNNGraphs/transform.jl
+++ b/src/GNNGraphs/transform.jl
@@ -38,7 +38,14 @@ function add_self_loops(g::GNNGraph{<:ADJMAT_T})
             g.ndata, g.edata, g.gdata)
 end

+"""
+    remove_self_loops(g::GNNGraph)
+
+Return a graph constructed from `g` where self-loops (edges from a node to itself)
+are removed.
+
+See also [`add_self_loops`](@ref) and [`remove_multi_edges`](@ref).
+"""
 function remove_self_loops(g::GNNGraph{<:COO_T})
     s, t = edge_index(g)
     w = get_edge_weight(g)
diff --git a/src/layers/conv.jl b/src/layers/conv.jl
index 6d7acab24..bb3690372 100644
--- a/src/layers/conv.jl
+++ b/src/layers/conv.jl
@@ -269,7 +269,7 @@ with ``z_i`` a normalization factor.
 In case `ein > 0` is given, edge features of dimension `ein` will be expected in the forward pass
 and the attention coefficients will be calculated as
 ```
-\alpha_{ij} = \frac{1}{z_i} \exp(\mathbf{a}^T LeakyReLU([W_3 \mathbf{e}_{j\to i}; W_2 \mathbf{x}_i; W_1 \mathbf{x}_j]))
+\alpha_{ij} = \frac{1}{z_i} \exp(LeakyReLU(\mathbf{a}^T [W_e \mathbf{e}_{j\to i}; W \mathbf{x}_i; W \mathbf{x}_j]))
 ````

 # Arguments
@@ -389,9 +389,9 @@ with ``z_i`` a normalization factor.
 In case `ein > 0` is given, edge features of dimension `ein` will be expected in the forward pass
 and the attention coefficients will be calculated as
+```math
+\alpha_{ij} = \frac{1}{z_i} \exp(\mathbf{a}^T LeakyReLU([W_3 \mathbf{e}_{j\to i}; W_2 \mathbf{x}_i; W_1 \mathbf{x}_j])).
 ```
-\alpha_{ij} = \frac{1}{z_i} \exp(\mathbf{a}^T LeakyReLU([W_3 \mathbf{e}_{j\to i}; W_2 \mathbf{x}_i; W_1 \mathbf{x}_j]))
-````

 # Arguments
@@ -420,7 +420,7 @@ struct GATv2Conv{T, A1, A2, A3, B, C<:AbstractMatrix} <: GNNLayer
 end

 @functor GATv2Conv

-Flux.trainable(l::GATv2Conv) = (l.dense_i, l.dense_j, l.dense_j, l.bias, l.a)
+Flux.trainable(l::GATv2Conv) = (l.dense_i, l.dense_j, l.dense_e, l.bias, l.a)

 GATv2Conv(ch::Pair{Int,Int}, args...; kws...) = GATv2Conv((ch[1], 0) => ch[2], args...; kws...)
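The `Flux.trainable` fix above is more than cosmetic: `Flux.params` collects the *unique* trainable arrays reachable from `Flux.trainable(l)`, so listing `l.dense_j` twice while omitting `l.dense_e` silently excluded the edge projection from training. A minimal sketch of the corrected behavior, using the constructor exercised by the tests later in this diff (2 node features, 4 edge features; the expected count comes from those tests):

```julia
using Flux, GraphNeuralNetworks

# GATv2Conv with node features of size 2 and edge features of size 4.
l = GATv2Conv((2, 4) => 3, add_self_loops = false)

# With the fix, the parameter set now includes the arrays of dense_e,
# so the edge projection receives gradients during training.
length(Flux.params(l))  # expected: 6 (was 5, since the duplicated dense_j collapsed)
```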
@@ -509,7 +509,7 @@ Gated graph convolution layer from [Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493).

 Implements the recursion
 ```math
-\mathbf{h}^{(0)}_i = [\mathbf{x}_i || \mathbf{0}] \\
+\mathbf{h}^{(0)}_i = [\mathbf{x}_i; \mathbf{0}] \\
 \mathbf{h}^{(l)}_i = GRU(\mathbf{h}^{(l-1)}_i, \square_{j \in N(i)} W \mathbf{h}^{(l-1)}_j)
 ```
@@ -572,14 +572,14 @@ Edge convolutional layer from paper [Dynamic Graph CNN for Learning on Point Clouds](https://arxiv.org/abs/1801.07829).

 Performs the operation
 ```math
-\mathbf{x}_i' = \square_{j \in N(i)} nn(\mathbf{x}_i || \mathbf{x}_j - \mathbf{x}_i)
+\mathbf{x}_i' = \square_{j \in N(i)}\, nn([\mathbf{x}_i; \mathbf{x}_j - \mathbf{x}_i])
 ```

 where `nn` generally denotes a learnable function, e.g. a linear layer or a multi-layer perceptron.

 # Arguments

-- `nn`: A (possibly learnable) function acting on edge features.
+- `nn`: A (possibly learnable) function.
 - `aggr`: Aggregation operator for the incoming messages (e.g. `+`, `*`, `max`, `min`, and `mean`).
 """
 struct EdgeConv <: GNNLayer
@@ -946,19 +946,19 @@ end
 Attention-based Graph Neural Network layer from paper [Attention-based Graph Neural Network for Semi-Supervised Learning](https://arxiv.org/abs/1803.03735).

-THe forward pass is given by
+The forward pass is given by
 ```math
-\mathbf{x}_i' = \sum_{j \in {N(i) \cup \{i\}} \alpha_{ij} W \mathbf{x}_j
+\mathbf{x}_i' = \sum_{j \in {N(i) \cup \{i\}}} \alpha_{ij} W \mathbf{x}_j
 ```
 where the attention coefficients ``\alpha_{ij}`` are given by
 ```math
 \alpha_{ij} =\frac{e^{\beta \cos(\mathbf{x}_i, \mathbf{x}_j)}}
-             {\sum_{j'}e^{\beta \cos(\mathbf{x}_i, \mathbf{x}_j'}}
+             {\sum_{j'}e^{\beta \cos(\mathbf{x}_i, \mathbf{x}_{j'})}}
 ```
 with the cosine distance defined by
 ```math
 \cos(\mathbf{x}_i, \mathbf{x}_j) =
-  \mathbf{x}_i \cdot \mathbf{x}_j / \lVert\mathbf{x}_i\rVert \lVert\mathbf{x}_j\rVert``
+  \frac{\mathbf{x}_i \cdot \mathbf{x}_j}{\lVert\mathbf{x}_i\rVert \lVert\mathbf{x}_j\rVert}
 ```
 and ``\beta`` a trainable parameter.
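The corrected `EdgeConv` formula also makes the shape contract explicit: `nn` receives the concatenation ``[\mathbf{x}_i; \mathbf{x}_j - \mathbf{x}_i]``, so its input dimension must be twice the node feature dimension. A minimal sketch, assuming the `EdgeConv(nn; aggr)` constructor matching the arguments documented above and the library's `rand_graph` helper (sizes are illustrative):

```julia
using Flux, GraphNeuralNetworks

nin, nout = 4, 8
# nn sees [x_i; x_j - x_i], hence the 2 * nin input dimension.
l = EdgeConv(Dense(2 * nin, nout), aggr = max)

g = rand_graph(10, 30)      # 10 nodes, 30 random edges
x = rand(Float32, nin, 10)  # node feature matrix
y = l(g, x)                 # size(y) == (nout, 10)
```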
diff --git a/test/layers/conv.jl b/test/layers/conv.jl
index cc4c24787..923ff00f5 100644
--- a/test/layers/conv.jl
+++ b/test/layers/conv.jl
@@ -103,18 +103,21 @@
             end
         end

-        @testset "bias=false" begin
-            @test length(Flux.params(GATConv(2=>3))) == 3
-            @test length(Flux.params(GATConv(2=>3, bias=false))) == 2
-        end
-
-        @testset "edge features" begin
+        @testset "edge features" begin
             ein = 3
             l = GATConv((in_channel, ein) => out_channel, add_self_loops=false)
             g = GNNGraph(g1, edata=rand(T, ein, g1.num_edges))
             test_layer(l, g, rtol=1e-3, outsize=(out_channel, g.num_nodes))
-        end
+        end
+
+        @testset "num params" begin
+            l = GATConv(2 => 3, add_self_loops=false)
+            @test length(Flux.params(l)) == 3
+            l = GATConv((2,4) => 3, add_self_loops=false)
+            @test length(Flux.params(l)) == 4
+            l = GATConv((2,4) => 3, add_self_loops=false, bias=false)
+            @test length(Flux.params(l)) == 3
+        end
     end

     @testset "GATv2Conv" begin
@@ -127,9 +130,20 @@
             end
         end

-        @testset "bias=false" begin
-            @test length(Flux.params(GATv2Conv(2=>3))) == 5
-            @test length(Flux.params(GATv2Conv(2=>3, bias=false))) == 3
+        @testset "edge features" begin
+            ein = 3
+            l = GATv2Conv((in_channel, ein) => out_channel, add_self_loops=false)
+            g = GNNGraph(g1, edata=rand(T, ein, g1.num_edges))
+            test_layer(l, g, rtol=1e-3, outsize=(out_channel, g.num_nodes))
+        end
+
+        @testset "num params" begin
+            l = GATv2Conv(2 => 3, add_self_loops=false)
+            @test length(Flux.params(l)) == 5
+            l = GATv2Conv((2,4) => 3, add_self_loops=false)
+            @test length(Flux.params(l)) == 6
+            l = GATv2Conv((2,4) => 3, add_self_loops=false, bias=false)
+            @test length(Flux.params(l)) == 4
         end

         @testset "edge features" begin
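Outside of the `test_layer` harness, the new edge-feature path can also be driven directly. A sketch assuming, as the attention docstrings above describe for `ein > 0`, that edge features are supplied alongside node features in the forward call, and using the library's `rand_graph` helper:

```julia
using GraphNeuralNetworks

in_channel, ein, out_channel = 3, 3, 5
g = rand_graph(4, 6)                        # small random graph
x = rand(Float32, in_channel, g.num_nodes)  # node features
e = rand(Float32, ein, g.num_edges)         # edge features of dimension ein

# Self-loops stay disabled, matching the tests above: an added
# self-loop would have no edge feature to attach.
l = GATv2Conv((in_channel, ein) => out_channel, add_self_loops = false)
y = l(g, x, e)                              # size(y) == (out_channel, g.num_nodes)
```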