diff --git a/Project.toml b/Project.toml
index 34e98362c..a363c55ab 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,13 +5,11 @@ version = "0.7.3"
 [deps]
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-LoopVectorization = "0.8.10"
 Requires = "0.5, 1.0"
 julia = "1.3"
 
diff --git a/src/NNlib.jl b/src/NNlib.jl
index ef8acc987..d954a68cd 100644
--- a/src/NNlib.jl
+++ b/src/NNlib.jl
@@ -1,8 +1,6 @@
 module NNlib
 using Pkg
 using Requires
-using LoopVectorization
-using Requires
 
 # Include APIs
 include("dim_helpers.jl")
diff --git a/src/activation.jl b/src/activation.jl
index 3e7dd22c4..870006b69 100644
--- a/src/activation.jl
+++ b/src/activation.jl
@@ -1,11 +1,6 @@
 export σ, sigmoid, hardσ, hardsigmoid, hardtanh, relu, leakyrelu, relu6, rrelu, elu, gelu, swish, selu, celu, softplus, softsign, logσ, logsigmoid, logcosh,
        mish, tanhshrink, softshrink, thresholdrelu, trelu, lisht
 
-import LoopVectorization: vifelse
-using LoopVectorization.SLEEFPirates: FloatType
-
-const RealOrFloatType = Union{Real, FloatType}
-
 ## Activation functions
 #
 # Some of activation functions have its wrapper function for GPU in CuArrays.jl.
@@ -17,9 +12,9 @@ const RealOrFloatType = Union{Real, FloatType}
 Classic [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function)
 activation function.
 """
-function σ(x::RealOrFloatType)
+function σ(x)
     t = exp(-abs(x))
-    vifelse(x ≥ 0, inv(one(t) + t), t / (one(t) + t))
+    ifelse(x ≥ 0, inv(one(t) + t), t / (one(t) + t))
 end
 
 const sigmoid = σ
@@ -29,7 +24,7 @@ const sigmoid = σ
 Segment-wise linear approximation of sigmoid.
 See [BinaryConnect: Training Deep Neural Networks withbinary weights during propagations](https://arxiv.org/pdf/1511.00363.pdf).
 """
-hardσ(x::RealOrFloatType, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
+hardσ(x, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
 
 const hardsigmoid = hardσ
 
@@ -46,7 +41,7 @@ Return `log(σ(x))` which is computed in a numerically stable way.
   -10.000045398899218
   -3.720075976020836e-44
 """
-logσ(x::RealOrFloatType) = -softplus(-x)
+logσ(x) = -softplus(-x)
 
 const logsigmoid = logσ
 
@@ -56,7 +51,7 @@ const logsigmoid = logσ
 Segment-wise linear approximation of tanh. Cheaper and more computational efficient version of tanh.
 See [Large Scale Machine Learning](http://ronan.collobert.org/pub/matos/2004_phdthesis_lip6.pdf).
 """
-hardtanh(x::RealOrFloatType) = max(-one(x), min( one(x), x))
+hardtanh(x) = max(-one(x), min( one(x), x))
 
 
 """
@@ -65,7 +60,7 @@ hardtanh(x::RealOrFloatType) = max(-one(x), min( one(x), x))
 [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
 activation function.
 """
-relu(x::RealOrFloatType) = max(zero(x), x)
+relu(x) = max(zero(x), x)
 
 
 """
@@ -75,7 +70,7 @@ Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_ne
 activation function.
 You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
 """
-leakyrelu(x::RealOrFloatType, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
+leakyrelu(x, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
 
 """
     relu6(x) = min(max(0, x), 6)
@@ -84,7 +79,7 @@ leakyrelu(x::RealOrFloatType, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
 activation function capped at 6.
 See [Convolutional Deep Belief Networks on CIFAR-10](http://www.cs.utoronto.ca/%7Ekriz/conv-cifar10-aug2010.pdf)
 """
-relu6(x::RealOrFloatType) = min(relu(x), oftype(x, 6))
+relu6(x) = min(relu(x), oftype(x, 6))
 
 """
     rrelu(x, l=1/8, u=1/3) = max(a*x, x)
@@ -95,7 +90,7 @@ Randomized Leaky [Rectified Linear Unit](https://arxiv.org/pdf/1505.00853.pdf)
 activation function.
 You can also specify the bound explicitly, e.g. `rrelu(x, 0.0, 1.0)`.
 """
-function rrelu(x::RealOrFloatType, l::Real = 1 / 8.0, u::Real = 1 / 3.0)
+function rrelu(x, l = 1 / 8.0, u = 1 / 3.0)
     a = oftype(x / 1, (u - l) * rand() + l)
     return leakyrelu(x, a)
 end
@@ -108,7 +103,7 @@ Exponential Linear Unit activation function.
 See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
 You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
 """
-elu(x::RealOrFloatType, α = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))
+elu(x, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))
 
 
 """
@@ -117,7 +112,7 @@ elu(x::RealOrFloatType, α = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x)
 [Gaussian Error Linear Unit](https://arxiv.org/pdf/1606.08415.pdf)
 activation function.
 """
-function gelu(x::RealOrFloatType)
+function gelu(x)
     p = oftype(x / 1, π)
     λ = oftype(x / 1, √(2 / p))
     α = oftype(x / 1, 0.044715)
@@ -132,7 +127,7 @@ end
 Self-gated activation function.
 See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.pdf).
 """
-swish(x::RealOrFloatType) = x * σ(x)
+swish(x) = x * σ(x)
 
 
 """
@@ -141,7 +136,7 @@ swish(x::RealOrFloatType) = x * σ(x)
 Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function.
 See [LiSHT](https://arxiv.org/abs/1901.05894)
 """
-lisht(x::RealOrFloatType) = x * tanh(x)
+lisht(x) = x * tanh(x)
 
 
 """
@@ -153,10 +148,10 @@ lisht(x::RealOrFloatType) = x * tanh(x)
 Scaled exponential linear units.
 See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
 """
-function selu(x::RealOrFloatType)
+function selu(x)
     λ = oftype(x / 1, 1.0507009873554804934193349852946)
     α = oftype(x / 1, 1.6732632423543772848170429916717)
-    λ * vifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
+    λ * ifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
 end
 
 """
@@ -166,7 +161,7 @@ end
 Continuously Differentiable Exponential Linear Units
 See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/pdf/1704.07483.pdf).
 """
-celu(x::RealOrFloatType, α::Real = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))
+celu(x, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))
 
 
 """
@@ -175,7 +170,7 @@ celu(x::RealOrFloatType, α::Real = one(x)) = vifelse(x ≥ 0, x / one(x), α *
 Threshold Gated Rectified Linear.
 See [ThresholdRelu](https://arxiv.org/pdf/1402.3337.pdf)
 """
-trelu(x::RealOrFloatType,theta = one(x)) = vifelse(x> theta, x, zero(x))
+trelu(x, theta = one(x)) = ifelse(x> theta, x, zero(x))
 
 const thresholdrelu = trelu
 
@@ -184,7 +179,7 @@ const thresholdrelu = trelu
 
 See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
 """
-softsign(x::RealOrFloatType) = x / (one(x) + abs(x))
+softsign(x) = x / (one(x) + abs(x))
 
 
 """
@@ -192,7 +187,7 @@ softsign(x::RealOrFloatType) = x / (one(x) + abs(x))
 
 See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
""" -softplus(x::RealOrFloatType) = vifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x))) +softplus(x) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x))) """ @@ -200,7 +195,7 @@ softplus(x::RealOrFloatType) = vifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x))) Return `log(cosh(x))` which is computed in a numerically stable way. """ -logcosh(x::RealOrFloatType) = x + softplus(-2x) - log(oftype(x, 2)) +logcosh(x) = x + softplus(-2x) - log(oftype(x, 2)) """ @@ -209,14 +204,14 @@ logcosh(x::RealOrFloatType) = x + softplus(-2x) - log(oftype(x, 2)) Self Regularized Non-Monotonic Neural Activation Function. See [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681). """ -mish(x::RealOrFloatType) = x * tanh(softplus(x)) +mish(x) = x * tanh(softplus(x)) """ tanhshrink(x) = x - tanh(x) See [Tanhshrink Activation Function](https://www.gabormelli.com/RKB/Tanhshrink_Activation_Function). """ -tanhshrink(x::RealOrFloatType) = x - tanh(x) +tanhshrink(x) = x - tanh(x) """ softshrink(x, λ=0.5) = @@ -224,14 +219,10 @@ tanhshrink(x::RealOrFloatType) = x - tanh(x) See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function). """ -softshrink(x::RealOrFloatType, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ) +softshrink(x, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ) # Provide an informative error message if activation functions are called with an array for f in (:σ, :hardσ, :logσ, :hardtanh, :relu, :leakyrelu, :relu6, :rrelu, :elu, :gelu, :swish, :lisht, :selu, :celu, :trelu, :softsign, :softplus, :logcosh, :mish, :tanhshrink, :softshrink) @eval $(f)(x::AbstractArray, args...) = error("Use broadcasting (`", $(string(f)), ".(x)`) to apply activation functions to arrays.") end - -for f in (:σ, :tanh) - @eval Base.broadcasted(::typeof($f), x::Array{T, N}) where {T <: Union{Float64, Float32}, N} = vmap($f, x) -end diff --git a/src/softmax.jl b/src/softmax.jl index 24ed88027..4fec286c3 100644 --- a/src/softmax.jl +++ b/src/softmax.jl @@ -1,13 +1,6 @@ export softmax, softmax!, ∇softmax, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax! -fsum(x; dims) = sum(x, dims = dims) -fsum(x::Array; dims) = vreduce(+, x, dims = dims) -fmaximum(x; dims) = maximum(x, dims = dims) -fmaximum(x::Array; dims) = vreduce(max, x, dims = dims) -fmap(f, x) = map(f, x) -fmap(f, x::Array) = vmap(f, x) - """ softmax(x; dims=1) @@ -33,38 +26,45 @@ julia> softmax([1, 2, 3]) See also [`logsoftmax`](@ref). 
""" function softmax(xs::AbstractArray; dims=1) - max_ = fmaximum(xs, dims=dims) - exp_ = fmap(exp, xs .- max_) - exp_ ./ fsum(exp_, dims=dims) + max_ = maximum(xs, dims=dims) + exp_ = exp.(xs .- max_) + exp_ ./ sum(exp_, dims=dims) end -function softmax!(out::AbstractVecOrMat, xs::AbstractVecOrMat) - for j = 1:size(xs, 2) - xi_max = xs[1, j] - @avx for i = 1:size(xs, 1) - xi_max = max(xi_max, xs[i, j]) - end - @avx for i = 1:size(out, 1) - out[i, j] = exp(xs[i, j] - xi_max) - end - s = zero(eltype(out)) - @avx for i = 1:size(out, 1) +function softmax!(out::AbstractVecOrMat{T}, xs::AbstractVecOrMat{T}) where {T} + @inbounds for j = 1:size(xs, 2) + # First, store column-wise maximum in the last element of `out` + out[end, j] = xs[end, j] + @inbounds for i = 1:(size(xs, 1) - 1) + out[end, j] = max(out[end, j], xs[i, j]) + end + + # Subtract the column-wise maximums to normalize, take exp() + # out .= exp(xs .- out[end, :]) + @inbounds for i = 1:size(out, 1) + out[i, j] = exp(xs[i, j] - out[end, j]) + end + + # Normalize by sum of the entire thing + # out ./= sum(out, 1) + s = T(0) + @inbounds for i = 1:size(out, 1) s += out[i, j] - end - @avx for i = 1:size(out, 1) + end + @inbounds for i = 1:size(out, 1) out[i, j] /= s - end + end + out end - return out end function ∇softmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat) sf = softmax(xs) - out .= sf .* (Δ .- fsum(Δ .* sf, dims = 1)) + out .= sf .* (Δ .- sum(Δ .* sf, dims = 1)) end function ∇softmax(Δ, xs; dims=1) sf = softmax(xs, dims=dims) - sf .* (Δ .- fsum(Δ .* sf, dims=dims)) + sf .* (Δ .- sum(Δ .* sf, dims=dims)) end ∇softmax!(Δ, xs) = ∇softmax!(Δ, Δ, xs) @@ -83,32 +83,34 @@ It is semantically equivalent to the following: See also [`softmax`](@ref). """ function logsoftmax(xs::AbstractArray; dims=1) - max_ = fmaximum(xs, dims=dims) - exp_ = fmap(exp, xs .- max_) - log_ = fmap(log, fsum(exp_, dims=dims)) + max_ = maximum(xs, dims=dims) + exp_ = exp.(xs .- max_) + log_ = log.(sum(exp_, dims=dims)) (xs .- max_) .- log_ end function logsoftmax!(out::AbstractVecOrMat, xs::AbstractVecOrMat) for j = 1:size(xs, 2) - xi_max = xs[1, j] - @avx for i = 1:size(out, 1) - xi_max = max(xi_max, xs[i, j]) - end - s = zero(eltype(out)) - @avx for i = 1:size(out, 1) - s += exp(xs[i, j] - xi_max) - end - @avx for i = 1:size(out, 1) - out[i, j] = xs[i, j] - log(s) - xi_max + @inbounds begin + xi_max = xs[1, j] + for i = 1:size(out, 1) + xi_max = max(xi_max, xs[i, j]) + end + s = zero(eltype(out)) + for i = 1:size(out, 1) + s += exp(xs[i, j] - xi_max) + end + for i = 1:size(out, 1) + out[i, j] = xs[i, j] - log(s) - xi_max + end end end return out end function ∇logsoftmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat) - out .= Δ .- fsum(Δ, dims=1) .* softmax(xs, dims=1) + out .= Δ .- sum(Δ, dims=1) .* softmax(xs, dims=1) end -∇logsoftmax(Δ, xs; dims=1) = Δ .- fsum(Δ, dims=dims) .* softmax(xs, dims=dims) +∇logsoftmax(Δ, xs; dims=1) = Δ .- sum(Δ, dims=dims) .* softmax(xs, dims=dims) ∇logsoftmax!(Δ, xs) = ∇logsoftmax!(Δ, Δ, xs)