diff --git a/src/bandwidth.jl b/src/bandwidth.jl
new file mode 100644
index 0000000..f60ff16
--- /dev/null
+++ b/src/bandwidth.jl
@@ -0,0 +1,28 @@
+# Silverman's rule of thumb for KDE bandwidth selection.
+#
+# Returns `alpha * width * n^(-0.2)`, where `n = length(data)` and `width` is
+# the smaller of the sample standard deviation and IQR / 1.34.
+#
+# `data` may hold any real element type (not only Float64), because
+# `kernelregression` calls this function on both of its data vectors by default.
+# `alpha` is the scaling constant of the rule.
+function bandwidth{T <: Real}(data::AbstractVector{T}, alpha::Real = 0.9)
+    # Determine length of data
+    ndata = length(data)
+
+    # Calculate width using standard deviation and IQR
+    var_width = std(data)
+    q25, q75 = quantile(data, [0.25, 0.75])
+    quantile_width = (q75 - q25) / 1.34
+
+    # Deal with degenerate spreads: a zero IQR or standard deviation, and the
+    # NaN standard deviation of a single observation. `!(width > 0)` is true
+    # for both zero and NaN.
+    width = min(var_width, quantile_width)
+    if !(width > 0)
+        width = var_width > 0 ? var_width : 1.0
+    end
+
+    # Set bandwidth using Silverman's rule of thumb
+    return alpha * width * ndata^(-0.2)
+end
diff --git a/src/classifier.jl b/src/classifier.jl
index 97a4daa..f238f75 100644
--- a/src/classifier.jl
+++ b/src/classifier.jl
@@ -1,19 +1,19 @@
 immutable kNNClassifier
-    t::NaiveNeighborTree
+    t::BruteTree
     y::Vector
 end
 
 function knn(X::Matrix,
              y::Vector;
              metric::Metric = Euclidean())
-    return kNNClassifier(NaiveNeighborTree(X, metric), y)
+    return kNNClassifier(BruteTree(X, metric), y)
 end
 
 # TODO: Don't construct copy of model.y just to extract majority vote
 function StatsBase.predict(model::kNNClassifier,
                            x::Vector,
                            k::Integer = 1)
-    inds, dists = nearest(model.t, x, k)
+    inds, dists = NearestNeighbors.knn(model.t, x, k)
     return majority_vote(model.y[inds])
 end
 
@@ -33,7 +33,7 @@ end
 function StatsBase.predict(model::kNNClassifier,
                            X::Matrix,
                            k::Integer = 1)
-    predictions = Array(eltype(model.y), size(X, 2))
+    predictions = Array{eltype(model.y)}(size(X, 2))
     predict!(predictions, model, X, k)
     return predictions
 end
diff --git a/src/kNN.jl b/src/kNN.jl
index 9444ac6..5a415cb 100644
--- a/src/kNN.jl
+++ b/src/kNN.jl
@@ -9,6 +9,7 @@ module kNN
     using NearestNeighbors
     using SmoothingKernels
 
+    include("bandwidth.jl")
     include("majority_vote.jl")
     include("classifier.jl")
     include("regress.jl")
diff --git a/src/regress.jl b/src/regress.jl
index 51cb561..a68fcae 100644
--- a/src/regress.jl
+++ b/src/regress.jl
@@ -11,7 +11,7 @@ function kernelregression{R <: Any,
                                      y::Vector{T};
                                      kernel::Symbol = :epanechnikov,
                                      bandwidth::Real = NaN,
-                                     getbandwidth::Function = StatsBase.bandwidth)
+                                     getbandwidth::Function = kNN.bandwidth)
     if isnan(bandwidth)
         h_x = getbandwidth(x)
         h_y = getbandwidth(y)
@@ -50,7 +50,9 @@ end
 
 function StatsBase.predict{T <: Real}(model::KernelRegression,
                                       xs::AbstractVector{T})
-    ys = Array(Float64, length(xs))
+    # Keep the output buffer Float64, as it was before the syntax update: with
+    # Array{T}, integer-valued `xs` would allocate an integer buffer.
+    # NOTE(review): presumably predict! stores non-integer estimates - confirm.
+    ys = Array{Float64}(length(xs))
     predict!(ys, model, xs)
     return ys
 end
diff --git a/test/classifier.jl b/test/classifier.jl
index 48b095f..0ef9424 100644
--- a/test/classifier.jl
+++ b/test/classifier.jl
@@ -7,9 +7,9 @@ module TestClassifier
     using Distances
     using StatsBase
 
-    iris = data("datasets", "iris")
-    X = array(iris[:, 1:4])'
-    y = array(iris[:, 5])
+    iris = dataset("datasets", "iris")
+    X = Array(iris[:, 1:4])'
+    y = Array(iris[:, 5])
 
     model = knn(X, y, metric = Euclidean())
     predict_k1 = predict(model, X, 1)
diff --git a/test/regress.jl b/test/regress.jl
index 87b3109..e6d6e6c 100644
--- a/test/regress.jl
+++ b/test/regress.jl
@@ -6,7 +6,7 @@ module TestRegress
     srand(1)
     n = 1_000
     x = 10 * randn(n)
-    y = sin(x) + 0.5 * randn(n)
+    y = sin.(x) + 0.5 * randn(n)
 
     fit = kernelregression(x, y)
     grid = minimum(x):0.1:maximum(x)