knn.go

package cluster

import (
    "fmt"
    "math"

    "github.com/Fabse333/goml/base"
)

/*
KNN implements the K-Nearest Neighbors algorithm
for classification, where an input is classified
by finding the K nearest (by some distance metric)
data points, and taking a vote based on those.

https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

Example K-Nearest-Neighbors Model Usage:

    // initialize data!
    twoClusters := [][]float64{}
    twoClustersY := []float64{}
    for i := -10.0; i < -3; i += 0.1 {
        for j := -10.0; j < 10; j += 0.1 {
            twoClusters = append(twoClusters, []float64{i, j})
            twoClustersY = append(twoClustersY, 0.0)
        }
    }

    for i := 3.0; i < 10; i += 0.1 {
        for j := -10.0; j < 10; j += 0.1 {
            twoClusters = append(twoClusters, []float64{i, j})
            twoClustersY = append(twoClustersY, 1.0)
        }
    }

    // create the model using 3 nearest neighbors
    // for prediction, using the Euclidean Distance
    // as the distance metric
    model := NewKNN(3, twoClusters, twoClustersY, base.EuclideanDistance)

    // make predictions like usual
    guess, err := model.Predict([]float64{-10, 1})
    if err != nil {
        panic("THERE WAS AN ERROR")
    }

    // update the K used (use 10 neighbors now)
    model.K = 10
*/
type KNN struct {
    // Distance holds the distance
    // measure for the KNN algorithm,
    // which is just a function that
    // maps two float64 vectors to a
    // float64
    Distance base.DistanceMeasure

    // K is the number of nearest
    // neighbors to take into account
    // when making a prediction
    K int

    // trainingSet holds all training
    // examples, while expectedResults
    // holds the associated class of
    // each corresponding example
    trainingSet     [][]float64
    expectedResults []float64
}

// nn encapsulates the nearest-neighbor
// data for each datapoint to facilitate
// easy sorting
type nn struct {
    X        []float64
    Y        float64
    Distance float64
}

// NewKNN returns a pointer to a KNN model, which
// classifies given inputs in a supervised manner
// by voting among the k nearest training examples
// under the given distance measure. KNN is a lazy
// learner (all computation happens at prediction
// time), so there is no optimization method to
// pass in as an option.
func NewKNN(k int, trainingSet [][]float64, expectedResults []float64, distanceMeasure base.DistanceMeasure) *KNN {
    return &KNN{
        Distance:        distanceMeasure,
        K:               k,
        trainingSet:     trainingSet,
        expectedResults: expectedResults,
    }
}

// UpdateTrainingSet takes in a new training set and its
// associated expected results, replacing the model's
// current data. It errors if either slice is empty or
// if their lengths do not match.
func (k *KNN) UpdateTrainingSet(trainingSet [][]float64, expectedResults []float64) error {
    if len(trainingSet) == 0 || len(expectedResults) == 0 {
        return fmt.Errorf("Error: length of given data is 0! Need data!")
    }
    if len(trainingSet) != len(expectedResults) {
        return fmt.Errorf("Datasets given do not match in length")
    }

    k.trainingSet = trainingSet
    k.expectedResults = expectedResults
    return nil
}

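// For example (a hypothetical call; newPoints and newLabels
// are illustrative names, not part of this package):
//
//    if err := model.UpdateTrainingSet(newPoints, newLabels); err != nil {
//        // handle empty or length-mismatched data
//    }
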
// Examples returns the number of training examples (m)
// that the model is currently holding
func (k *KNN) Examples() int {
    return len(k.trainingSet)
}

// insertSorted takes a slice v sorted in ascending order
// by distance and inserts u at the position that keeps it
// sorted. If v already holds at least K elements, the
// result is capped at len(v), so the farthest element
// (possibly u itself) is dropped.
//
// If the length of v is less than K, then u is inserted
// without deleting the last element.
//
// Assumes v has been sorted. Uses binary search.
func insertSorted(u nn, v []nn, K int) []nn {
    low := 0
    high := len(v) - 1
    for low <= high {
        mid := (low + high) / 2
        if u.Distance < v[mid].Distance {
            high = mid - 1
        } else {
            low = mid + 1
        }
    }

    // u is farther than every element of an
    // already-full list; leave v unchanged
    if low >= len(v) && len(v) >= K {
        return v
    }

    sorted := append(v[:low], append([]nn{u}, v[low:]...)...)
    if len(v) < K {
        return sorted
    }
    return sorted[:len(v)]
}
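
// For example (an illustrative walk-through, not from the
// original source): with K = 3 and v holding distances
// [1.0, 2.0, 4.0], inserting u with Distance 3.0 yields
// distances [1.0, 2.0, 3.0]; the farthest neighbor (4.0)
// falls off the end so the list stays capped at K.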

// round rounds a float64 to the nearest
// integer, rounding halves away from zero
func round(a float64) float64 {
    if a < 0 {
        return math.Ceil(a - 0.5)
    }
    return math.Floor(a + 0.5)
}
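
// Note: math.Round (available since Go 1.10) performs the
// same half-away-from-zero rounding, so this helper is only
// needed on older toolchains.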

// Predict takes in a variable x (an array of floats) and
// returns the predicted class of x, found by taking a vote
// among the K nearest training examples under the model's
// distance measure.
//
// If normalize is given as true, then the input will
// first be normalized to unit length. Only use this if
// you trained off of normalized inputs and are feeding
// an un-normalized input.
func (k *KNN) Predict(x []float64, normalize ...bool) ([]float64, error) {
    if k.K > len(k.trainingSet) {
        return nil, fmt.Errorf("Given K (%v) is greater than the length of the training set", k.K)
    }
    if len(x) != len(k.trainingSet[0]) {
        return nil, fmt.Errorf("Given x (len %v) does not match dimensions of training set", len(x))
    }

    if len(normalize) != 0 && normalize[0] {
        base.NormalizePoint(x)
    }

    // initialize the neighbors as an empty
    // slice of nn. insertSorted will take
    // care of capping the neighbors at K.
    neighbors := []nn{}

    // calculate the K nearest neighbors
    for i := range k.trainingSet {
        dist := k.Distance(x, k.trainingSet[i])
        neighbors = insertSorted(nn{
            X:        k.trainingSet[i],
            Y:        k.expectedResults[i],
            Distance: dist,
        }, neighbors, k.K)
    }

    // take a vote by averaging the neighbors'
    // class labels and rounding to the nearest
    // class; for binary (0/1) labels this is a
    // majority vote among the K neighbors
    sum := 0.0
    for i := range neighbors {
        sum += neighbors[i].Y
    }

    return []float64{round(sum / float64(k.K))}, nil
}
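
// Worked example of the vote (illustrative numbers, not from
// the original source): with K = 3 and nearest-neighbor labels
// {0, 1, 1}, Predict returns round(2.0/3.0) = 1, which matches
// a simple majority vote between the two classes.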