// SPDX-License-Identifier: MIT OR Unlicense
package main
import (
str "github.com/boyter/go-string"
"math"
"sort"
"strings"
)
// Takes in the search terms and results and applies chained
// ranking over them to produce a score, then sorts those results
// and returns them.
// Note that this method will evolve over time
// and as such you should never rely on the returned results being
// the same
func rankResults(corpusCount int, results []*FileJob) []*FileJob {
// needs to come first because it resets the scores
switch Ranker {
case "simple":
// in this case the results are already ranked by the number of matches
case "bm25":
results = rankResultsBM25(corpusCount, results, calculateDocumentFrequency(results))
results = rankResultsLocation(results)
case "tfidf2":
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), false)
results = rankResultsLocation(results)
default:
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), true)
results = rankResultsLocation(results)
}
// TODO maybe need to add something here to reward phrases
sortResults(results)
return results
}
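// Illustrative usage (a sketch, not called from this file): assuming the
// caller has already populated each FileJob with its MatchLocations, ranking
// and display ordering is a single call. Note corpusCount is the size of the
// searched corpus, not just the number of matching results.
//
//	ranked := rankResults(corpusCount, results)
//	for _, r := range ranked {
//	    fmt.Printf("%s %f\n", r.Location, r.Score)
//	}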
// Base values used to determine how much matches should be
// boosted or scored by the rankers below
const (
LocationBoostValue = 0.05
DefaultScoreValue = 0.01
PhraseBoostValue = 1.00
BytesWordDivisor = 2
)
// Given the results, boosts the rank of those with matches in the
// file location field.
// This is not using TF-IDF or any fancy algorithm, just basic checks
// and boosts
func rankResultsLocation(results []*FileJob) []*FileJob {
for i := 0; i < len(results); i++ {
foundTerms := 0
for key := range results[i].MatchLocations {
l := str.IndexAllIgnoreCase(results[i].Location, key, -1)
// Boost the rank slightly based on the number of matches and on
// how long each match is, as we should reward longer matches
if len(l) != 0 {
foundTerms++
// If the rank is ever 0 then nothing will change, so set it
// to a small value to at least introduce some ranking here
if results[i].Score == 0 || math.IsNaN(results[i].Score) {
results[i].Score = DefaultScoreValue
}
// Set the score to be itself * 1.something where something
// is 0.05 times the number of matches * the length of the match.
// So if the user searches for test, a file in the location
// /test/test.go
// will be boosted and have a higher rank than
// /test/other.go
//
// Of course this assumes that they have the text test in the
// content, otherwise the match is discarded
results[i].Score = results[i].Score * (1.0 +
(LocationBoostValue * float64(len(l)) * float64(len(key))))
// If the match is closer to the start of the location, penalise
// it less: we reduce the score slightly based on how far away
// from the start the earliest match is
low := math.MaxInt32
for _, l := range l {
if l[0] < low {
low = l[0]
}
}
results[i].Score = results[i].Score - (float64(low) * 0.02)
}
}
// If we found multiple terms (assuming the search had multiple),
// boost yet again to reward results that match more of them
if foundTerms > 1 {
results[i].Score = results[i].Score * (1 + LocationBoostValue*float64(foundTerms))
}
}
return results
}
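// Worked example with illustrative numbers: searching for test against the
// location /test/test.go, str.IndexAllIgnoreCase finds two matches of a
// four character term, so a starting score of 0.01 becomes
//
//	0.01 * (1.0 + 0.05*2*4) = 0.014
//
// before the start-offset penalty subtracts float64(low)*0.02, where low is
// the byte offset of the earliest match. A file at /test/other.go gets only
// a single location match and therefore a smaller boost.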
// TF-IDF implementation which ranks the results
// Technically this is not a real TF-IDF because we don't
// have counts of terms for documents that don't match
// so the IDF value is not correctly calculated
// https://en.wikipedia.org/wiki/Tf-idf
//
// NB loops in here use an index rather than ranging over values to avoid duffcopy
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// which matters due to how often this is called by things like the TUI mode
func rankResultsTFIDF(corpusCount int, results []*FileJob, documentFrequencies map[string]int, classic bool) []*FileJob {
var weight float64
for i := 0; i < len(results); i++ {
weight = 0
// We don't know how many words are actually in this document... and I don't want to check
// because it's going to slow things down. Keep in mind that this matches inside the words themselves,
// i.e. partial matches are the norm, so it makes sense to base it on the number of bytes.
// Also ensure that it is at least 1 to avoid divide by zero errors later on.
words := float64(maxInt(1, results[i].Bytes/BytesWordDivisor))
// word in this case is the term we are dealing with, i.e. what the user actually searched for,
// and wordCount is the locations of those words, allowing us to know the number of words matching
for word, wordCount := range results[i].MatchLocations {
// Technically the IDF for this is wrong because we only
// have the count for the matches of the document, not all the terms
// that are actually required.
// It's likely that a search for "a b" is missing the counts
// for documents that have a but not b, and as such
// the document frequencies are off with respect to the total
// corpus... although we could get that if needed since we do calculate it.
// Anyway this isn't a huge issue in practice because TF/IDF
// still works for a search of a single term such as a, or if multiple terms
// happen to match every document in the corpus, which while unlikely
// is still something that could happen.
// It's also slightly off due to the fact that we don't know the number of words
// in the document anyway, but it should be close enough.
// TF = number of times this word appears in this document / words in the entire document
// IDF = log10(number of documents / number of documents that contain this word)
tf := float64(len(wordCount)) / words
idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))
if classic {
weight += tf * idf
} else {
// Lucene modification to improve results https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
weight += math.Sqrt(tf) * idf * (1 / math.Sqrt(words))
}
}
// Override the score here because we don't want whatever we got
// originally, which was just based on the number of keyword matches
results[i].Score = weight
}
return results
}
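// Worked example with illustrative numbers: given a corpus of 1000 files
// where "foo" matched in 10 of them, idf = log10(1000/10) = 2. A matching
// 2000 byte file is treated as having 2000/BytesWordDivisor = 1000 words,
// so 5 matches give tf = 5/1000 = 0.005 and the classic ranker adds
// tf*idf = 0.01 to the file's weight.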
// BM25 implementation which ranks the results
// Technically this is not a real BM25 because we don't
// have counts of terms for documents that don't match
// so the IDF value is not correctly calculated
// https://en.wikipedia.org/wiki/Okapi_BM25
//
// NB loops in here use an index rather than ranging over values to avoid duffcopy
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// which matters due to how often this is called by things like the TUI mode
//
//                  IDF * TF * (k1 + 1)
// BM25 = sum  -------------------------------
//              TF + k1 * (1 - b + b * D / L)
func rankResultsBM25(corpusCount int, results []*FileJob, documentFrequencies map[string]int) []*FileJob {
var weight float64
// Get the average number of words across all documents because we need that in BM25 to calculate correctly
var averageDocumentWords float64
for i := 0; i < len(results); i++ {
averageDocumentWords += float64(maxInt(1, results[i].Bytes/BytesWordDivisor))
}
averageDocumentWords = averageDocumentWords / float64(len(results))
k1 := 1.2
b := 0.75
for i := 0; i < len(results); i++ {
weight = 0
// We don't know how many words are actually in this document... and I don't want to check
// because it's going to slow things down. Keep in mind that this matches inside the words themselves,
// i.e. partial matches are the norm, so it makes sense to base it on the number of bytes.
// Also ensure that it is at least 1 to avoid divide by zero errors later on.
words := float64(maxInt(1, results[i].Bytes/BytesWordDivisor))
// word in this case is the term we are dealing with, i.e. what the user actually searched for,
// and wordCount is the locations of those words, allowing us to know the number of words matching
for word, wordCount := range results[i].MatchLocations {
// TF = number of times this word appears in this document / words in the entire document
// IDF = log10(number of documents / number of documents that contain this word)
tf := float64(len(wordCount)) / words
idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))
step1 := idf * tf * (k1 + 1)
step2 := tf + k1*(1-b+(b*words/averageDocumentWords))
weight += step1 / step2
}
// Override the score here because we don't want whatever we got
// originally, which was just based on the number of keyword matches
results[i].Score = weight
}
return results
}
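// Worked example with illustrative numbers: reusing tf = 0.005 and idf = 2
// from the TF-IDF example above, a document exactly as long as the corpus
// average (words/averageDocumentWords = 1) with k1 = 1.2 and b = 0.75 gives
//
//	step1 = 2 * 0.005 * (1.2 + 1)            = 0.022
//	step2 = 0.005 + 1.2*(1 - 0.75 + 0.75*1)  = 1.205
//	weight += 0.022 / 1.205                  ≈ 0.0183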
// Calculate the document term frequency for all words across all documents
// letting us know how many times a term appears across the corpus
// This is mostly used for snippet extraction
func calculateDocumentTermFrequency(results []*FileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
documentFrequencies[k] = documentFrequencies[k] + len(results[i].MatchLocations[k])
}
}
return documentFrequencies
}
// Calculate the document frequency for all words across all documents,
// letting us know the number of documents in which a term appears.
// This is mostly used for the TF-IDF calculation
func calculateDocumentFrequency(results []*FileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
documentFrequencies[k] = documentFrequencies[k] + 1
}
}
return documentFrequencies
}
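// Illustrative sketch of the difference between the two maps above: if "foo"
// matched 3 times in one file and once in another,
// calculateDocumentTermFrequency returns map[string]int{"foo": 4} while
// calculateDocumentFrequency returns map[string]int{"foo": 2}.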
// Sort a slice of FileJob results based on their score for displaying,
// and then by location to stop any non-deterministic ordering happening.
// Since the location includes the filename we should never have two matches
// that are 100% equal based on the two criteria we use.
func sortResults(results []*FileJob) {
sort.Slice(results, func(i, j int) bool {
if results[i].Score == results[j].Score {
return strings.Compare(results[i].Location, results[j].Location) < 0
}
return results[i].Score > results[j].Score
})
}
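// Illustrative sketch: with equal scores the comparator falls back to
// lexicographic Location order, so for example
//
//	results := []*FileJob{
//	    {Location: "b.go", Score: 0.5},
//	    {Location: "a.go", Score: 0.5},
//	}
//	sortResults(results) // a.go now sorts before b.go
//
// which keeps repeated runs of the same search stable.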