forked from cdipaolo/goml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tfidf.go
177 lines (149 loc) · 4.5 KB
/
tfidf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
package text
import (
"math"
"sort"
"golang.org/x/text/transform"
)
/*
TFIDF is a Term Frequency- Inverse
Document Frequency model that is created
from a trained NaiveBayes model (they
are very similar so you can just train
NaiveBayes and convert into TFIDF)
This is not a probabilistic model, necessarily,
and doesn't give classification. It can be
used to determine the 'importance' of a
word in a document, though, which is
useful in, say, keyword tagging.
Term frequency is basically just adjusted
frequency of a word within a document/sentence:
termFrequency(word, doc) = 0.5 * ( 1 + word.Count / max{ w.Count | w ∈ doc } )
Inverse document frequency is basically how
little the term is mentioned within all of
your documents:
invDocumentFrequency(word, Docs) = log( len(Docs) ) - log( 1 + |{ d ∈ Docs | t ∈ d}| )
TFIDF is the multiplication of those two
functions, giving you a term that is larger
when the word is more important, and less when
the word is less important
*/
type TFIDF NaiveBayes
// Frequency holds word frequency information
// so you don't have to hold a map[string]float64
// and can, then, sort
type Frequency struct {
Word string `json:"word"`
Frequency float64 `json:"frequency,omitempty"`
TFIDF float64 `json:"tfidf_score,omitempty"`
}
// Frequencies is an array of word frequencies
// (stored as separate type to be able to sort)
type Frequencies []Frequency
//* implement sort.Interface for Frequency list *//
// Len gives the length of a
// frequency array
func (f Frequencies) Len() int {
return len(f)
}
// Less gives whether the ith element
// of a frequency list has is lesser
// than the jth element by comparing
// their TFIDF values
func (f Frequencies) Less(i, j int) bool {
return f[i].TFIDF < f[j].TFIDF
}
// Swap swaps two indexed values in
// a frequency slice
func (f Frequencies) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
// TFIDF returns the TermFrequency-
// InverseDocumentFrequency of a word
// within a corpus given by the trained
// NaiveBayes model
//
// Look at the TFIDF docs to see more about how
// this is calculated
func (t *TFIDF) TFIDF(word string, sentence string) float64 {
	// sanitize then tokenize the input sentence
	cleaned, _, _ := transform.String(t.sanitize, sentence)
	tokens := t.Tokenizer.Tokenize(cleaned)

	tf := t.TermFrequency(word, tokens)
	idf := t.InverseDocumentFrequency(word)
	return tf * idf
}
// MostImportantWords runs TFIDF on a
// whole document, returning the n most
// important words in the document. If
// n is greater than the number of words
// then all words will be returned. If n
// is negative an empty slice is returned.
//
// The returned keyword slice is sorted
// by importance
func (t *TFIDF) MostImportantWords(sentence string, n int) Frequencies {
	// guard: a negative n would make freq[:n] below panic
	if n < 0 {
		return Frequencies{}
	}

	sentence, _, _ = transform.String(t.sanitize, sentence)
	document := t.Tokenizer.Tokenize(sentence)

	// score every distinct word in the document
	freq := TermFrequencies(document)
	for i := range freq {
		freq[i].TFIDF = freq[i].Frequency * t.InverseDocumentFrequency(freq[i].Word)
		// zero the raw frequency so only the TFIDF score is reported
		freq[i].Frequency = 0
	}

	// sort descending by TFIDF score
	sort.Sort(sort.Reverse(freq))

	if n > len(freq) {
		return freq
	}
	return freq[:n]
}
// TermFrequency returns the term frequency
// of a word within a corpus defined by the
// trained NaiveBayes model. It uses the
// augmented (double-normalized 0.5) frequency:
// 0.5 * (1 + count(word)/maxCount). An empty
// document yields 0.
//
// Look at the TFIDF docs to see more about how
// this is calculated
func (t *TFIDF) TermFrequency(word string, document []string) float64 {
	// count occurrences of each token in the document
	words := make(map[string]int)
	for i := range document {
		words[document[i]]++
	}

	// find max word frequency
	var maxFreq int
	for i := range words {
		if words[i] > maxFreq {
			maxFreq = words[i]
		}
	}

	// guard: an empty document leaves maxFreq at 0 and the
	// expression below would divide by zero, returning NaN
	if maxFreq == 0 {
		return 0
	}

	return 0.5 * (1 + float64(words[word])/float64(maxFreq))
}
// TermFrequencies gives the TermFrequency of
// all words in a document, and is more efficient
// at doing so than calling that function multiple
// times
func TermFrequencies(document []string) Frequencies {
	// tally occurrences of each distinct token
	counts := make(map[string]int)
	for _, token := range document {
		counts[token]++
	}

	// locate the highest raw count, used to normalize
	var peak int
	for _, c := range counts {
		if c > peak {
			peak = c
		}
	}

	// emit one augmented-frequency record per distinct word
	frequencies := make(Frequencies, 0, len(counts))
	for word, c := range counts {
		frequencies = append(frequencies, Frequency{
			Word:      word,
			Frequency: 0.5 * (1 + float64(c)/float64(peak)),
		})
	}
	return frequencies
}
// InverseDocumentFrequency returns the 'uniqueness'
// of a word within the corpus defined within a
// trained NaiveBayes model.
//
// Look at the TFIDF docs to see more about how
// this is calculated
func (t *TFIDF) InverseDocumentFrequency(word string) float64 {
	// lookup error deliberately ignored: a missing word
	// yields the zero value, i.e. DocsSeen == 0
	entry, _ := t.Words.Get(word)

	totalDocs := float64(t.DocumentCount)
	seenDocs := float64(entry.DocsSeen) + 1
	return math.Log(totalDocs) - math.Log(seenDocs)
}