forked from pterhx/tabbr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglossary.js
86 lines (70 loc) · 1.83 KB
/
glossary.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/**
 * Creates a glossary extractor configured by `opts`.
 *
 * Missing options are filled in from the defaults below. The caller's `opts`
 * object is never modified; the merged result is stored on `this.opts`.
 *
 * @param {Object} [opts] Extraction options.
 * @param {number} [opts.minFreq=2] Minimum number of occurrences a term needs
 *     in order to be returned by `extract`.
 * @param {boolean} [opts.collapse=false] When true, `extract` drops terms
 *     whose normalized form is contained in another term's normalized form.
 * @param {Array} [opts.blacklist=[]] Entries whose normalized, lower-cased
 *     form must not appear inside a returned term.
 * @param {boolean} [opts.verbose=true] When false, `extract` returns only the
 *     words instead of full term records.
 */
function Glossary(opts) {
  // _.defaults writes into its first argument, so merge into a fresh object
  // rather than into the object the caller handed us.
  this.opts = _.defaults({}, opts || {}, {
    minFreq: 2,
    collapse: false,
    blacklist: [],
    verbose: true
  });
}
/**
 * Returns the canonical form of `word` used as the key when counting terms.
 *
 * Delegates to `word.singularize()`. That method is not part of standard
 * JavaScript strings, so it is presumably added to String.prototype by an
 * inflection library loaded elsewhere (TODO confirm).
 *
 * @param {string} word Word to normalize.
 * @returns {*} Whatever `word.singularize()` returns.
 */
function normalize(word) {
  var singular = word.singularize();
  return singular;
}
/**
 * Extracts candidate glossary terms from `text`.
 *
 * The text is lexed and part-of-speech tagged. A run of terms starts at a
 * noun (tag beginning with "N") or at a capitalised adjective (tag "JJ"), and
 * continues through the nouns that follow; the first non-noun ends the run.
 * Every word collected this way is counted under its normalized form.
 *
 * The counted terms are then filtered according to `this.opts`:
 *   - `minFreq`: terms seen fewer times are dropped;
 *   - `collapse`: terms whose normalized form is contained in another term's
 *     normalized form are dropped;
 *   - `blacklist`: terms whose normalized form contains a normalized
 *     blacklist entry (compared case-insensitively) are dropped;
 *   - `verbose`: when false, only the words are returned.
 *
 * @param {string} text Text to analyse.
 * @returns {Array} Term records `{count, norm, word}`, where `word` is the
 *     first surface form seen for `norm`; or an array of those `word` values
 *     when `opts.verbose` is false.
 */
Glossary.prototype.extract = function(text) {
  var opts = this.opts;
  var tags = new POSTagger().tag(new Lexer().lex(text));

  // Prototype-less map: with a plain {} a word such as "constructor" or
  // "toString" would hit the member inherited from Object.prototype instead
  // of starting a fresh counter.
  var counts = Object.create(null);

  function add(word) {
    var norm = normalize(word);
    if (!counts[norm]) {
      counts[norm] = {
        count: 0,
        norm: norm,
        word: word
      };
    }
    counts[norm].count++;
  }

  // `searching` is true while we are looking for the start of a run.
  var searching = true;
  for (var i = 0; i < tags.length; i++) {
    var word = tags[i][0];
    var tag = tags[i][1];
    var isNoun = tag.indexOf('N') === 0;
    var isAdj = tag === 'JJ';

    if (searching) {
      if (isNoun || (isAdj && /^[A-Z]/.test(word))) {
        searching = false;
        add(word);
      }
    } else if (isNoun) {
      add(word);
    } else {
      // Any non-noun terminates the current run.
      searching = true;
    }
  }

  var terms = _(counts).select(function(term) {
    return term.count >= opts.minFreq;
  });

  if (opts.collapse) {
    // Compare every term against the full pre-collapse list.
    var candidates = terms;
    terms = _(candidates).reject(function(term) {
      return _(candidates).any(function(other) {
        return term.word !== other.word
          && other.norm.indexOf(term.norm) >= 0;
      });
    });
  }

  if (opts.blacklist) {
    // Normalize the blacklist once instead of once per term.
    var banned = _(opts.blacklist).map(function(entry) {
      return normalize(entry).toLowerCase();
    });
    terms = _(terms).reject(function(term) {
      var norm = term.norm.toLowerCase();
      return _(banned).any(function(entry) {
        return norm.indexOf(entry) >= 0;
      });
    });
  }

  if (!opts.verbose) {
    terms = _(terms).pluck('word');
  }
  return terms;
};
/**
 * Factory for Glossary instances.
 *
 * @param {Object} [opts] Options passed straight through to `Glossary`.
 * @returns {Glossary} A new extractor.
 */
var createGlossary = function(opts) {
  var instance = new Glossary(opts);
  return instance;
};

// Shared instance built with the default options. Its `extract` method is
// exposed on the factory, bound to that instance, so callers can use
// `createGlossary.extract(text)` without constructing anything themselves.
var glossary = new Glossary();
createGlossary.extract = _(glossary.extract).bind(glossary);