This repository has been archived by the owner on May 14, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 164
/
tag_test.go
113 lines (96 loc) · 3.14 KB
/
tag_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
package prose
import (
"encoding/json"
"fmt"
"path/filepath"
"reflect"
"testing"
)
// makeTagger creates a Document from text via NewDocument, passing
// WithSegmentation(false) and WithExtraction(false) as options. The
// document and error produced by NewDocument are returned unchanged.
func makeTagger(text string) (*Document, error) {
	doc, err := NewDocument(text, WithSegmentation(false), WithExtraction(false))
	return doc, err
}
// ExampleReadTagged prints the result of calling ReadTagged on a short
// "word|TAG" string using "|" as the separator. The expected output shows
// the words and their tags collected into two parallel lists.
func ExampleReadTagged() {
	const sep = "|"
	input := "Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS"
	fmt.Println(ReadTagged(input, sep))
	// Output: [[[Pierre Vinken , 61 years] [NNP NNP , CD NNS]]]
}
// TestTagSimple tags a single sentence and checks that the sequence of
// token tags matches the expected part-of-speech tags exactly.
func TestTagSimple(t *testing.T) {
	doc, err := makeTagger("Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.")
	if err != nil {
		// Fail through the testing framework instead of panicking so the
		// remaining tests in the package still run and report.
		t.Fatalf("makeTagger() error = %v", err)
	}

	want := []string{
		"NNP", "NNP", ",", "CD", "NNS", "JJ", ",", "MD", "VB", "DT", "NN",
		"IN", "DT", "JJ", "NN", "NNP", "CD", "."}

	got := []string{}
	for _, tok := range doc.Tokens() {
		got = append(got, tok.Tag)
	}

	if !reflect.DeepEqual(want, got) {
		t.Errorf("TagSimple() got = %v, want = %v", got, want)
	}
}
// TestTagTreebank runs the perceptron tagger over the tokens loaded from
// treebank_tokens.json and compares each resulting tag with the matching
// entry in treebank_tags.json. The test fails if the fraction of matching
// tags drops below 0.957477.
func TestTagTreebank(t *testing.T) {
	tagger := newPerceptronTagger()
	tokens, expected := []*Token{}, []string{}

	tags := readDataFile(filepath.Join(testdata, "treebank_tags.json"))
	checkError(json.Unmarshal(tags, &expected))

	treebank := readDataFile(filepath.Join(testdata, "treebank_tokens.json"))
	checkError(json.Unmarshal(treebank, &tokens))

	// An empty fixture would make the accuracy NaN, and NaN compares false
	// against the threshold, so the test would pass without checking anything.
	if len(expected) == 0 {
		t.Fatal("TagTreebank() expected tags fixture is empty")
	}

	tagged := tagger.tag(tokens)
	// Guard the expected[i] lookup below and make sure the accuracy
	// denominator describes the same set of tokens that were tagged.
	if len(tagged) != len(expected) {
		t.Fatalf("TagTreebank() got %d tagged tokens, expected %d",
			len(tagged), len(expected))
	}

	correct := 0.0
	for i, tok := range tagged {
		if expected[i] == tok.Tag {
			correct++
		}
	}

	v := correct / float64(len(expected))
	if v < 0.957477 {
		t.Errorf("TagTreebank() expected >= 0.957477, got = %v", v)
	}
}
// BenchmarkTag measures how long the perceptron tagger takes to tag the
// tokens loaded from treebank_tokens.json.
func BenchmarkTag(b *testing.B) {
	tagger := newPerceptronTagger()
	tokens := []*Token{}
	treebank := readDataFile(filepath.Join(testdata, "treebank_tokens.json"))
	checkError(json.Unmarshal(treebank, &tokens))

	// Exclude tagger construction and fixture loading/decoding from the
	// measurement so only the tag calls are timed.
	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = tagger.tag(tokens)
	}
}
/* TODO: POS training API
var wsj = "Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD " +
"join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN " +
"Nov.|NNP 29|CD .|.\nMr.|NNP Vinken|NNP is|VBZ chairman|NN of|IN " +
"Elsevier|NNP N.V.|NNP ,|, the|DT Dutch|NNP publishing|VBG " +
"group|NN .|. Rudolph|NNP Agnew|NNP ,|, 55|CD years|NNS old|JJ " +
"and|CC former|JJ chairman|NN of|IN Consolidated|NNP Gold|NNP " +
"Fields|NNP PLC|NNP ,|, was|VBD named|VBN a|DT nonexecutive|JJ " +
"director|NN of|IN this|DT British|JJ industrial|JJ conglomerate|NN " +
".|.\nA|DT form|NN of|IN asbestos|NN once|RB used|VBN to|TO make|VB " +
"Kent|NNP cigarette|NN filters|NNS has|VBZ caused|VBN a|DT high|JJ " +
"percentage|NN of|IN cancer|NN deaths|NNS among|IN a|DT group|NN " +
"of|IN workers|NNS exposed|VBN to|TO it|PRP more|RBR than|IN " +
"30|CD years|NNS ago|IN ,|, researchers|NNS reported|VBD .|."
func TestTrain(t *testing.T) {
sentences := ReadTagged(wsj, "|")
iter := random(5, 20)
tagger.Train(sentences, iter)
tagSet := []string{}
nrWords := 0
for _, tuple := range sentences {
nrWords += len(tuple[0])
for _, tag := range tuple[1] {
if !util.StringInSlice(tag, tagSet) {
tagSet = append(tagSet, tag)
}
}
}
assert.Equal(t, nrWords*iter, int(tagger.model.instances))
assert.Subset(t, tagger.Classes(), tagSet)
}
func random(min, max int) int {
rand.Seed(time.Now().Unix())
return rand.Intn(max-min) + min
}*/