This repository has been archived by the owner on Oct 26, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 14
/
stopwords_test.go
96 lines (83 loc) · 1.79 KB
/
stopwords_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
package swan
import (
"fmt"
"io/ioutil"
"reflect"
"strings"
"testing"
"github.com/PuerkitoBio/goquery"
)
// TestDetectLang checks language detection against one fixture per entry
// yielded by ranging over stopwords (presumably keyed by language code —
// confirm against its declaration). For each entry l it reads
// test_data/stopwords/<l>.txt, parses the content with goquery, runs the
// precleanup and cleanup steps on the resulting Article, and then requires
// that the fixture path (without its ".txt" extension) ends with the
// language reported by detectLang.
func TestDetectLang(t *testing.T) {
	t.Parallel()

	for l := range stopwords {
		path := fmt.Sprintf("test_data/stopwords/%s.txt", l)

		txt, err := ioutil.ReadFile(path)
		if err != nil {
			t.Fatalf("failed to read %s: %s", path, err)
		}

		doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(txt)))
		if err != nil {
			t.Fatalf("failed to create doc: %s", err)
		}

		a := &Article{
			Doc: doc,
		}

		// Check each step on its own so that a precleanup failure is
		// reported instead of being overwritten by cleanup's result.
		err = precleanup{}.run(a)
		if err != nil {
			t.Fatalf("failed to preclean doc: %s", err)
		}

		err = cleanup{}.run(a)
		if err != nil {
			t.Fatalf("failed to clean doc: %s", err)
		}

		lang := detectLang(a)

		// Strip only the trailing extension so the path ends with the
		// fixture's language name.
		// NOTE(review): strings.HasSuffix is true for an empty suffix, so an
		// empty lang would pass this check — confirm whether detectLang can
		// return "".
		base := strings.TrimSuffix(path, ".txt")
		if !strings.HasSuffix(base, lang) {
			t.Fatalf("incorrect language detected for %s: %s", base, lang)
		}
	}
}
// TestSplitText is a table-driven test for splitText. Each case pairs an
// input string with the exact slice of words splitText must return; the
// inputs cover a detached trailing period, an apostrophe inside a word,
// leading and trailing spaces, ASCII punctuation, runs of en dashes, and
// non-ASCII runes. The test aborts on the first mismatch.
func TestSplitText(t *testing.T) {
	t.Parallel()

	cases := []struct {
		input string
		want  []string
	}{
		{
			input: "there once was a boy .",
			want:  []string{"there", "once", "was", "a", "boy"},
		},
		{
			input: "the boy's hat was green",
			want:  []string{"the", "boy's", "hat", "was", "green"},
		},
		{
			input: "spaces. ",
			want:  []string{"spaces"},
		},
		{
			input: " more spaces. ",
			want:  []string{"more", "spaces"},
		},
		{
			input: "punct: everywhere!",
			want:  []string{"punct", "everywhere"},
		},
		{
			input: "test – test",
			want:  []string{"test", "test"},
		},
		{
			input: "test ––––– test",
			want:  []string{"test", "test"},
		},
		{
			input: "test –– ü – ☃ –– test",
			want:  []string{"test", "ü", "☃", "test"},
		},
	}

	for i := range cases {
		got := splitText(cases[i].input)
		if reflect.DeepEqual(got, cases[i].want) {
			continue
		}
		// Report the actual result first, then the expectation.
		t.Fatalf("%#v != %#v", got, cases[i].want)
	}
}