-
Notifications
You must be signed in to change notification settings - Fork 7
/
minimum_matching.go
82 lines (59 loc) · 1.51 KB
/
minimum_matching.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package gotokenizer
import (
"strings"
)
// MinMatch records dict and dictPath
type MinMatch struct {
	// dict is the loaded dictionary; nil until LoadDict is called.
	dict *Dict
	// dictPath is the path the dictionary is read from by LoadDict.
	dictPath string
}
// NewMinMatch returns a newly initialized MinMatch object.
// The dictionary is not loaded here; call LoadDict before tokenizing.
func NewMinMatch(dictPath string) *MinMatch {
	mm := new(MinMatch)
	mm.dictPath = dictPath
	return mm
}
// LoadDict loads dict that implements the Tokenizer interface.
// It builds a Dict from mm.dictPath, stores it on the receiver, and
// returns whatever error loading produced.
func (mm *MinMatch) LoadDict() error {
	d := NewDict(mm.dictPath)
	mm.dict = d
	return d.Load()
}
// Get returns segmentation that implements the Tokenizer interface.
//
// It scans the trimmed text left to right, probing candidate words of
// growing rune length (starting at 2) against the dictionary. When no
// dictionary word can be matched at the current position, a single
// rune is emitted as its own token and scanning continues after it.
func (mm *MinMatch) Get(text string) ([]string, error) {
	CheckDictIsLoaded(mm.dict)

	var result []string
	// Convert to runes exactly once. The previous version recomputed
	// []rune(text) on every access inside both loops, which made the
	// scan quadratic in the input length.
	runes := []rune(strings.Trim(text, " "))
	startLen := 2
	for len(runes) > 0 {
		// Bug fix: previously runes[0:startLen] panicked with
		// "slice bounds out of range" whenever the remaining text was
		// shorter than startLen (e.g. a single trailing rune after
		// startLen had been reset to 2). Clamp before slicing.
		if startLen > len(runes) {
			startLen = len(runes)
		}
		word := string(runes[:startLen])
		found := false
		for !found {
			if startLen == mm.dict.maxLen || startLen == len(runes) || len(runes) == 1 {
				// Give up on a multi-rune match: emit one rune.
				// NOTE(review): a candidate spanning the entire
				// remaining text is never looked up in the dict; kept
				// as-is to preserve the original behavior.
				startLen = 1
				word = string(runes[:1])
				break
			}
			if _, ok := mm.dict.Records[word]; !ok {
				// Not in the dictionary: try one rune longer. The old
				// "startLen > len" break here was unreachable (the
				// guard above fires first), so it has been removed.
				startLen++
				word = string(runes[:startLen])
			} else {
				startLen = 2
				found = true
			}
		}
		result = append(result, word)
		runes = runes[len([]rune(word)):]
	}
	return result, nil
}
// GetFrequency returns token frequency that implements the Tokenizer interface.
// It tokenizes text via Get and counts occurrences of each token.
func (mm *MinMatch) GetFrequency(text string) (map[string]int, error) {
	tokens, err := mm.Get(text)
	if err != nil {
		return nil, err
	}
	return GetFrequency(tokens), nil
}