-
Notifications
You must be signed in to change notification settings - Fork 7
/
reverse_minimum_matching.go
82 lines (63 loc) · 1.7 KB
/
reverse_minimum_matching.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package gotokenizer
import (
"strings"
)
// ReverseMinMatch is a tokenizer that segments text with the reverse minimum
// matching strategy. It holds the dictionary used for lookups together with
// the path that dictionary is created from.
type ReverseMinMatch struct {
// dict is the dictionary consulted by Get; it is nil until LoadDict is called.
dict *Dict
// dictPath is the path handed to NewDict when LoadDict runs.
dictPath string
}
// NewReverseMinMatch creates a ReverseMinMatch that will read its dictionary
// from dictPath. The dictionary itself is not loaded here; call LoadDict
// before using the tokenizer.
func NewReverseMinMatch(dictPath string) *ReverseMinMatch {
	tokenizer := new(ReverseMinMatch)
	tokenizer.dictPath = dictPath
	return tokenizer
}
// LoadDict builds a dictionary for the configured dictPath, stores it on the
// receiver and loads it, returning whatever error the load reports.
// It is part of the Tokenizer interface.
func (rmm *ReverseMinMatch) LoadDict() error {
	dict := NewDict(rmm.dictPath)
	// Store the dictionary before loading so the receiver refers to it even
	// when Load fails.
	rmm.dict = dict
	return dict.Load()
}
// Get segments text with reverse minimum matching and returns the tokens in
// their original left-to-right order. It is part of the Tokenizer interface.
//
// Leading and trailing spaces are trimmed first. The remaining text is then
// consumed from its end: a candidate suffix (initially two runes long) grows
// one rune at a time until it is found in the dictionary. When the candidate
// would leave exactly one rune in front of it, the search gives up and emits
// the last rune on its own instead. When only two runes remain they are
// always emitted together as one token.
//
// A text that trims to a single rune is returned as a single token. Empty
// input yields no tokens. The returned error is always nil.
//
// CheckDictIsLoaded is invoked before any work is done; how it reacts to a
// dictionary that has not been loaded is defined outside this function.
func (rmm *ReverseMinMatch) Get(text string) ([]string, error) {
	CheckDictIsLoaded(rmm.dict)

	// Decode to runes once and shrink the slice as tokens are consumed,
	// rather than re-converting the remaining text on every access.
	runes := []rune(strings.Trim(text, " "))

	var result []string
	if len(runes) == 1 {
		// A lone rune is shorter than the two-rune starting window used
		// below and would otherwise be dropped silently.
		result = append(result, string(runes))
		runes = runes[:0]
	}

	// startLen is the candidate suffix length in runes. It deliberately
	// carries over between iterations of the outer loop: it is reset to 2
	// after a dictionary hit and to 1 after a single-rune fallback.
	startLen := 2
	for startLen <= len(runes) {
		n := len(runes)
		word := string(runes[n-startLen:])
		for {
			if n == 2 {
				// Only two runes remain: treat them as one candidate.
				word = string(runes)
			} else if startLen == n-1 {
				// Growing further would strand one leading rune, so emit
				// the final rune alone and restart the search at length 1.
				startLen = 1
				word = string(runes[n-1:])
				break
			}
			if _, ok := rmm.dict.Records[word]; ok {
				startLen = 2
				break
			}
			startLen++
			if startLen > n {
				// Nothing matched; word still holds the last candidate.
				break
			}
			word = string(runes[n-startLen:])
		}
		result = append(result, word)
		runes = runes[:n-len([]rune(word))]
	}

	// Tokens were collected from the end of the text backwards.
	return Reverse(result), nil
}
// GetFrequency tokenizes text with Get and returns the package-level
// GetFrequency result for those tokens, keyed by token. If Get fails, the
// error is returned with a nil map. It is part of the Tokenizer interface.
func (rmm *ReverseMinMatch) GetFrequency(text string) (map[string]int, error) {
	tokens, err := rmm.Get(text)
	if err != nil {
		return nil, err
	}
	freq := GetFrequency(tokens)
	return freq, nil
}