-
Notifications
You must be signed in to change notification settings - Fork 7
/
bidirectional_minimum_matching.go
90 lines (72 loc) · 2.16 KB
/
bidirectional_minimum_matching.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package gotokenizer
import (
"strings"
)
// BiDirectionalMinMatch holds the state used for bidirectional minimum
// matching: Get runs both MM and RMM over the input and uses the bigram
// dictionary to choose between their two segmentations.
type BiDirectionalMinMatch struct {
	// dict is the word dictionary built from dictPath; Get hands it to both
	// MM and RMM before segmenting.
	dict *Dict
	// dictPath is the path the word dictionary is created from.
	dictPath string
	// bigramDictPath is the path the bigram dictionary is created from.
	bigramDictPath string
	// bigramDic supplies the pair frequencies (records) and the normalizing
	// value (maxF) that Get uses for scoring.
	bigramDic *BigramDict
	// MMScore is written by Get: the bigram score of the MM segmentation.
	MMScore float64
	// RMMScore is written by Get: the bigram score of the RMM segmentation.
	RMMScore float64
	// MM is the MinMatch segmenter consulted by Get.
	MM *MinMatch
	// RMM is the ReverseMinMatch segmenter consulted by Get.
	RMM *ReverseMinMatch
}
// NewBiDirectionalMinMatch builds a BiDirectionalMinMatch for the given word
// dictionary path and bigram dictionary path. It creates the word dictionary,
// the bigram dictionary, and the MinMatch / ReverseMinMatch segmenters (both
// from dictPath), in that order, and returns the populated value.
func NewBiDirectionalMinMatch(dictPath, bigramDictPath string) *BiDirectionalMinMatch {
	matcher := new(BiDirectionalMinMatch)

	// Word dictionary and its source path.
	matcher.dictPath = dictPath
	matcher.dict = NewDict(dictPath)

	// Bigram dictionary and its source path.
	matcher.bigramDictPath = bigramDictPath
	matcher.bigramDic = NewBigramDict(bigramDictPath)

	// The two single-direction segmenters.
	matcher.MM = NewMinMatch(dictPath)
	matcher.RMM = NewReverseMinMatch(dictPath)

	return matcher
}
// LoadDict loads the word dictionary and then the bigram dictionary. It
// implements the Tokenizer interface and always returns nil.
//
// NOTE(review): anything the two Load calls return is discarded here; confirm
// whether load failures should be surfaced to the caller.
func (bdmm *BiDirectionalMinMatch) LoadDict() error {
	words := bdmm.dict
	words.Load()

	bigrams := bdmm.bigramDic
	bigrams.Load()

	return nil
}
// Get segments text with both MM and RMM and returns the segmentation whose
// adjacent-token pairs score higher against the bigram dictionary. It
// implements the Tokenizer interface.
//
// Leading and trailing spaces are trimmed from text first. MMScore and
// RMMScore are recomputed from zero on every call, so an earlier call never
// influences the choice made for a later text; after a successful call they
// hold the scores of the two segmentations of this text. On a tie the MM
// result is returned. An error from either underlying segmenter is returned
// unchanged together with a nil result.
func (bdmm *BiDirectionalMinMatch) Get(text string) ([]string, error) {
	text = strings.Trim(text, " ")

	// Start from a clean slate: the scores are per-text, not running totals.
	bdmm.MMScore, bdmm.RMMScore = 0, 0

	// Both segmenters work against the dictionary held by bdmm (the one
	// LoadDict loads).
	bdmm.MM.dict = bdmm.dict
	mmResult, err := bdmm.MM.Get(text)
	if err != nil {
		return nil, err
	}
	bdmm.RMM.dict = bdmm.dict
	rmmResult, err := bdmm.RMM.Get(text)
	if err != nil {
		return nil, err
	}

	bdmm.MMScore = bdmm.bigramScore(mmResult)
	bdmm.RMMScore = bdmm.bigramScore(rmmResult)

	// RMM wins only when it is strictly better; ties go to MM.
	if bdmm.RMMScore > bdmm.MMScore {
		return rmmResult, nil
	}
	return mmResult, nil
}

// bigramScore returns the sum, over every adjacent pair in tokens, of the
// pair's frequency in the bigram dictionary divided by maxF. Pairs are looked
// up under the key "left:right"; pairs missing from the dictionary add
// nothing. A zero maxF yields a score of 0 rather than Inf/NaN.
func (bdmm *BiDirectionalMinMatch) bigramScore(tokens []string) float64 {
	maxF := float64(bdmm.bigramDic.maxF)
	if maxF == 0 {
		return 0
	}
	var score float64
	for i := 0; i+1 < len(tokens); i++ {
		if freq, ok := bdmm.bigramDic.records[tokens[i]+":"+tokens[i+1]]; ok {
			score += float64(freq) / maxF
		}
	}
	return score
}
// GetFrequency segments text with Get and returns the frequency map that the
// package-level GetFrequency builds from the resulting tokens. It implements
// the Tokenizer interface. If Get fails, its error is returned with a nil map.
func (bdmm *BiDirectionalMinMatch) GetFrequency(text string) (map[string]int, error) {
	tokens, segErr := bdmm.Get(text)
	if segErr != nil {
		return nil, segErr
	}

	counts := GetFrequency(tokens)
	return counts, nil
}