-
Notifications
You must be signed in to change notification settings - Fork 7
/
bidirectional_maximum_matching.go
90 lines (72 loc) · 2.16 KB
/
bidirectional_maximum_matching.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package gotokenizer
import (
"strings"
)
// BiDirectionalMaxMatch is a tokenizer that segments text with both a forward
// (MM) and a reverse (RMM) maximum-matching tokenizer and uses a bigram
// dictionary to score the two segmentations against each other.
type BiDirectionalMaxMatch struct {
// dict is the word dictionary; Get assigns it to both MM and RMM before
// segmenting.
dict *Dict
// dictPath is the path the word dictionary and both matchers are created from.
dictPath string
// bigramDictPath is the path the bigram dictionary is created from.
bigramDictPath string
// bigramDic supplies the bigram records and the maxF normaliser that Get
// reads when scoring adjacent token pairs.
bigramDic *BigramDict
// MMScore is the bigram score Get maintains for the forward segmentation.
MMScore float64
// RMMScore is the bigram score Get maintains for the reverse segmentation.
RMMScore float64
// MM is the forward maximum-matching tokenizer.
MM *MaxMatch
// RMM is the reverse maximum-matching tokenizer.
RMM *ReverseMaxMatch
}
// NewBiDirectionalMaxMatch builds a BiDirectionalMaxMatch whose word
// dictionary and forward/reverse matchers are created from dictPath and whose
// bigram dictionary is created from bigramDictPath. Both scores start at zero.
func NewBiDirectionalMaxMatch(dictPath, bigramDictPath string) *BiDirectionalMaxMatch {
	// Construct the collaborators first, in the same order as before.
	var (
		words    = NewDict(dictPath)
		bigrams  = NewBigramDict(bigramDictPath)
		forward  = NewMaxMatch(dictPath)
		backward = NewReverseMaxMatch(dictPath)
	)

	matcher := new(BiDirectionalMaxMatch)
	matcher.dictPath, matcher.bigramDictPath = dictPath, bigramDictPath
	matcher.dict, matcher.bigramDic = words, bigrams
	matcher.MM, matcher.RMM = forward, backward
	return matcher
}
// LoadDict loads the word dictionary and then the bigram dictionary. It
// implements the Tokenizer interface.
//
// The first load failure is returned to the caller instead of being dropped;
// if the word dictionary fails to load, the bigram dictionary is not attempted.
func (bdmm *BiDirectionalMaxMatch) LoadDict() error {
	if err := bdmm.dict.Load(); err != nil {
		return err
	}
	if err := bdmm.bigramDic.Load(); err != nil {
		return err
	}
	return nil
}
// Get segments text with both forward and reverse maximum matching and
// returns whichever segmentation scores higher against the bigram dictionary.
// It implements the Tokenizer interface.
//
// Leading and trailing spaces are trimmed from text first. MMScore and
// RMMScore are overwritten with the scores of this call. On a tie the forward
// segmentation is returned. An error from either matcher is returned
// unchanged together with a nil slice.
func (bdmm *BiDirectionalMaxMatch) Get(text string) ([]string, error) {
	text = strings.Trim(text, " ")

	// Both matchers segment against this tokenizer's dictionary.
	bdmm.MM.dict = bdmm.dict
	fmmResult, err := bdmm.MM.Get(text)
	if err != nil {
		return nil, err
	}

	bdmm.RMM.dict = bdmm.dict
	bmmResult, err := bdmm.RMM.Get(text)
	if err != nil {
		return nil, err
	}

	// Assign rather than accumulate: scores carried over from earlier calls
	// would otherwise bias the choice made for this text.
	bdmm.MMScore = bdmm.bigramScore(fmmResult)
	bdmm.RMMScore = bdmm.bigramScore(bmmResult)

	if bdmm.RMMScore > bdmm.MMScore {
		return bmmResult, nil
	}
	return fmmResult, nil
}

// bigramScore sums, for every adjacent token pair present in the bigram
// dictionary, the pair's recorded value divided by the dictionary's maxF.
func (bdmm *BiDirectionalMaxMatch) bigramScore(tokens []string) float64 {
	var score float64
	for i := 0; i+1 < len(tokens); i++ {
		key := tokens[i] + ":" + tokens[i+1]
		if val, ok := bdmm.bigramDic.records[key]; ok {
			score += float64(val) / float64(bdmm.bigramDic.maxF)
		}
	}
	return score
}
// GetFrequency segments text with Get and passes the resulting tokens to the
// package-level GetFrequency to build the frequency map. It implements the
// Tokenizer interface. If Get fails, its error is returned with a nil map.
func (bdmm *BiDirectionalMaxMatch) GetFrequency(text string) (map[string]int, error) {
	tokens, segErr := bdmm.Get(text)
	if segErr == nil {
		return GetFrequency(tokens), nil
	}
	return nil, segErr
}