-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathChineseSegmentation.ts
70 lines (51 loc) · 1.71 KB
/
ChineseSegmentation.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/**
 * Splits text into words using the jieba-wasm segmenter.
 *
 * - When `fineGrained` is false, the result of `jieba.cut(text, useHMM)` is returned as is.
 * - When `fineGrained` is true, the text is tokenized with
 *   `jieba.tokenize(text, 'search', useHMM)` and the reported token spans are reduced
 *   to a sequence of non-overlapping substrings of `text`, one per distinct token
 *   start offset, ordered by that offset.
 *
 * NOTE(review): token offsets are used directly as `String.prototype.substring`
 * indices (UTF-16 code units). Confirm that jieba-wasm reports `start` / `end` in the
 * same units for text that contains surrogate pairs.
 *
 * @param text Text to segment.
 * @param fineGrained If true, derive words from 'search' mode token spans instead of `cut`.
 * @param useHMM Passed through to the jieba calls.
 * @returns The list of words.
 */
export async function splitChineseTextToWords_Jieba(text: string, fineGrained = false, useHMM = true) {
	const jieba = await getJiebaWasmInstance()

	if (!fineGrained) {
		return jieba.cut(text, useHMM)
	}

	const results = jieba.tokenize(text, 'search', useHMM)

	// Collect the distinct start and end offsets of all reported tokens
	const startOffsetsSet = new Set<number>()
	const endOffsetsSet = new Set<number>()

	for (const result of results) {
		startOffsetsSet.add(result.start)
		endOffsetsSet.add(result.end)
	}

	const startOffsets = Array.from(startOffsetsSet).sort((a, b) => a - b)
	const endOffsets = Array.from(endOffsetsSet).sort((a, b) => a - b)

	// Finds where a word starting at `wordStartOffset` should end, given that the
	// following word starts at `nextWordStartOffset`.
	const getWordEndOffset = (wordStartOffset: number, nextWordStartOffset: number) => {
		for (let j = 0; j < endOffsets.length - 1; j++) {
			const currentEndOffset = endOffsets[j]
			const nextEndOffset = endOffsets[j + 1]

			if (currentEndOffset >= nextWordStartOffset) {
				// A token ends at, or beyond, the start of the next word:
				// cut the current word where the next one starts
				return nextWordStartOffset
			}

			// From here on, currentEndOffset < nextWordStartOffset
			if (currentEndOffset > wordStartOffset && nextEndOffset > nextWordStartOffset) {
				// This is the last token end that falls inside the current word,
				// strictly before the next word's start: end the word there
				return currentEndOffset
			}
		}

		// No earlier end offset decided the boundary, so only the final end offset remains.
		// It must not be returned unconditionally: when it lies beyond the next word's
		// start, the current word would run to the end of the tokenized text and overlap
		// every following word. Cap it at the start of the next word.
		return Math.min(nextWordStartOffset, endOffsets[endOffsets.length - 1])
	}

	const words: string[] = []

	for (let i = 0; i < startOffsets.length; i++) {
		const wordStartOffset = startOffsets[i]
		const isLastWord = i === startOffsets.length - 1

		// The last word extends up to the largest reported end offset
		const wordEndOffset = isLastWord
			? endOffsets[endOffsets.length - 1]
			: getWordEndOffset(wordStartOffset, startOffsets[i + 1])

		words.push(text.substring(wordStartOffset, wordEndOffset))
	}

	return words
}
// Module-level cache for the dynamically imported 'jieba-wasm' module.
// Stays unset until the first call to `getJiebaWasmInstance`.
let JiebaWasmInstance: typeof import('jieba-wasm')

/**
 * Returns the 'jieba-wasm' module, importing it dynamically on first use
 * and reusing the cached module object on later calls.
 */
async function getJiebaWasmInstance() {
	if (JiebaWasmInstance) {
		return JiebaWasmInstance
	}

	JiebaWasmInstance = await import('jieba-wasm')

	return JiebaWasmInstance
}