-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.ts
149 lines (124 loc) · 3.94 KB
/
parser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import {isPunctuation, isSymbol, Root} from "nlcst-types";
import * as NODE_TYPE from "./node";
import stringWidth from "string-width";
import {Node, Parent} from "unist";
import {isHyphen, punctuationNodeSetMeta} from "./punctuation";
const Parser = require("parse-english");
const modifyChildren = require("unist-util-modify-children");
const visit = require("unist-util-visit");
const toString = require("nlcst-to-string");
// Punctuation values after which addSentenceMeta closes the current sentence
// and starts a new one. Matched with `indexOf` against a punctuation node's
// whole `value`.
const NEWLINE_CHARS = "。!?";
/**
 * Parser constructor that delegates construction to `Parser`.
 *
 * It can be invoked with or without `new`: a plain call re-enters itself
 * through `new` and returns the resulting instance; a construct call forwards
 * `this` and every received argument to `Parser`.
 */
// @ts-ignore
function ParseChinese(doc, file) {
    // @ts-ignore
    if (this instanceof ParseChinese) {
        // Already constructing: let the base parser initialise this instance.
        // @ts-ignore
        Parser.apply(this, arguments);
    } else {
        // Called as a plain function: build a proper instance instead.
        // @ts-ignore
        return new ParseChinese(doc, file);
    }
}
// Expose the constructor as the CommonJS export of this module.
module.exports = ParseChinese;
// ParserPrototype is an empty constructor sharing Parser's prototype, so an
// instance of it inherits from Parser.prototype without running Parser itself.
// The function declaration below is hoisted, which is why it can be assigned
// to before it appears in the file.
ParserPrototype.prototype = Parser.prototype;
function ParserPrototype() {
}
// @ts-ignore
let proto = new ParserPrototype();
// Every ParseChinese instance inherits from `proto`, and through it from
// Parser.prototype.
ParseChinese.prototype = proto;
// Put addSentenceMeta (wrapped by modifyChildren) in front of the
// tokenizeRootPlugins list that `proto` inherits, storing the combined list
// as an own property of `proto`.
proto.tokenizeRootPlugins = [
modifyChildren(addSentenceMeta)
].concat(proto.tokenizeRootPlugins);
/**
 * Re-splits sentences on the punctuation listed in NEWLINE_CHARS and
 * annotates sentence, word, text and punctuation nodes with metadata.
 * The tree is modified in place; nothing is returned.
 *
 * Pass 1 (every paragraph): each sentence is cut after every punctuation node
 * whose value occurs in NEWLINE_CHARS, every word node gets an `isFull` flag,
 * and a trailing fragment that `isShortCode` recognises is left out of the
 * rebuilt paragraph.
 *
 * Pass 2 (every sentence): text nodes get `isFull`, punctuation nodes are
 * passed to `punctuationNodeSetMeta`, a hyphen next to a `<` / `>` symbol is
 * folded into a single `<-` / `->` symbol, and the sentence receives `isFull`
 * plus `index.punctuation` (the count of its direct punctuation children).
 *
 * @param tree Node whose paragraphs and sentences are rewritten.
 */
function addSentenceMeta(tree: Root) {
    visit(tree, NODE_TYPE.Paragraph, (parent: any) => {
        const children = [];
        for (const node of parent.children) {
            if (node.type !== NODE_TYPE.Sentence) {
                // Non-sentence children are carried over untouched.
                children.push(node);
                continue;
            }
            let sentenceChildren = [];
            for (const sentence_node of node.children) {
                sentenceChildren.push(sentence_node);
                // Chinese sentence break: close the sentence right after the
                // terminating punctuation node.
                if (
                    sentence_node.type === NODE_TYPE.Punctuation &&
                    NEWLINE_CHARS.indexOf(sentence_node.value) > -1
                ) {
                    children.push(createNewSentence(sentenceChildren));
                    sentenceChildren = [];
                }
                if (sentence_node.type === NODE_TYPE.Word) {
                    const v = toString(sentence_node);
                    // True when the string's length differs from the width
                    // reported by string-width (presumably: the word contains
                    // full-width characters).
                    sentence_node.isFull = v.length !== stringWidth(v);
                }
            }
            if (sentenceChildren.length > 0) {
                // Whatever is left after the last break forms its own
                // sentence, unless it is a short code.
                const sen = createNewSentence(sentenceChildren);
                if (!isShortCode(sen)) {
                    children.push(sen);
                }
            }
        }
        parent.children = children;
    });
    visit(tree, "SentenceNode", (sentence: any) => {
        sentence.isFull = false;
        sentence.index = {punctuation: 0};
        visit(sentence, ["TextNode", "PunctuationNode"], (node: any, i: number, parent: Parent) => {
            // Index at which traversal must resume after a sibling has been
            // removed; stays undefined when the child list is unchanged.
            let resumeAt: number | undefined;
            if (node.type === "TextNode") {
                node.isFull = node.value.length !== stringWidth(node.value);
            } else if (node.type === "PunctuationNode") {
                punctuationNodeSetMeta(node);
                if (isHyphen(node) && !isStartOrEndInArray(i, parent.children)) {
                    const last = parent.children[i - 1];
                    const next = parent.children[i + 1];
                    if (isSymbol(last) && last.value === "<") {
                        // "<" + "-"  =>  "<-": the preceding symbol absorbs the hyphen.
                        last.value = "<-";
                        parent.children.splice(i, 1);
                        resumeAt = i;
                    } else if (isSymbol(next) && next.value === ">") {
                        // "-" + ">"  =>  "->": the following symbol absorbs the
                        // hyphen. The node before the hyphen is unrelated and
                        // must be left alone.
                        next.value = "->";
                        parent.children.splice(i, 1);
                        resumeAt = i;
                    }
                }
            }
            // After a removal the next sibling now sits at `i`; returning the
            // index makes the traversal continue there instead of skipping it.
            return resumeAt;
        });
        for (const node of sentence.children) {
            if (node.type === NODE_TYPE.Word) {
                // A sentence is "full" as soon as one of its words is.
                sentence.isFull = node.isFull || sentence.isFull;
            }
            if (node.type === NODE_TYPE.Punctuation) {
                sentence.index.punctuation += 1;
            }
        }
    });
}
/**
 * Reports whether a sentence is delimited like a short code: among its direct
 * children, the first punctuation node has the value `{{` and the last
 * punctuation node has the value `}}`.
 *
 * @param sen Sentence whose direct children are inspected.
 * @returns `true` only when both delimiters are found in those positions.
 */
function isShortCode(sen: Parent) {
    // Only the punctuation children matter; keep them in document order.
    const marks = sen.children.filter((child) => isPunctuation(child));
    const opening = marks[0];
    const closing = marks[marks.length - 1];

    if (!isPunctuation(opening) || opening.value !== "{{") {
        return false;
    }
    return isPunctuation(closing) && closing.value === "}}";
}
/**
 * Tells whether index `i` addresses the first or the last slot of `children`.
 *
 * @param i Index to test.
 * @param children Array the index refers to.
 * @returns `true` when `i` is `0` or equals `children.length - 1`.
 */
export const isStartOrEndInArray = (i: number, children: Node[]) => {
    if (i === 0) {
        return true;
    }
    const lastIndex = children.length - 1;
    return i === lastIndex;
};
/**
 * Wraps a run of nodes in a new `SentenceNode`.
 *
 * The sentence's position runs from the start of the first child to the end
 * of the last child. It is `undefined` when either of those children carries
 * no position, and also when `children` is empty (previously an empty array
 * caused a TypeError while reading `position` of `undefined`).
 *
 * @param children Nodes that become the sentence's children (used as-is, not copied).
 * @returns The new sentence node.
 */
function createNewSentence(children: Node[]): Parent {
    const start = children[0];
    const end = children[children.length - 1];
    // `start` / `end` are undefined for an empty array, so test them before
    // touching their `position`.
    const position = (start && end && start.position && end.position) ? {
        start: start.position.start,
        end: end.position.end
    } : undefined;
    return {
        type: "SentenceNode",
        children: children,
        position: position,
    };
}