-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtoken.flex
116 lines (84 loc) · 3.59 KB
/
token.flex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
%{
/* Flex tokeniser and sentence boundary finder for Malay text in plain
   ASCII or Latin-1 (8-bit) format.

   Author: Tim Baldwin, University of Melbourne
   Adapted from code by John Carroll, in turn adapted from code by Guido
   Minnen, Erik Hektoen, and Greg Grefenstette.

   All control chars, as well as space and Latin-1 non-breakable space, are
   regarded as whitespace. This file consists only of ASCII characters.

   NOTE(review): the `sp` and `nl` definitions in this file only name space,
   tab, LF and CR; confirm whether the statement about control characters
   and the non-breakable space still holds.
*/

/* Feed the scanner exactly one byte per refill, read from stdin with
   getchar(); yyin is not consulted.  Sets `result` to YY_NULL at end of
   input, otherwise stores the byte in buf[0] and sets `result` to 1. */
#define YY_INPUT(buf,result,max_size) \
    { \
        int c = getchar(); \
        result = (c == EOF) ? YY_NULL : (buf[0] = c, 1); \
    }

/* Write the current match to yyout and flush immediately.  Wrapped in
   do { } while (0) so that `ECHO;` is a single statement and stays correct
   under an unbraced if/else. */
#define ECHO \
    do { \
        (void) fwrite( yytext, yyleng, 1, yyout ); \
        fflush(yyout); \
    } while (0)

/* tok() is defined in the user-code section but called from rule actions,
   so it needs a declaration before the generated yylex() body. */
int tok(int s);

/* Set by tok() to its argument: 1 when the token just emitted ended a
   sentence, 0 otherwise. */
int sent = 0;

/* Number of newline characters matched since the last call to tok(). */
int nl = 0;
%}
/* Character classes */

/* Upper- and lower-case letters: ASCII plus the Latin-1 ranges
   (0xD7 and 0xF7 are left out here and listed under `symbol`). */
upper [A-Z\xC0-\xD6\xD8-\xDE]
lower [a-z\xDF-\xF6\xF8-\xFF]
/* Lower-case ASCII consonants (w and y are not included). */
lcons [bcdfghj-np-tvxz]
letter [A-Za-z\xC0-\xD6\xD8-\xDE\xDF-\xF6\xF8-\xFF]
/* {lower}|{upper} */
digit [0-9]
/* Printable ASCII punctuation plus Latin-1 0xA1-0xBF, 0xD7 and 0xF7. */
symbol [!-/:-@[-`{-~\xA1-\xBF\xD7\xF7]
/* Any single non-whitespace character. */
ns {letter}|{digit}|{symbol}
sp " "|\t
nl \n|\r
spnl {sp}|{nl}

/* Punctuation that may trail a token.  `trail_nop` is `trail` without the
   period; `trail_mid` and `trail_end` split `trail_nop` into separators and
   closing brackets/quotes. */
trail [!?;:,|/)\]'".]|''
trail_nop [!?;:,|/)\]'"]|''
trail_mid [!?;:,|/]
trail_end [)\]'"]|''
/* Optional trailing punctuation followed by one whitespace character. */
end {trail}*{spnl}
end_nop {trail_nop}*{spnl}

/* Abbreviations */
/* All abbreviations end with a period '.' */

/* Listed abbreviations in their usual capitalisation.  "kump.", "LL.M."
   and "Y.bhg." carry the final period like every other entry; without it
   the period was split off and treated as a sentence end. */
abbrev_mixed "Abd."|"Ab."|"Mohd."|"Md."|"Muhd."|"Bhd."|"Drs."|"Dr."|"Dt."|"Inc."|"Sdn."|"St."|"Jln."|"Kapt."|"kg."|"kump."|"LL.B."|"LL.M."|"Lt."|"per."|"Pn."|"Pt."|"Rp."|"Tmn."|"Tn."|"Tkt."|"Tj."|"Y.bhg."
/* The same abbreviations in upper case ("LL.B." and "LL.M." are already
   upper case above and are not repeated). */
abbrev_caps "ABD."|"AB."|"MOHD."|"MD."|"MUHD."|"BHD."|"DRS."|"DR."|"DT."|"INC."|"SDN."|"ST."|"JLN."|"KAPT."|"KG."|"KUMP."|"LT."|"PER."|"PN."|"PT."|"RP."|"TMN."|"TN."|"TKT."|"TJ."|"Y.BHG."
/* Generic shapes: two or more dotted letters, a single dotted capital, or a
   capital followed by lower-case consonants and a period. */
abbrev_init ({letter}"."({letter}".")+|{upper}"."|{upper}{lcons}+".")
abbrev {abbrev_mixed}|{abbrev_caps}|{abbrev_init}

/* Inclusive start condition: active while the scanner is at the start of a
   token. */
%s new_token
%%
 /* Comments in this section are indented: flex expects a pattern at the
    start of every unindented line. */

 /* Sentence splitting */
 /* '!', '?' or '.', optionally followed by a closing bracket/quote, with
    whitespace after it: keep only the first character and emit it as a
    sentence-ending token. */
("!"|"?"|"."){trail_end}?/{spnl} {yyless(1); tok(1);}
 /* A closing bracket/quote right after a sentence end is printed with a
    space on each side without going through tok(), so `sent` stays set;
    in any other position the next-best rule is tried instead. */
{trail_end} {if (sent==1) {printf(" "); ECHO; printf(" ");} else {REJECT;}}
 /* A known abbreviation at the start of a token is emitted whole, so its
    period is not taken as a sentence end. */
<new_token>{abbrev}/{end_nop} {tok(0); printf(" ");}

 /* SGML entities */
 /* The pound sign is written as an entity so that this file stays ASCII;
    this rule must precede the generic entity rule to win the tie. */
"&pound;" {tok(0); printf(" ");}
{ns}/&percnt {tok(0); printf(" ");}
&{upper}*{lower}+{digit}*(;)? tok(0);

 /* Contractions, possessives */
Dato/'{end} {tok(0); printf(" ");}

 /* Leading punctuation */
`` {tok(0); printf(" ");}
{ns}[(\[$\xA3`"] tok(0);
` {tok(0); printf("` ");}
["] {if (sent==1) {printf(" "); ECHO; printf(" ");} else {REJECT;}}
[(\[$\xA3`"] {tok(0); printf(" ");}

 /* Trailing punctuation */
"."/"..." {tok(0); printf(" ");}
"..." {printf(" "); tok(0); printf(" ");}
''/{end} {tok(0); printf(" ");}
 /* In the three yyless(1) rules below only the first character is emitted;
    the punctuation after it is pushed back and scanned again. */
{ns}"."/{trail_mid}{spnl} {tok(0); printf(" ");}
{ns}"."/{trail_end}{spnl} {yyless(1); tok(0); printf(" ");}
{ns}{trail_nop}+"."?/{spnl} {yyless(1); tok(0); printf(" ");}
{ns}"."/{spnl} {yyless(1); tok(0); printf(" ");}

 /* Inside/outside a token */
 /* Any other character: emit it and leave the new_token condition, which
    switches the abbreviation rule off until whitespace is seen again. */
{ns} {tok(0); BEGIN(INITIAL);}
{sp} {ECHO; BEGIN(new_token);}
{nl} {ECHO; nl++; BEGIN(new_token);}
%%
/* Emit the current match as (part of) a token.
 *
 * Prints the sentence marker "^ " first when the previously emitted token
 * ended a sentence (sent == 1) or when more than one newline has been
 * matched since the last token (nl > 1), then writes the match with ECHO.
 *
 * s: 1 if this token ends a sentence, 0 otherwise; stored in `sent` for
 *    the next call.
 *
 * Resets the newline counter and switches the scanner to the new_token
 * start condition; a rule action may override that afterwards.
 *
 * Returns 0 always; callers ignore the value.
 */
int tok(int s)
{
    if (sent == 1 || nl > 1) {
        printf("^ ");
    }

    ECHO;

    sent = s;
    nl = 0;
    BEGIN(new_token);

    return 0;
}
/* Entry point: run the scanner over its input until end of input.
 *
 * Command-line arguments are not used.  The scanner starts in the
 * new_token start condition and the output opens with the sentence
 * marker "^ ".
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
    (void) argc;
    (void) argv;

    BEGIN(new_token);
    printf("^ ");
    yylex();

    return 0;
}