-
Notifications
You must be signed in to change notification settings - Fork 9
/
__convert.py
328 lines (283 loc) · 12.2 KB
/
__convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
# Quick script to produce complete collatinus transformation
# Data come from https://github.com/biblissima/collatinus/tree/master/bin/data
# See README.md at lemmata/collatinus/README.md
#
# This file needs to be run in collatinus working dir with python 3.4 or higher
#
import json
import re
from copy import deepcopy
import unicodedata
def normalize_unicode(lines):
    """ Strip diacritics from a text, keeping only its ASCII characters

    The text is decomposed (NFKD) so that accented letters become a base
    letter followed by combining marks; every character that cannot be
    encoded as ASCII (the marks, and letters without an ASCII decomposition)
    is then dropped.

    :param lines: Text to normalize
    :return: ASCII-only version of the text
    """
    decomposed = unicodedata.normalize('NFKD', lines)
    ascii_only = decomposed.encode('ASCII', 'ignore')
    return ascii_only.decode()
#############################################################
# Convert morphos.la
# Each line of Morphos.la represents a declension name
#############################################################
# Matches every character of a partial tag that carries information (anything but "-")
search = re.compile(r'[^-]')

# French morphological labels and the 9-character partial tag each one stands for.
# Replacements are applied in this exact order: longer labels must come before
# the labels they contain ("imparfait" before "parfait", "futur antérieur"
# before "futur", "supin en -um" before "supin en -u").
_FRENCH_TO_TAG = (
    # Person: if we have a person, we know it's a verb
    ("1ère", "v1-------"),
    ("2ème", "v2-------"),
    ("3ème", "v3-------"),
    # Number
    ("singulier", "--s------"),
    ("pluriel", "--p------"),
    # Tense
    ("présent", "---p-----"),
    ("imparfait", "---i-----"),
    ("parfait", "---r-----"),
    ("PQP", "---l-----"),
    ("futur antérieur", "---t-----"),
    ("futur", "---f-----"),
    # Mood
    ("indicatif", "----i----"),
    ("subjonctif", "----s----"),
    ("infinitif", "v---n----"),
    ("impératif", "----m----"),
    ("gérondif", "----g----"),
    ("adjectif verbal", "----g----"),
    ("participe", "g---p----"),
    ("supin en -um", "----u----"),
    ("supin en -u", "----u----"),
    # Voice
    ("actif", "-----a---"),
    ("passif", "-----p---"),
    # Gender
    ("masculin", "------m--"),
    ("féminin", "------f--"),
    ("neutre", "------n--"),
    # Case
    ("nominatif", "-------n-"),
    ("génitif", "-------g-"),
    ("accusatif", "-------a-"),
    ("datif", "-------d-"),
    ("ablatif", "-------b-"),
    ("vocatif", "-------v-"),
    ("locatif", "-------l-"),
    # Degree
    ("comparatif", "a-------c"),
    ("superlatif", "a-------s"),
    ("positif", "a-------p"),
)

# Maps the index found at the start of each line to its merged tag
morphos = {}
# The labels matched above contain accented characters, so the file must be
# decoded explicitly instead of relying on the locale default encoding.
with open("src/morphos.la", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Lines starting with "!" and lines shorter than 2 characters are skipped
        if line.startswith("!") or len(line) < 2:
            continue

        for french, partial_tag in _FRENCH_TO_TAG:
            line = line.replace(french, partial_tag)

        # NOTE(review): rewrites index 461 as 416 followed by 8 dashes; the
        # reason is not visible here - confirm against src/morphos.la
        line = line.replace("461:", "416:--------")

        # Then we merge the tags: each informative character of each partial
        # tag overwrites the same position in the final tag
        new_tag = "---------"
        index, tags = line.split(":")
        for tag in tags.split():
            for x in search.finditer(tag):
                i = x.start()
                new_tag = new_tag[:i] + tag[i] + new_tag[i+1:]
        morphos[index] = new_tag

# Sanity checks on two known entries
assert morphos["190"] == "g-sppamv-"
assert morphos["121"] == "v1spia---"
#############################################################
# Convert modeles.la
# Lines starting with $ define variables that can be reused
# A set of lines starting with "modele:" defines a model
# R:int:int,chars (root number, number of characters to remove from the canonical form, characters to add to get the root; 0 means none)
# -> e.g.: for uita, R:1:1,0 gives root 1: 1 character to remove, nothing to add -> uit
# -> e.g.: for epulae, R:1:2,0 gives root 1: 2 characters to remove, nothing to add -> epul
#############################################################
def parse_range(des_number):
    """ Expand a comma-separated list of ids and inclusive ranges

    For example "1-3,7" gives [1, 2, 3, 7]. Empty groups (empty input or a
    stray comma such as "1,2,") are ignored.

    :param des_number: String made of ints and "start-end" ranges separated by ","
    :return: List of ints, one for each element of the ranges
    :raises ValueError: If a non-empty group is neither an int nor "start-end"
    """
    ids = []
    for des_group in des_number.split(","):
        des_group = des_group.strip()
        if not des_group:
            # Nothing between two separators: skip instead of failing on int("")
            continue
        if "-" in des_group:
            start, end = (int(bound) for bound in des_group.split("-"))
            # The end of the range is included
            ids.extend(range(start, end + 1))
        else:
            ids.append(int(des_group))
    return ids
def _expand_variables(line, variables):
    """ Replace the $variables found in a desinence line by their value

    :param line: Desinence line containing at least one "$"
    :param variables: Dict mapping a variable name ("$name") to its ";"-separated value
    :return: The line with the known variables expanded
    """
    for var, rep in variables.items():
        endings = rep.split(";")
        # "stem$var" or "stem+$var": the stem is prefixed to each ending of the variable
        line = re.sub(
            r"(\w+)(\+?{})".format(re.escape(var)),
            lambda found: ";".join(found.group(1) + ending for ending in endings),
            line, flags=re.UNICODE
        )
        # Remaining bare occurrences simply take the value of the variable
        line = line.replace(var, rep)
        if "$" not in line:
            break
    return line


def convert_models(lines):
    """ Convert the lines of a models file into a dictionary of models

    Recognized lines: "$var=value" (variable), "modele:" (new model), "pere:"
    (copy of a previously defined model), "R:" (root), "des:"/"des+:"
    (desinences), "abs:", "suf:" and "sufd:". Lines starting with "!" and empty
    lines are ignored, "pos" lines are skipped, and any other key is printed
    with its line number.

    :param lines: Iterable of lines
    :return: Dict mapping each model name to a dict with keys "R", "abs", "des", "suf", "sufd"
    :raises AttributeError: If a desinence or root line does not match the expected syntax
    """
    models = {}
    template = {
        "R": {},     # Root number -> [characters to remove, characters to add]
        "abs": [],   # Unused desinence if inherits
        "des": {},   # Desinence id -> list of (root, list of endings)
        "suf": [],   # List of [suffix, desinence ids]
        "sufd": []   # Possible endings
    }
    # Raw strings: "\d", "\w" and "\+" are invalid escapes in a regular string literal
    root_line = re.compile(r"^R:(?P<root>\d+):(?P<remove>-|\w+)[,:]?(?P<add>\w+)?", flags=re.UNICODE)
    des_line = re.compile(r"^des[\+]?:(?P<range>[\d\-,]+):(?P<root>\d+):(?P<des>[\w\-,;]+)?$", flags=re.UNICODE)
    last_model = None
    variable_replacement = {}

    for lineno, line in enumerate(lines):
        line = line.strip()

        # If we get a variable, we register it
        if line.startswith("$"):
            # Split on the first "=" only so that the value may itself contain "="
            var, rep = line.split("=", 1)
            variable_replacement[var] = rep
            continue

        # Empty lines and comments
        if not line or line.startswith("!"):
            continue

        if line.startswith("modele:"):
            last_model = line[7:]
            models[last_model] = deepcopy(template)
        elif line.startswith("pere:"):
            # Inherits from parent
            models[last_model].update(deepcopy(models[line[5:]]))
        elif line.startswith("R:"):
            # Still do not know how to deal with "K"
            root, remove, chars = root_line.match(line).groups()
            if remove == "-":
                # ToDo: Check how radical with "-" should work
                continue
            if chars == "0":
                # 0 means there is nothing to add
                chars = ""
            models[last_model]["R"][root] = [remove, chars]
        elif line.startswith("des"):
            # We have new endings
            # des:range:root_number:list_of_des
            # First we apply desinence variables replacement
            if "$" in line:
                line = _expand_variables(line, variable_replacement)
            try:
                des_number, root, des = des_line.match(line).groups()
            except AttributeError:
                # Show the faulty line before failing
                print(line, lineno)
                raise
            if not des:
                # ToDo : "Deal with empty value in desinence ?"
                continue

            nums = parse_range(des_number)
            desinence = des.split(";")
            # If we have des+, we add to the known desinence instead of replacing it
            additive = line.startswith("des+")
            known = models[last_model]["des"]
            last_des = []
            for desinence_index, desinence_num in enumerate(nums):
                if desinence_index < len(desinence):
                    current_des = desinence[desinence_index].replace("-", "").split(",")
                else:
                    # We might have ranges where number of item < ranges.
                    # This seems to mean last item is repeated.
                    current_des = last_des

                if not current_des:
                    print("Line %s : No desinence for id %s (%s)" % (lineno, desinence_num, last_model))
                    continue

                if additive and desinence_num in known:
                    known[desinence_num].append((root, current_des))
                else:
                    known[desinence_num] = [(root, current_des)]
                last_des = current_des
        elif line.startswith("abs:"):
            # Add the one we should not find as desinence
            models[last_model]["abs"] = parse_range(line[4:])
        elif line.startswith("suf:"):
            # Suffixes are alternative ending
            rng, suf = tuple(line[4:].split(":"))
            models[last_model]["suf"].append([suf, list(parse_range(rng))])
        elif line.startswith("sufd:"):
            # Sufd are suffix always present
            models[last_model]["sufd"] += line[5:].split(",")
        else:
            if line.startswith("pos"):
                continue
            # Unknown key: report it with its line number
            print(line.split(":")[0], lineno)
    return models
# The models file contains diacritics: decode it explicitly rather than with
# the locale default encoding, then strip the diacritics before parsing.
with open("./src/modeles.la", encoding="utf-8") as f:
    lines = normalize_unicode(f.read()).split("\n")
norm_models = convert_models(lines)

# Sanity checks on known desinences. Each message shows the whole value found
# at the index being checked, so a failure reports the mismatch itself.
assert norm_models["fortis"]["des"][13] == [("4", [''])],\
    "Root 4, Empty string (originally '-') expected, found %r" % (norm_models["fortis"]["des"][13],)
assert norm_models["fortis"]["des"][51] == [("1", ["iorem"])],\
    "Root 1, iorem expected, found %r" % (norm_models["fortis"]["des"][51],)
assert norm_models["dico"]["des"][181] == [("0", ["e"]), ("0", [""])],\
    "[(0, e), (0, '')] expected, found %r" % (norm_models["dico"]["des"][181],)
assert norm_models["edo"]["des"][122] == [("0", ["is"]), ("3", ["es"])],\
    "[(0, is), (3, es)] expected, found %r" % (norm_models["edo"]["des"][122],)
############################################
#
# Get the lemma converter
#
# lemma=lemma|model|genitive/infectum|perfectum|morpho indications
#
############################################
def parseLemma(lines):
    """ Parse lemma lines into a dictionary keyed by the normalized lemma

    Lines starting with "!" and lines without any "|" are ignored. Each value
    is the dict of the named groups of the line: "lemma", "quantity", "model",
    "geninf", "perf" and "lexicon" (None when a group is absent). Lines that
    cannot be parsed are reported on standard output.

    :param lines: Iterable of lines
    :return: Dict mapping normalized lemma to its parsed fields

    # ToDo: Fix issue with
    Caeres2=Cāeres|miles|Cāerĭt,Cāerĭtēt||ĭtis, (-ētis), f.|2
    Caerēs=Cāerēs|diues|Cāerĭt||ĭtis|2
    """
    entry_pattern = re.compile(
        r"^(?P<lemma>\w+\d?){1}(?:\=(?P<quantity>[\w,]+))?\|"
        r"(?P<model>\w+)?\|"
        r"[-]*(?P<geninf>[\w,]+)?[-]*\|"
        r"[-]*(?P<perf>[\w,]+)?[-]*\|"
        r"(?P<lexicon>.*)?",
        flags=re.UNICODE
    )
    expected_pipes = 4
    parsed = {}

    for entry in lines:
        if entry.startswith("!") or "|" not in entry:
            continue

        pipe_count = entry.count("|")
        if pipe_count != expected_pipes:
            # Some lines lack a "|": we assume the missing ones belong right
            # before the last field, so they are inserted in front of the last "|"
            cut = entry.rfind("|")
            entry = entry[:cut] + "|" * (expected_pipes - pipe_count) + entry[cut:]

        found = entry_pattern.match(entry)
        if found is None:
            print("Unable to parse lemma", entry)
            continue

        fields = found.groupdict(default=None)
        # We always normalize the key
        parsed[normalize_unicode(fields["lemma"])] = fields
    return parsed
# The lemma files contain diacritics: decode them explicitly rather than with
# the locale default encoding, then strip the diacritics before parsing.
with open("./src/lemmes.la", encoding="utf-8") as f:
    lines = normalize_unicode(f.read()).split("\n")
lemmas = parseLemma(lines)

# Sanity checks on a known lemma
assert lemmas["volumen"]["geninf"] == "volumin"
assert lemmas["volumen"]["lemma"] == "volumen"
assert lemmas["volumen"]["model"] == "corpus"

# Entries of the extension file are added to (and override) the main ones
with open("./src/lem_ext.la", encoding="utf-8") as f:
    lines = normalize_unicode(f.read()).split("\n")
lemmas.update(parseLemma(lines))

with open("./collected.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "pos": morphos,
            "models": norm_models,
            "lemmas": lemmas,
            # Each form of a comma-separated "quantity" field points to its lemma key
            "maps": {
                quantity: lemma
                for lemma, infos in lemmas.items()
                if infos["quantity"] and "," in infos["quantity"]
                for quantity in infos["quantity"].split(",")
            }
        },
        f
    )