-
Notifications
You must be signed in to change notification settings - Fork 0
/
jsontotext.py
493 lines (429 loc) · 21.3 KB
/
jsontotext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
"""Convert provided MAM JSON file into a simple JSON file and an HTML file
in which each verse is plain Unicode text."""
import argparse
import inspect
import io
import json
import os
from html import escape
from pathlib import Path
import pprint
# Command-line interface: one positional input path plus one optional flag.
# NOTE: the arguments are parsed at import time and stored in the module-level
# `args`, which JsonToText.main() reads.
arg_parser = argparse.ArgumentParser(description=__doc__)
arg_parser.add_argument("input_file_path")
# "store_true" already implies default=False and dest="template_table".
arg_parser.add_argument(
    "--template-table",
    action="store_true",
    help="show table of templates found",
)
args = arg_parser.parse_args()
# NOTE: This strips things down to plain text. TODO: Create a version that preserves special formatting, notes, etc.
# TODO: remove odd "HTML" tag in Deut. 11:21 - asked about fix upstream
# TODO: Handle all templates in the Tanakh (currently only handles those in Torah).
# TODO: Can this process be replaced by the simplified version for Sefaria that Ben has now in Github?
class JsonToText:
    """Turns MAM JSON into plain text HTML.

    A verse is a list whose items are strings, nested lists, or template
    dicts of the form ``{"tmpl_name": ..., "tmpl_args": [...]}``.  Templates
    are resolved to plain Unicode text by ``dispatch_template``; templates
    without a dedicated handler are rendered as a red ``TEMPLATE(...)`` marker
    and recorded as unhandled in ``found_templates``.
    """

    # Templates whose handler receives the template item.
    _ITEM_HANDLERS = {
        # The "documentation template".
        "נוסח": "process_template_nusach",
        "מ:קמץ": "process_template_kamatz",
        'קו"כ-אם': "process_template_nusach",
        # TODO: ask Ben why MAM-parsed "plus version" has the "-2".
        # It seems to be a different format in that the text being
        # "footnoted" is inside the template.
        "מ:הערה-2": "process_template_nusach",
        'כו"ק': "process_template_ketiv_keri",
        "שני טעמים באות אחת": "process_template_two_tropes",
        "שני טעמים באות אחת קמץ-תחתון-פתח-עליון": "process_template_qupo",
        "מ:כפול": "process_template_kaful",
        "מ:אות מנוקדת": "process_template_nusach",
        "מ:אות-ג": "process_template_nusach",
        "מ:אות-ק": "process_template_nusach",
        # Handle "keri ketiv" like "ketiv keri".  On the related Wikisource
        # project, "keri ketiv" is a way of formatting "ketiv keri" pairs that
        # involve a maqef.  This application handles both cases the same way.
        'קו"כ': "process_template_ketiv_keri",
        'מ:כו"ק כתיב מילה חדה וקרי תרתין מילין': "process_template_ketiv_keri",
        'מ:כו"ק כתיב תרתין מילין וקרי מילה חדה': "process_template_ketiv_keri",
        "קרי ולא כתיב": "process_template_nusach",
        "כתיב ולא קרי": "process_template_nusach",
        'מ:נו"ן הפוכה': "process_template_nusach",
        "מ:טעם": "process_template_taam",
        "מ:אות-מיוחדת-במילה": "process_template_special_letter",
    }

    # Templates rendered as fixed text; the handler takes no arguments.
    _FIXED_HANDLERS = {
        "מ:פסק": "process_template_psik",
        # TODO: ask Ben why MAM-parsed "plus version" has the "-2".
        # Currently has the same result as psik except for one less space.
        # A more advanced formatting application (beyond plain Unicode text)
        # might handle it differently.
        "מ:לגרמיה-2": "process_template_legarmeih",
        "מ:ששש": "process_template_setumah",
        "ר3": "process_template_setumah",
        "סס": "process_template_setumah",
        "ססס": "process_template_setumah",
        # Instead of the "פסקא באמצע פסוק" template that used to be in the
        # source JSON, there now seems to be just a "פפ" inside a verse.
        "פפ": "process_template_setumah",
        "מ:גרשיים ותלישא גדולה": "process_template_gershayim_telisha_gedolah",
    }

    # Templates replaced by a literal string without calling a handler.
    _LITERAL_OUTPUTS = {
        # Footnotes are not included in the output.  A more advanced
        # formatting application might include them.
        # TODO: still need this if "plus" version has "-2"?
        "מ:הערה": "",
        "ירח בן יומו": "\u05AA",
    }

    def __init__(self):
        # Maps template name -> {"count": int[, "handled": False]}.
        self.found_templates = {}
        # "chapter:verse" of the verse being rendered; set by main().
        self.current_verse = None

    @staticmethod
    def _legacy_template_items(item):
        """Return the template as ``[[name], [arg1], [arg2], ...]``.

        Builds a fresh list so the source data is never mutated.  If the item
        has no "tmpl_args" but carries the older "tmpl" key, a copy of that
        value is returned instead.
        """
        if "tmpl_args" in item or "tmpl" not in item:
            wrapped_args = [[arg] for arg in item.get("tmpl_args", [])]
            return [[item["tmpl_name"]]] + wrapped_args
        return list(item["tmpl"])

    def dispatch_template(self, item):
        """Count the template in ``found_templates`` and return its rendering.

        Templates without a dedicated handler are flagged with
        ``"handled": False`` and rendered by ``process_template``.
        """
        template_name = item["tmpl_name"]
        stats = self.found_templates.setdefault(template_name, {"count": 0})
        stats["count"] += 1
        if template_name in self._LITERAL_OUTPUTS:
            return self._LITERAL_OUTPUTS[template_name]
        if template_name in self._FIXED_HANDLERS:
            return getattr(self, self._FIXED_HANDLERS[template_name])()
        if template_name in self._ITEM_HANDLERS:
            return getattr(self, self._ITEM_HANDLERS[template_name])(item)
        stats["handled"] = False
        return self.process_template(item)

    def process_template_nusach(self, item):
        """Handle documentation template, or any other case where desired output is first template parameter."""
        # FUTURE: footnotes or similar?
        return self.process_templates([item["tmpl_args"][0]])

    def process_template_psik(self):
        """Return U+05C0 surrounded by spaces."""
        return " \u05C0 "

    def process_template_legarmeih(self):
        """Return U+05C0 preceded by a space (no trailing space)."""
        return " \u05C0"

    def process_template_kamatz(self, item):
        """Return the value of the first ``ד=`` parameter.

        Raises:
            NotImplementedError: if no parameter starts with ``ד=``.
        """
        for template_item in self._legacy_template_items(item):
            value = template_item[0]
            if isinstance(value, str) and value.startswith("ד="):
                return value.split("=")[1]
        raise NotImplementedError

    def process_template_setumah(self):
        """Return a single space."""
        return " "

    def process_template_ketiv_keri(self, item):
        """Render as ``<first argument> [<second argument>]``."""
        template_items = self._legacy_template_items(item)
        ketiv = template_items[1][0]
        if not isinstance(ketiv, str):
            # A nested list/template would otherwise be emitted as its Python
            # repr; resolve it like any other template data.
            ketiv = self.process_templates([ketiv])
        return "{} [{}]".format(ketiv, self.process_templates(template_items[2]))

    def process_template_two_tropes(self, item):
        """Concatenate the first two template arguments."""
        template_items = self._legacy_template_items(item)
        return "{}{}".format(template_items[1][0], template_items[2][0])

    def process_template_qupo(self, item):
        """Return CGJ + patah followed by the rendered first argument."""
        template_items = self._legacy_template_items(item)
        # 034F = COMBINING GRAPHEME JOINER
        # 05B7 = HEBREW POINT PATAH
        return "\u034F\u05B7" + self.process_templates(template_items[1])

    def process_template_taam(self, item):
        """Return the second character of the first argument."""
        # this is a weird one, you throw away 1st character and return second
        template_items = self._legacy_template_items(item)
        return template_items[1][0][1]

    def process_template_gershayim_telisha_gedolah(self):
        """Return U+05A0 followed by U+059E."""
        return "\u05A0\u059E"  # TODO: in Lev. 10:4, why not different mark on each character?

    def process_template_kaful(self, item):
        """Render the first argument with its ``כפול=`` prefix removed."""
        # The first argument seems to be either a string beginning with "כפול" or a list of items,
        # the first one which is a string beginning with "כפול", but which then needs recursive processing.
        first_argument = item["tmpl_args"][0]
        if isinstance(first_argument, str):
            return first_argument.replace("כפול=", "")
        parts = list(first_argument)  # copy: do not modify the source data
        if parts and isinstance(parts[0], str):
            parts[0] = parts[0].replace("כפול=", "")
        return self.process_templates(parts)

    def process_template_special_letter(self, item):
        """Render the first template argument."""
        return self.process_templates([item["tmpl_args"][0]])

    def process_template(self, item):
        """Default template handler.

        Renders ``TEMPLATE(name, arg1, ...)`` with red punctuation so that
        unhandled templates stand out in the HTML output.
        """
        name_item, *argument_items = self._legacy_template_items(item)
        pieces = ['<span style="color: red">TEMPLATE(</span>']
        # first item of template is template type
        pieces.append(
            '<span style="color: #99f">{}</span>'.format(
                self.process_templates(name_item)
            )
        )  # TODO: really need to call process_templates here?
        for argument_item in argument_items:
            pieces.append('<span style="color: red">, </span>')
            pieces.append(self.process_templates(argument_item))
        pieces.append('<span style="color: red">)</span>')
        return "".join(pieces)

    def process_templates(self, data):
        """Render a list of strings, nested lists and template dicts.

        Strings are HTML-escaped after removing any "//"; lists are rendered
        recursively; dicts with "tmpl_name" go through ``dispatch_template``;
        dicts with "custom_tag" become the escaped text ``<tag>``; other dicts
        produce no output.

        Raises:
            ValueError: if ``data`` is not a list.
            TypeError: if an item is not a str, list or dict.
        """
        if not isinstance(data, list):
            raise ValueError("data must be a list")
        pieces = []
        for item in data:
            if isinstance(item, list):
                pieces.append(self.process_templates(item))
            elif isinstance(item, str):
                # TODO: is "//" in current version of source JSON?
                pieces.append(escape(item.replace("//", "")))
            elif isinstance(item, dict):
                if "tmpl_name" in item:
                    rendered = self.dispatch_template(item)
                    if "tmpl" in rendered:
                        # Raw template data was stringified into the output.
                        print(
                            "WARNING: unresolved template data in verse {}: {!r} -> {!r}".format(
                                self.current_verse, item, rendered
                            )
                        )
                    pieces.append(rendered)
                elif "custom_tag" in item:
                    # TODO: is this in current version of source JSON?
                    # I'm not even sure we were handling it correctly before because the live parsh.io
                    # site had some escaped tags (e.g., "<...>")
                    pieces.append(escape("<{}>".format(item["custom_tag"])))
            else:
                raise TypeError(
                    "unsupported item type in verse {}: {!r}".format(
                        self.current_verse, item
                    )
                )
        # TODO: remove varika (U+FB1E) / sheva (U+05B0) once everything else
        # is working and we know what it is (sheva na?)
        return "".join(pieces)

    def _render_book(self, book, stream):
        """Write one sub-book's HTML to ``stream`` and return its simple dict."""
        book_name = book["book24_name"]
        sub_book_name = book["sub_book_name"]
        simple_book = {
            "book_name": book_name,
            "sub_book_name": sub_book_name,
            "chapters": {},
        }
        stream.write("\n<h1>{}</h1>".format(escape(book_name)))
        if sub_book_name:
            stream.write("<h1>: {}</h1>".format(escape(sub_book_name)))
        for hebrew_chapter_number, chapter in book["chapters"].items():
            simple_chapter = {}
            simple_book["chapters"][hebrew_chapter_number] = simple_chapter
            stream.write("\n<h2>פרק {}</h2>".format(escape(hebrew_chapter_number)))
            for hebrew_verse_number, verse in chapter.items():
                self.current_verse = "{}:{}".format(
                    hebrew_chapter_number, hebrew_verse_number
                )
                # validate that there are 3 "columns" (from the spreadsheet) in a verse
                if len(verse) != 3:
                    raise ValueError(
                        "verse {} of {} has {} columns, expected 3".format(
                            self.current_verse, book_name, len(verse)
                        )
                    )
                # TODO: The actual text is in the third item - do something with other info.
                resolved_html = self.process_templates(verse[2])
                if resolved_html:
                    simple_chapter[hebrew_verse_number] = resolved_html
                    stream.write("\n<h3>{}</h3>".format(escape(hebrew_verse_number)))
                    stream.write(resolved_html)
        return simple_book

    @staticmethod
    def _read_existing_html_books(output_file_path, book24_names_by_hebrew_name):
        """Split an existing HTML output file into line lists keyed by book number."""
        from html import unescape  # inverse of the escape() applied when writing

        closing_tag = "</div>"
        existing_books = {}
        current_book = None
        if not os.path.exists(output_file_path):
            return existing_books
        with open(output_file_path, "r", encoding="utf-8") as existing_output_file:
            for raw_line in existing_output_file:
                line = raw_line.strip()
                if line.endswith(closing_tag):
                    # The wrapper's closing tag is glued to the last line of a
                    # previous run; it is re-added once when writing.
                    line = line[: -len(closing_tag)]
                if line.startswith("<h1>"):
                    hebrew_book_name = unescape(
                        line.split("<h1>")[1].split("</h1>")[0]
                    )
                    book_number = book24_names_by_hebrew_name[hebrew_book_name].get(
                        "number", 0
                    )
                    # Sub-books share their book's number: keep appending to
                    # the same list instead of discarding earlier sub-books.
                    current_book = existing_books.setdefault(book_number, [])
                if current_book is not None and line:
                    current_book.append(line)
        return existing_books

    def main(self, cli_args=None):
        """Convert the input file and splice the result into the output files.

        Reads ``book24names.json`` from two directories above the input file,
        renders every book in the input's "book39s" list, and merges the
        result into ``miqra-json-html/<stem>.html`` and
        ``miqra-json-simple/<stem>.json`` next to it, replacing any earlier
        output for the same book.

        Args:
            cli_args: object with ``input_file_path`` and ``template_table``
                attributes; defaults to the module-level ``args``.

        Raises:
            ValueError: if no book matches the input file stem, or a verse
                does not have exactly 3 columns.
        """
        options = args if cli_args is None else cli_args
        input_file_path = Path(options.input_file_path).resolve()
        data_root = input_file_path.parent.parent

        with open(
            data_root.joinpath("book24names.json"), encoding="utf-8"
        ) as names_file:
            book24_names = json.load(names_file)
        book24_names.sort(key=lambda book24: book24["number"])

        this_book = next(
            (
                book24
                for book24 in book24_names
                if book24["mam_parsed_file_stem"] == input_file_path.stem
            ),
            None,
        )
        if this_book is None:
            raise ValueError(
                'no entry in book24names.json for file stem "{}"'.format(
                    input_file_path.stem
                )
            )
        output_file_stem = this_book["mam_parsed_file_stem_old"]
        this_book_number = this_book["number"]
        book24_names_by_hebrew_name = {
            book24["book24_name"]: book24 for book24 in book24_names
        }

        output_file_path = (
            data_root.joinpath("miqra-json-html")
            .joinpath(output_file_stem)
            .with_suffix(".html")
        )
        output_json_file_path = (
            data_root.joinpath("miqra-json-simple")
            .joinpath(output_file_stem)
            .with_suffix(".json")
        )
        print(output_file_path)

        with open(input_file_path, encoding="utf-8") as input_file:
            data = json.load(input_file)
        # In the source JSON, "book24" is the traditional division of books (there are 24, some have sub-books),
        # and "book39s" is each sub-book separately (there are 39).
        # TODO: 5 sections of Psalms are not sub-books? (are they indicated in source JSON?)
        temp_stream = io.StringIO()
        simple_books = [self._render_book(book, temp_stream) for book in data["book39s"]]
        # TODO: Is this working?
        if options.template_table:
            self.write_template_table(temp_stream)

        # JSON: replace earlier output for the regenerated books, keep the rest.
        existing_simple_books = []
        if os.path.exists(output_json_file_path):
            with open(output_json_file_path, "r", encoding="utf-8") as existing_json_file:
                existing_simple_books = json.load(existing_json_file)
        regenerated_names = {book["book_name"] for book in simple_books}
        merged_simple_books = [
            book
            for book in existing_simple_books
            if book["book_name"] not in regenerated_names
        ]
        merged_simple_books.extend(simple_books)
        merged_simple_books.sort(
            key=lambda book: book24_names_by_hebrew_name[book["book_name"]]["number"]
        )

        # HTML: load the books already in the output file, then splice in this one.
        existing_books = self._read_existing_html_books(
            output_file_path, book24_names_by_hebrew_name
        )
        existing_books[this_book_number] = temp_stream.getvalue().split("\n")

        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write("<!DOCTYPE html>\n")
            output_file.write('<div dir="rtl">')
            print("existing books: {}".format(existing_books.keys()))
            for book_number in sorted(existing_books):
                for line in existing_books[book_number]:
                    stripped_line = line.strip()
                    if stripped_line:
                        output_file.write("\n" + stripped_line)
            output_file.write("</div>")
        print('HTML written to "{}".'.format(output_file_path))

        with open(output_json_file_path, "w", encoding="utf-8") as output_json_file:
            json.dump(
                merged_simple_books, output_json_file, indent=2, ensure_ascii=False
            )
        print('JSON written to "{}".'.format(output_json_file_path))

    def write_template_table(self, output_file):
        """Write an HTML table of the templates seen so far to ``output_file``.

        Handled templates show their count; unhandled ones show their whole
        statistics dict in red.
        """
        output_file.write('\n<table border="1">')
        for name, stats in self.found_templates.items():
            if stats.get("handled", True):
                row = "\n<tr><td>{}</td><td>{}</td></tr>".format(
                    escape(str(name)),
                    escape(str(stats["count"])),
                )
            else:
                row = '\n<tr><td>{}</td><td style="color: red">{}</td></tr>'.format(
                    escape(str(name)),
                    escape(str(stats)),
                )
            output_file.write(row)
        output_file.write("\n</table>\n")
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    converter = JsonToText()
    converter.main()