generate.py
import requests
from lxml import html as html_parser
from lxml.html import clean
from collections import defaultdict
import re
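
# Download the 96 Priapeia pages from sacred-texts.com, split each page into the
# Latin poem, its two English translations and the footnotes, and write the results
# into three CTS/TEI files built from template.xml.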
DEBUG = False  # when True, print the raw paragraphs found on each page
BREAK_ON = 1   # unused in this script
# Collected XML fragments: the Latin poems and the two English translations
poems = []
translations = []
second_translations = []
# Pages prp00.htm to prp95.htm on sacred-texts.com
download = range(0, 96)
URI = "http://www.sacred-texts.com/cla/priap/prp{}.htm"
# "[n]" footnote markers inside the translations, and the "n." markers that open each note
NOTES_INSERT = re.compile(r"\[(\d+)\]")
NOTES_TEXT = re.compile(r"\[?\d+\.")
with open("./template.xml") as f:
    template = f.read()
def check_p(*args):
    """Debug helper: print the serialized poem, translation and notes paragraphs."""
    names = ["poem", "translation", "notes"]
    for index, paragraphs in enumerate(args):
        print(names[index], [html_parser.tostring(p, encoding=str) for p in paragraphs])

def normalize(unit):
    """Serialize a <p> element and strip the tags themselves and non-breaking spaces."""
    return html_parser.tostring(unit, encoding=str).replace("<p>", "").replace("</p>", "").replace("\xa0", "")

def reformat(paragraphs):
    """Turn each paragraph into a <div type="textpart"> holding one <l> element per verse line."""
    for paragraph in paragraphs:
        p = normalize(paragraph)
        lines = p.split("<br>")
        lines = ["<l>" + l.replace("\n", "") + "</l>" for l in lines]
        # Drop the "Next:" navigation link that sacred-texts appends after the last line
        lines = [x.split("Next:")[0] for x in lines]
        yield "<div type=\"textpart\" subtype=\"poem\">{}</div>".format("\n".join(lines))

def replace_notes(translations, input_notes):
    """Replace the inline [n] markers in each translation with <note> elements built from the notes paragraphs."""
    notes = defaultdict(lambda: None)
    # Split the serialized notes on their leading "n." markers and index them from 1
    for note in NOTES_TEXT.split("\n".join([normalize(unit) for unit in input_notes])):
        if len(note) > 0:
            notes[len(notes) + 1] = note.replace("<br>", "").replace("\n", "")
    output = []

    def sub(match):
        index = int(match.groups()[0])
        if index in notes:
            return "<note type=\"footnote\">" + notes[index].split("]")[0] + "</note>"
        return ""

    for translation in translations:
        output.append(NOTES_INSERT.sub(sub, translation))
    return output
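
# Strip the downloaded pages down to <p> and <br> tags before reading the paragraphs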
cleaner = clean.Cleaner(allow_tags=["br", "p"], remove_unknown_tags=False)

for poem_index in download:
    print("Doing {}".format(str(poem_index)))
    data = requests.get(URI.format(str(poem_index).zfill(2))).content
    html = html_parser.fromstring(data)
    html = cleaner.clean_html(html)
    poem, translation, notes = [], [], []
    paragraph_index = 0
    for p in html.xpath(".//p"):
        if paragraph_index == 0:
            # The first paragraph on the page is the Latin poem
            poem = [p]
        elif paragraph_index >= 100:  # the notes section has already started
            notes.append(p)
        elif html_parser.tostring(p, encoding=str).startswith("<p>[1"):
            # A paragraph beginning with "[1" marks the start of the footnotes
            notes.append(p)
            paragraph_index = 100
        else:
            # Paragraphs between the poem and the notes are the English translations
            translation.append(p)
        paragraph_index += 1
    if DEBUG:
        check_p(poem, translation, notes)
    poem, translation = list(reformat(poem)), list(reformat(translation))
    translation = replace_notes(translation, notes)
    poems.append(poem[0])
    translations.append(translation[0])
    second_translations.append(translation[1])
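
# Fill the template three times: the Latin text and each of the two English
# translations get their own file and CTS URN.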
with open("data/phi1103/phi001/phi1103.phi001.lascivaroma-lat1.xml", "w") as f:
f.write(template.format(
title="Priapeia",
urn="urn:cts:latinLit:phi1103.phi001.lascivaroma-lat1",
xml="\n\n".join(poems),
lang="lat"
))
with open("data/phi1103/phi001/phi1103.phi001.lascivaroma-eng1.xml", "w") as f:
f.write(template.format(
title="Priapeia",
urn="urn:cts:latinLit:phi1103.phi001.lascivaroma-eng1",
xml="\n\n".join(translations),
lang="eng"
))
with open("data/phi1103/phi001/phi1103.phi001.lascivaroma-eng2.xml", "w") as f:
f.write(template.format(
title="Priapeia",
urn="urn:cts:latinLit:phi1103.phi001.lascivaroma-eng2",
xml="\n\n".join(second_translations),
lang="eng"
))