forked from ivoa-std/EPNTAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_source.py
430 lines (343 loc) · 11.2 KB
/
parse_source.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# -*- coding: utf-8 -*-
import os
import re
import requests
import sys
from bs4 import BeautifulSoup, NavigableString
# URL of the long parameter descriptions (fetched by
# write_column_description)
DESCRIPTIONS_URL = ("https://voparis-confluence.obspm.fr/display"
  "/VES/EPN-TAP+v2+parameter+description")
# URL of the document with the metadata table (fetched by
# iter_column_meta)
TABLE_URL = ("https://voparis-confluence.obspm.fr/display/VES"
  "/EPN-TAP+V2.0+parameters")
# Headers of DESCRIPTIONS sections that must be skipped; these are
# compared against the whitespace-stripped text of the h1 elements.
IGNORED_SECTIONS = frozenset([
  "Europlanet2024-RI/VESPA Discussions Board",
  "EPN-TAP v2 parameter description",
])
# A global state variable: the names of the elements currently being
# formatted, innermost last.  format_el pushes and pops it; format_br
# and format_p inspect it to suppress line breaks and paragraph
# separators where they are unwanted (e.g., inside of tables).
ELEMENT_STACK = []
# When true, get_with_cache keeps copies of retrieved documents in
# local files.  Set to False in operation (True is for development only).
CACHE_RESULTS = False
def get_with_cache(url, bypassCache=False):
  """returns the document at url as a byte string, possibly from a cache.

  The cache is only consulted and filled when the module-level
  CACHE_RESULTS flag is true.  Cache files are created in the current
  directory; their names are url with all non-word characters removed
  plus a ".cache" extension.

  Arguments:

  * url -- the URL to retrieve using an HTTP GET.
  * bypassCache -- if true, an existing cache file is not read and the
    document is fetched from the network (and, with CACHE_RESULTS,
    written to the cache again).

  The result is bytes regardless of whether it comes from the cache or
  from the network.
  """
  cacheName = re.sub(r"[^\w]+", "", url)+".cache"
  if not bypassCache and CACHE_RESULTS and os.path.exists(cacheName):
    # Read in binary mode: the cache was written from bytes, and callers
    # should see the same type as for a freshly retrieved document.
    with open(cacheName, "rb") as f:
      doc = f.read()
  else:
    req = requests.get(url)
    doc = req.content
    if CACHE_RESULTS:
      with open(cacheName, "wb") as f:
        f.write(doc)
  return doc
class Accumulator(object):
  """A singleton used to collect the output.

  This is mainly so we can do some last-minute, RE-based cleanup
  before actually shipping out the text.

  The class is never instantiated; all state lives in class attributes
  and all methods are class methods.
  """
  # the fragments emitted so far, as UTF-8 encoded byte strings
  parts = []

  @classmethod
  def emit(cls, s):
    """adds a string s to the output result.

    s may be a byte string, which is stored as-is, or a text string, in
    which non-breaking spaces are replaced by plain blanks before it is
    encoded to UTF-8.

    A None raises a ValueError; that almost certainly indicates a
    formatter that forgot to return its result.
    """
    if s is None:
      raise ValueError("Attempting to emit a None")
    if not isinstance(s, bytes):
      s = s.replace("\xa0", " ").encode("utf-8")
    cls.parts.append(s)

  @classmethod
  def postprocess(cls, stuff):
    """does some ad-hoc postprocessing to clean up mess resulting
    from the translation.

    stuff is the complete output as a byte string; the cleaned-up
    byte string is returned.
    """
    # first, normalise runs of linefeeds
    stuff = re.sub(b"\n\n+", b"\n\n", stuff)
    # filter out spurious breaks, i.e., lines consisting of nothing but
    # a TeX line break (two backslashes) and perhaps trailing blanks
    stuff = re.sub(b"(\n+"+br"\\\\ *"+b"\n)+", b"\n\n", stuff)
    return stuff

  @classmethod
  def write_output(cls):
    """writes the accumulated output to stdout, after doing some
    post-processing.

    The bytes are written to stdout's file descriptor directly.
    """
    remaining = memoryview(cls.postprocess(b"".join(cls.parts)))
    fd = sys.stdout.fileno()
    # os.write may accept only a part of what it is given; keep going
    # until everything is written so the output is never truncated.
    while len(remaining):
      written = os.write(fd, remaining)
      remaining = remaining[written:]
# Module-level shortcuts to the Accumulator singleton's class methods, so
# the code below can simply call emit(...) and write_output().
emit, write_output = Accumulator.emit, Accumulator.write_output
def find_siblings_until(element, sibling_type, stop_sibling):
  """yields the following siblings of element that are of type
  sibling_type, stopping at the first sibling of type stop_sibling
  (or when there are no more siblings).

  Text nodes and elements of any other type are silently skipped.

  This is used to collect headings of a certain level and thus
  construct the document structure.
  """
  current = element.nextSibling
  while current is not None:
    # nodes without a name (or with a None name) are text; ignore them
    tag = getattr(current, "name", None)
    if tag is not None:
      if tag==sibling_type:
        yield current
      elif tag==stop_sibling:
        return
    current = current.nextSibling
def collect_siblings_until(element, stop_set):
  """returns a list of the nodes following element on the same level,
  up to (and excluding) the first one whose element name is in stop_set.

  If no such node follows, all remaining siblings are returned.  Element
  itself is never part of the result.

  stop_set must be a set of element type names (like h1, h2...).
  """
  gathered = []
  node = element.nextSibling
  while node is not None:
    if hasattr(node, "name") and node.name in stop_set:
      break
    gathered.append(node)
    node = node.nextSibling
  return gathered
# What to write instead of the characters LaTeX treats specially in
# running text.
_LATEX_ESCAPES = {
  "\\": "$\\backslash$",
  "{": "\\{",
  "}": "\\}",
  "&": "\\&",
  "#": "\\#",
  "%": "\\%",
  "_": "\\_",
  "$": "\\$",
  "^": "\\textasciicircum{}",
  "~": "\\textasciitilde{}",
}
_LATEX_SPECIALS = re.compile(r"[\\{}&#%_$^~]")


def escape_LaTeX(s):
  """returns s with LaTeX active characters replaced.

  This covers the backslash, curly braces, the ampersand, hash, percent,
  underscore, dollar, circumflex, and tilde characters.

  I also take the liberty of improving quotes when I can get them:
  text within a pair of straight double quotes is put into TeX-style
  opening and closing quotes.
  """
  # All replacements happen in a single pass over s.  This is what lets
  # us escape dollars although the replacement for the backslash itself
  # contains dollars: replacement text is never inspected again.
  escaped = _LATEX_SPECIALS.sub(
    lambda mat: _LATEX_ESCAPES[mat.group(0)], s)
  return re.sub('"([^"]+)"', r"``\1''", escaped)
def make_formatter(template):
  """returns a formatter using template.

  template is a string with exactly one %s.  The function returned takes
  an element, renders the element's children to TeX and fills the result
  into the template; for an element without children, the empty string
  is filled in.

  This works for most HTML elements, but some need custom handling.
  """
  def render(el):
    inner = format_to_TeX(el.contents) if el.contents else ""
    return template%inner

  return render
def hack_table(literal):
  """returns a LaTeX table literal hacked based on knowledge that
  we have about a table.

  literal is the TeX source of a complete table as produced by
  format_table.  The only table we know about so far is the one
  containing the string UDR (the level table); it is set in a small
  font, gets custom column specifications, and has some cell contents
  shortened.

  For any other table, a ValueError is raised.

  Ugh.  Let's see how we can deal with this mess later.
  """
  if "UDR" not in literal:
    raise ValueError("Unknown table: {}".format(literal))

  # it's the level table
  adjusted = literal.replace(" (std data format)", ""
    ).replace("llllllll}", "lllllp{2cm}lp{0.35\\textwidth}}"
    ).replace(r"EPN-TAPdefault",
      r"\vbox{\vskip 2pt\hbox{EPN-}\vskip 3pt\hbox{TAP2}}")
  return "\\begingroup\\small"+adjusted+"\\endgroup"
def format_table(el):
  """A formatter for (halfway sane) tables.

  This doesn't do nested tables or anything else not well behaved,
  and the resulting tables aren't terribly pretty.  This also
  assumes that the first tr has the table headings; the number of cells
  in that row determines the number of columns.

  The generated TeX is passed through hack_table, which is where special
  handling for individual, non-trivial tables has to be added.

  For a table without rows, an empty string is returned.
  """
  rows = el.findAll("tr")
  if not rows:
    # empty table, don't care -- but we must return a string, because
    # format_to_TeX joins what the formatters return
    return ""

  cell_pattern = re.compile("t[dh]")

  def format_one_row(row_el):
    return "&".join(
      format_el(child) for child in row_el.findAll(cell_pattern)
    )+"\\\\"

  parts = [
    "\\begin{inlinetable}",
    "\\begin{tabular}{%s}"%("l"*len(rows[0].findAll(cell_pattern))),
    "\\sptablerule",
    format_one_row(rows[0]),
    "\\sptablerule\n"]
  parts.extend(format_one_row(row) for row in rows[1:])
  parts.extend([
    "\\end{tabular}",
    "\\end{inlinetable}"])

  return hack_table("\n".join(parts))
def format_br(el):
  """returns a TeX line break unless we think LaTeX would balk on it.

  This uses global state (ELEMENT_STACK): nothing is produced for breaks
  within tables and for breaks at the top level, i.e., when the br is
  the only element on the stack.
  """
  suppressed = ELEMENT_STACK==['br'] or "table" in ELEMENT_STACK
  return "" if suppressed else "\\\\"
def format_a(el):
  """formats a link as anchor text plus a footnote with the URL.

  Link targets starting with a slash are server-relative; they are made
  absolute by prepending a fixed host.

  Anchors without an href attribute (e.g., pure named anchors) only
  yield their formatted content, as there is nothing to put into a
  footnote.
  """
  text = format_to_TeX(el.contents)
  dest = el.get("href")
  if dest is None:
    return text

  if dest.startswith("/"):
    # NOTE(review): the documents are retrieved from
    # voparis-confluence.obspm.fr, whereas relative links are resolved
    # against voparis-wiki -- confirm that this is intended.
    dest = "https://voparis-wiki.obspm.fr"+dest
  return "%s\\footnote{\\url{%s}}"%(text, dest)
def format_p(el):
  """formats a paragraph.

  Outside of tables, the paragraph content is followed by an empty line.
  Confluence, however, also has p tags within table cells; for these,
  only the bare content is returned.
  """
  within_table = "table" in ELEMENT_STACK
  body = format_to_TeX(el.contents)
  if within_table:
    return body
  return "%s\n\n"%body
def format_pre(el):
  """formats a pre element as a verbatim environment.

  For now, we don't do any escaping at all.  Let's see where that goes.

  The children of el are joined with single blanks.  Children that are
  not plain strings (i.e., nested markup) contribute their text content.
  """
  pieces = [
    child if isinstance(child, str) else child.get_text()
    for child in el.contents]
  return "\\begin{verbatim}\n%s\n\\end{verbatim}"%(
    " ".join(pieces))
# The dispatch table mapping HTML element names to the functions
# rendering them in LaTeX.  format_el looks up the formatter by el.name
# here, so an element type missing from this table results in a KeyError
# there.  Each formatter takes the element and returns a string.
LATEX_FORMATTERS = {
  "p": format_p,
  # underlined text is rendered as emphasis, too
  "em": make_formatter("\\emph{%s}"),
  "u": make_formatter("\\emph{%s}"),
  "b": make_formatter("\\textbf{%s}"),
  "strong": make_formatter("\\textbf{%s}"),
  "br": format_br,
  "ul": make_formatter("\\begin{itemize}\n%s\\end{itemize}\n\n"),
  "li": make_formatter("\\item %s\n"),
  "pre": format_pre,
  "span": make_formatter("%s"), # TODO: figure out what this is
  "div": make_formatter("\n\n%s\n\n"), # TODO: figure out what this is
  "a": format_a,
  # struck-out text is kept but marked as deleted
  "s": make_formatter("%s (\\textbf{Deleted})"),
  "table": format_table,
  # colgroup and col produce a conspicuous ??? marker in the output;
  # presumably that is to make it obvious when they are ever rendered.
  "colgroup": make_formatter("???%s"),
  "col": make_formatter("???%s"),
  "tbody": make_formatter("%s"),
  "td": make_formatter("%s"),
  "th": make_formatter("%s"),
  # HTML headings are mapped one level down in the LaTeX sectioning
  "h1": make_formatter("\\subsection{%s}\n\n"),
  "h2": make_formatter("\\subsubsection{%s}\n\n"),
  "h3": make_formatter("\\paragraph{%s}\n\n"),
}
def format_el(el):
  """returns TeX for a BeautifulSoup element el.

  The formatter is chosen from LATEX_FORMATTERS by the element's name.
  While the formatter runs, the name is on ELEMENT_STACK so formatters
  of nested elements can find out about their context; it is taken off
  again even when formatting fails.
  """
  tag = el.name
  ELEMENT_STACK.append(tag)
  try:
    formatter = LATEX_FORMATTERS[tag]
    return formatter(el)
  finally:
    ELEMENT_STACK.pop()
def format_to_TeX(elements):
  """returns a sequence of BeautifulSoup nodes rendered in LaTeX.

  Text nodes are escaped for LaTeX, all other nodes are formatted through
  format_el; the results are concatenated without separators.
  """
  return "".join(
    escape_LaTeX(node.string) if isinstance(node, NavigableString)
    else format_el(node)
    for node in elements)
def write_column_description():
  """writes a TeX formatted version of the long descriptions document.

  The document at DESCRIPTIONS_URL is structured by its h1, h2, and h3
  headings.  h1 sections with titles in IGNORED_SECTIONS are skipped; for
  the others, the heading is written, followed by each h2 with its body
  and each h3 below that with its body.  Leading numbering like "12- " is
  removed from the h2 headings.

  The result is written to stdout through write_output.
  """
  soup = BeautifulSoup(get_with_cache(DESCRIPTIONS_URL), "html")
  # a section body ends where the next heading of any level starts
  body_stoppers = frozenset(["h1", "h2", "h3"])

  for h1 in soup.find_all("h1"):
    title = h1.text.strip()
    if title in IGNORED_SECTIONS:
      continue

    # advertise the stripped title, as that is what is compared against
    # IGNORED_SECTIONS above
    emit(
      "%% To ignore the following section, add '%s' to IGNORED_SECTIONS\n"%
      title)
    emit(format_el(h1))

    for h2 in find_siblings_until(h1, "h2", "h1"):
      emit(re.sub(r"\d\d?- ", "", format_el(h2)))
      emit(format_to_TeX(collect_siblings_until(h2, body_stoppers)))

      for h3 in find_siblings_until(h2, "h3", "h2"):
        emit(format_el(h3))
        emit(format_to_TeX(collect_siblings_until(h3, body_stoppers)))

  write_output()
################# Column table below here
def is_stupid_header_row(row):
  """returns true if we believe row is what the EPN-TAP people used
  as section separators in the columns table.

  That is: the text is red:-)

  Technically, we look at the first child of the first child of row's
  first cell and check whether its style attribute sets the color to
  pure red.  Rows that do not have this structure (including rows with
  empty cells) are not header rows.
  """
  try:
    perhaps_p = row.contents[0].contents[0]
    perhaps_span = perhaps_p.contents[0]
    return perhaps_span.get("style")=='color: rgb(255,0,0);'
  except (AttributeError, KeyError, IndexError):
    # text nodes have no contents or get, and empty rows or cells have
    # no first child; neither can be a header row.
    return False
def iter_column_meta():
  """yields dictionaries with the EPN-TAP column metadata snarfed
  from TABLE_URL.

  For each row of the metadata table, one dictionary is produced:

  * section separator rows (see is_stupid_header_row) yield a dictionary
    with the single key headline,
  * all other rows yield a dictionary mapping the column labels (name,
    type, unit, description, ucd, ucd_obscore, utype, comments) to the
    TeX-formatted, stripped content of the respective cells.

  Rows starting with a th (the table header) and rows without any
  content are skipped.

  A RuntimeError is raised if the document does not contain the metadata
  table.
  """
  soup = BeautifulSoup(get_with_cache(TABLE_URL), "html")
  table = soup.find("table",
    {"class": "wrapped relative-table confluenceTable"})
  if table is None:
    raise RuntimeError("No metadata table found at %s"%TABLE_URL)

  col_labels = ["name", "type", "unit", "description",
    "ucd", "ucd_obscore", "utype", "comments"]

  for row in table.findAll("tr"):
    if not row.contents:
      # nothing to look at in an empty row
      continue

    first_cell = row.contents[0]
    if first_cell.name=="th":
      # Skip the header row
      continue

    elif is_stupid_header_row(row):
      # a section separator; only its first cell is of interest
      yield {"headline": format_el(first_cell)}

    else:
      yield dict(zip(
        col_labels,
        [format_el(e).strip() for e in row.findAll("td")]))
def write_column_table():
  """write a TeX formatted rendering of the metadata table to stdout.

  The table is a five-column longtable (name, type, unit, description,
  UCD) built from what iter_column_meta yields.  Section separators
  become centered, bold rows spanning all columns.  In units and UCDs,
  single digits in parentheses are turned into references to the
  corresponding notes, and UCDs starting with a word followed by a dot
  are wrapped in a ucd macro.
  """
  note_pattern = re.compile(r"\((\d)\)")

  def link_notes(text):
    # (3) -> Note~\ref{atn-3}
    return note_pattern.sub(
      lambda mat: "Note~\\ref{atn-%s}"%mat.group(1),
      text)

  # Pretend everything happens inside a table so the formatters suppress
  # paragraph separators and line breaks within the cells.
  ELEMENT_STACK.append("table")
  try:
    emit("\\begingroup\\scriptsize")
    emit("\\begin{longtable}{p{4cm}p{1.6cm}p{1cm}p{8.2cm}"
      "p{3.5cm}}\n")
    head = ("\\sptablerule\n\\textbf{Name}"
      "&\\textbf{Type}"
      "&\\textbf{Unit}"
      "&\\textbf{Description}"
      "&\\textbf{UCD}\\\\"
      "\\sptablerule")
    emit("%s\\endfirsthead\n%s\\endhead\n"%(
      head, head))

    for rec in iter_column_meta():
      if "headline" in rec:
        emit("\\multicolumn{5}{c}{\\vrule width 0pt height 20pt depth 12pt"
          " \\textbf{%(headline)s}}\\\\\n"%rec)
      else:
        rec["unit"] = link_notes(rec["unit"])
        rec["ucd"] = link_notes(rec["ucd"])
        if re.match(r"\w+\.", rec["ucd"]):
          rec["ucd"] = "\\ucd{%s}"%rec["ucd"]
        emit("%(name)s&%(type)s&%(unit)s&%(description)s"
          "&%(ucd)s\\\\\n"%rec)

    emit("\\sptablerule\n")
    emit("\\end{longtable}\n")
    emit("\\endgroup\n")
    write_output()
  finally:
    # don't leave our fake table context behind in the global state
    ELEMENT_STACK.pop()
# Command line interface: the single argument selects which of the two
# TeX fragments is written to stdout; anything else yields a usage
# message on stderr.
if __name__=="__main__":
  what = sys.argv[1] if len(sys.argv)==2 else None

  if what=="columntable":
    write_column_table()
  elif what=="columndescription":
    write_column_description()
  else:
    # fill the program name into the message rather than printing a
    # literal %s
    sys.stderr.write(
      "Usage: %s columndescription|columntable\n"%sys.argv[0])
# vi:ts=2:et:sta