-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathhtml2schedule.py
executable file
·123 lines (99 loc) · 3.68 KB
/
html2schedule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: UTF-8 -*-
import os
import sys
from collections import OrderedDict
from datetime import datetime
import locale
import argparse
import requests
import pytz
from bs4 import BeautifulSoup, element
import voc.tools
from voc.schedule import Schedule, Event
# Date parsing further down uses '%A' (locale-dependent weekday name), so the
# process locale is switched to German up front — presumably because the
# source page spells weekdays in German; confirm against the page.
locale.setlocale(locale.LC_ALL, 'de_DE.UTF-8')

# Timezone used to make the scraped (naive) start time timezone-aware.
tz = pytz.timezone("Europe/Amsterdam")

# Command line interface: one required positional (the series acronym) and
# three optional settings with CADUS-specific defaults.
parser = argparse.ArgumentParser()
parser.add_argument(
    'acronym',
    help="Series acronym, e.g. cadusdebate",
)
parser.add_argument(
    '--title',
    default='CADUS Debate!',
    help="Series title, e.g. CADUS Debate!",
)
parser.add_argument(
    '--url',
    default='https://cadus.org/debate',
    help="Source url, e.g. https://cadus.org/debate",
)
parser.add_argument(
    '-o',
    dest="output",
    default='current',
    help="output filename, e.g. current",
)

# Arguments are parsed at import time; the lower-cased acronym is shared by
# the rest of the script as a module-level global.
args = parser.parse_args()
acronym = args.acronym.lower()
def fetch_schedule(series_title, source_url):
    """Scrape one event from ``source_url`` and return it wrapped in a Schedule.

    The page is downloaded, its info box is parsed for the start time, and a
    single Event is added to a Schedule created from the template. Room,
    language and the two hour duration are fixed values. The module-level
    globals ``acronym`` and ``tz`` are read.

    Args:
        series_title: Title passed to ``Schedule.from_template``.
        source_url: URL of the HTML page to scrape; also stored as event url.

    Returns:
        The populated ``Schedule`` object.

    Raises:
        requests.RequestException: if the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: if no start date/time can be extracted from the page.
    """
    print("Requesting source")

    # A timeout keeps the script from hanging forever on a stalled server,
    # and an HTTP error status is reported as such instead of being parsed
    # as if it were the event page.
    response = requests.get(source_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html5lib')

    start = _parse_start(soup.select('div.info-box > p'))
    duration = 2 * 60  # minutes

    schedule = Schedule.from_template(
        title=series_title,
        acronym=acronym,
        year=start.year,
        month=start.month,
        day=start.day)
    schedule.schedule().version = '1.0'

    guid = voc.tools.gen_uuid(f'{start}-{acronym}')

    # The local id is the number of element (Tag) children of the first
    # <h3>'s parent; text nodes and comments are not counted.
    # NOTE(review): presumably one element per debate listed on the page —
    # verify against the page markup.
    local_id = sum(
        1 for node in soup.find('h3').parent.children
        if isinstance(node, element.Tag)
    )

    title = soup.select('h2')[0].text
    body = soup.select('div.ce_rs_column_start > div.ce_text.block > p')[0]

    # Never populated in this function; kept so the event dict below retains
    # its full set of keys.
    abstract = None
    persons = []

    # Link extraction is best effort: a failure must not abort the export,
    # but it is reported instead of being swallowed silently.
    external_links = {}
    try:
        external_links = voc.tools.parse_html_formatted_links(body)
    except Exception as err:
        print(
            "Could not extract links from event description: {}".format(err),
            file=sys.stderr,
        )

    print("Found event on {} with title '{}'".format(start, title))

    schedule.add_event(Event({
        'id': local_id,
        'guid': guid,
        # no 'logo' entry is emitted
        'date': start.isoformat(),
        'start': start.strftime('%H:%M'),
        'duration': '%d:%02d' % divmod(duration, 60),
        'room': 'Crisis Response Makerspace',
        'slug': '{slug}-{id}-{name}'.format(
            slug=acronym,
            id=local_id,
            name=voc.tools.normalise_string(title.lower())
        ),
        'url': source_url,
        'title': title,
        'subtitle': 'debate {id}'.format(id=local_id),
        'track': None,
        'type': None,
        'language': 'de',
        'abstract': abstract or '',
        'description': str(body),
        'persons': [{
            'id': 0,
            'public_name': p.strip(),
        } for p in persons],
        'links': [
            {'url': link_url, 'title': link_title}
            for link_url, link_title in external_links.items()
        ]
    }))

    return schedule


def _parse_start(infobox):
    """Return the timezone-aware event start parsed from the info box paragraphs.

    The first paragraph (with the word 'Start' removed) is tried as a full
    date-and-time string. If that fails, the last space-separated token of
    the first paragraph containing 'Stream:' is appended as the time of day
    and parsing is retried.

    Raises:
        ValueError: if the info box is empty, no 'Stream:' paragraph exists
            for the fallback, or the combined string still does not parse.
    """
    # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    # '%A' matches the weekday name of the active locale.
    start_format = '%A %d.%m.%Y %H:%M'

    if not infobox:
        raise ValueError("No 'div.info-box > p' elements found in source page")

    date = infobox[0].get_text().replace('Start', '')
    try:
        naive_start = datetime.strptime(date, start_format)
    except ValueError:
        stream_info = next(
            (p for p in infobox if 'Stream:' in p.get_text()), None
        )
        if stream_info is None:
            raise ValueError(
                "Cannot determine start time from {!r}".format(date)
            ) from None
        time_of_day = stream_info.get_text().strip().split(' ')[-1]
        naive_start = datetime.strptime(f'{date} {time_of_day}', start_format)

    return tz.localize(naive_start)
def main():
    """Build the schedule from the command line settings and export it.

    Reads the parsed module-level ``args`` (title, url, output name), then
    prints an empty line followed by 'end' once the export call returns.
    """
    fetch_schedule(args.title, args.url).export(args.output)

    print('')
    print('end')
if __name__ == '__main__':
    # Script entry point: pick an output directory, change into it and run
    # main(). The preferred location is under /srv/www/, with a directory
    # relative to the current working directory as fallback.
    output_dir = "/srv/www/" + acronym
    secondary_output_dir = "./" + acronym

    # NOTE(review): 'acronym' is a required positional argument, so when the
    # script is started with only that argument, sys.argv[1] is the acronym
    # itself and replaces the /srv/www/ default here — confirm this is
    # intended.
    if len(sys.argv) == 2:
        output_dir = sys.argv[1]

    # If the chosen directory is missing, use the fallback when it already
    # exists; otherwise create the chosen directory (os.mkdir does not
    # create missing parent directories).
    if not os.path.exists(output_dir):
        if not os.path.exists(secondary_output_dir):
            os.mkdir(output_dir)
        else:
            output_dir = secondary_output_dir
    # NOTE(review): the original indentation was lost; os.chdir is assumed to
    # run unconditionally so that every branch ends up inside output_dir —
    # confirm against the upstream file.
    os.chdir(output_dir)

    main()