"""Utilities for interpreting mf2 data.
Microformats2 is a general way to mark up any HTML document with
classes and propeties. This module uses domain-specific assumptions
about the classes (specifically h-entry and h-event) to extract
certain interesting properties."""
from __future__ import unicode_literals
from collections import deque
from datetime import tzinfo, timedelta, datetime, date
import logging
import re
import string
import unicodedata
import sys
PY3 = sys.version_info[0] >= 3
# 2/3 compatibility
if PY3:
    from urllib.parse import urljoin
    from datetime import timezone
    utc = timezone.utc
    timezone_from_offset = timezone
    string_type = str
else:
    from urlparse import urljoin
    string_type = unicode

    # timezone shims for py2
    class UTC(tzinfo):
        """UTC timezone, from Python documentation
        https://docs.python.org/2/library/datetime.html#tzinfo-objects"""
        def utcoffset(self, dt):
            return timedelta(0)

        def tzname(self, dt):
            return "UTC"

        def dst(self, dt):
            return timedelta(0)

    class FixedOffset(tzinfo):
        """A class building tzinfo objects for fixed-offset time zones.
        Note that FixedOffset(0, "UTC") is a different way to build a
        UTC tzinfo object.

        Fixed offset in minutes east from UTC. From the Python 2 documentation
        https://docs.python.org/2/library/datetime.html#tzinfo-objects"""
        def __init__(self, offset, name):
            self.__offset = offset
            self.__name = name

        def utcoffset(self, dt):
            return self.__offset

        def tzname(self, dt):
            return self.__name

        def dst(self, dt):
            return timedelta(0)

    utc = UTC()
    timezone_from_offset = FixedOffset

URL_ATTRIBUTES = {
    'a': ['href'],
    'link': ['href'],
    'img': ['src'],
    'audio': ['src'],
    'video': ['src', 'poster'],
    'source': ['src'],
}

# From https://indieweb.org/location#How_to_determine_the_location_of_a_microformat
LOCATION_PROPERTIES = frozenset((
    'street-address',
    'extended-address',
    'post-office-box',
    'locality',
    'region',
    'postal-code',
    'country-name',
    'label',
    'latitude',
    'longitude',
    'altitude',
    'name',
))

def find_first_entry(parsed, types):
    """Find the first interesting h-* object in BFS-order

    :param dict parsed: a mf2py parsed dict
    :param list types: target types, e.g. ['h-entry', 'h-event']
    :return: an mf2py item that is one of `types`, or None
    """
    return next(_find_all_entries(parsed, types, False), None)

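
# Usage sketch for find_first_entry. The dict below is a minimal,
# hypothetical stand-in for mf2py parser output:
#
#   parsed = {'items': [{'type': ['h-entry'],
#                        'properties': {'name': ['Hello world']}}],
#             'rels': {}}
#   entry = find_first_entry(parsed, ['h-entry', 'h-event'])
#   # entry is the first matching item dict, or None if nothing matched
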
def find_all_entries(parsed, types, include_properties=False):
    """Find all h-* objects of a given type in BFS-order. Traverses the
    top-level items and their children and descendants. Includes property
    values (e.g. finding all h-cards would not find values of
    "p-author h-card") only if `include_properties` is True.

    :param dict parsed: a mf2py parsed dict
    :param list types: target types, e.g. ['h-entry', 'h-event']
    :param boolean include_properties: include properties in search of entries
    :return: all entries with any of the target types
    """
    return list(_find_all_entries(parsed, types, include_properties))

def _find_all_entries(parsed, types, include_properties):
    queue = deque(parsed['items'])
    while queue:
        item = queue.popleft()
        if any(h_class in item.get('type', []) for h_class in types):
            yield item
        queue.extend(item.get('children', []))
        if include_properties:
            queue.extend(prop for props in item.get('properties', {}).values()
                         for prop in props if isinstance(prop, dict))

def find_datetimes(parsed):
    """Find published, updated, start, and end dates.

    :param dict parsed: a mf2py parsed dict
    :return: a dictionary from property type to datetime or date
    """
    # h-entries carry published/updated; h-events carry start/end
    hentry = find_first_entry(parsed, ['h-entry', 'h-event'])
    result = {}
    if hentry:
        for prop in ('published', 'updated', 'start', 'end'):
            date_strs = hentry['properties'].get(prop, [])
            result[prop] = parse_datetime(' '.join(date_strs))
    return result

def get_plain_text(values, strip=True):
    """Get the first value in a list of values that we expect to be plain-text.
    If it is a dict, then return the value of "value".

    :param list values: a list of values
    :param boolean strip: true if we should strip the plaintext value
    :return: a string or None
    """
    if values:
        v = values[0]
        if isinstance(v, dict):
            v = v.get('value', '')
        if strip:
            v = v.strip()
        return v

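
# Behavior sketch for get_plain_text on typical mf2 property values:
#
#   get_plain_text(['  plain  '])                               # => 'plain'
#   get_plain_text([{'value': 'text', 'html': '<b>text</b>'}])  # => 'text'
#   get_plain_text([])                                          # => None
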
def classify_comment(parsed, target_urls):
    """Find and categorize comments that reference any of a collection of
    target URLs. Looks for references of type reply, like, and repost.

    :param dict parsed: a mf2py parsed dict
    :param list target_urls: a collection of urls that represent the
        target post. this can include alternate or shortened URLs.
    :return: a list of applicable comment types ['like', 'reply', 'repost']
    """
    def process_references(objs, reftypes, result):
        for obj in objs:
            if isinstance(obj, dict):
                if any(url in target_urls for url
                       in obj.get('properties', {}).get('url', [])):
                    result += (r for r in reftypes if r not in result)
            elif obj in target_urls:
                result += (r for r in reftypes if r not in result)

    result = []
    hentry = find_first_entry(parsed, ['h-entry'])
    if hentry:
        reply_type = []
        if 'rsvp' in hentry['properties']:
            reply_type.append('rsvp')
        if 'invitee' in hentry['properties']:
            reply_type.append('invite')
        reply_type.append('reply')

        # TODO handle rel=in-reply-to
        for prop in ('in-reply-to', 'reply-to', 'reply'):
            process_references(
                hentry['properties'].get(prop, []), reply_type, result)

        for prop in ('like-of', 'like'):
            process_references(
                hentry['properties'].get(prop, []), ('like',), result)

        for prop in ('repost-of', 'repost'):
            process_references(
                hentry['properties'].get(prop, []), ('repost',), result)

    return result

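
# Usage sketch: a parsed reply that links to our post (URLs are
# hypothetical) is classified by the reference types found:
#
#   parsed = {'items': [{'type': ['h-entry'],
#                        'properties': {
#                            'in-reply-to': ['https://example.com/post'],
#                        }}],
#             'rels': {}}
#   classify_comment(parsed, ['https://example.com/post'])  # => ['reply']
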
def parse_author(obj):
    """Parse the value of a u-author property, which can be either a
    compound h-card or a single name or url.

    :param object obj: the mf2 property value, either a dict or a string
    :result: a dict containing the author's name, photo, and url
    """
    result = {}
    if isinstance(obj, dict):
        names = obj['properties'].get('name')
        photos = obj['properties'].get('photo')
        urls = obj['properties'].get('url')
        if names:
            result['name'] = names[0]
        if photos:
            result['photo'] = photos[0]
        if urls:
            result['url'] = urls[0]
    elif obj:
        if obj.startswith('http://') or obj.startswith('https://'):
            result['url'] = obj
        else:
            result['name'] = obj
    return result

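
# Behavior sketch for parse_author on the shapes it accepts (values are
# hypothetical):
#
#   parse_author('https://example.com/')  # => {'url': 'https://example.com/'}
#   parse_author('Jane Doe')              # => {'name': 'Jane Doe'}
#   parse_author({'type': ['h-card'],
#                 'properties': {'name': ['Jane Doe'],
#                                'url': ['https://example.com/']}})
#   # => {'name': 'Jane Doe', 'url': 'https://example.com/'}
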
def find_author(parsed, source_url=None, hentry=None, fetch_mf2_func=None):
    """Use the authorship discovery algorithm
    https://indiewebcamp.com/authorship to determine an h-entry's
    author.

    :param dict parsed: an mf2py parsed dict.
    :param str source_url: the source of the parsed document.
    :param dict hentry: optional, the h-entry we're examining; if omitted,
        we'll just use the first one
    :param callable fetch_mf2_func: optional function that takes a URL
        and returns parsed mf2
    :return: a dict containing the author's name, photo, and url
    """
    def find_hentry_author(hentry):
        for obj in hentry['properties'].get('author', []):
            return parse_author(obj)

    def find_parent_hfeed_author(hentry):
        for hfeed in _find_all_entries(parsed, ['h-feed'], False):
            # find the h-entry's parent h-feed
            if hentry in hfeed.get('children', []):
                for obj in hfeed['properties'].get('author', []):
                    return parse_author(obj)

    if not hentry:
        hentry = find_first_entry(parsed, ['h-entry'])
        if not hentry:
            return None

    author_page = None

    # 3. if the h-entry has an author property, use that
    author = find_hentry_author(hentry)

    # 4. otherwise if the h-entry has a parent h-feed with author property,
    #    use that
    if not author:
        author = find_parent_hfeed_author(hentry)

    # 5. if an author property was found
    if author:
        # 5.2 otherwise if author property is an http(s) URL, let the
        #     author-page have that URL
        if list(author.keys()) == ['url']:
            author_page = author['url']
        # 5.1 if it has an h-card, use it, exit.
        # 5.3 otherwise use the author property as the author name, exit.
        else:
            return author

    # 6. if there is no author-page and the h-entry's page is a permalink page
    if not author_page:
        # 6.1 if the page has a rel-author link, let the author-page's
        #     URL be the href of the rel-author link
        rel_authors = parsed.get('rels', {}).get('author', [])
        if rel_authors:
            author_page = rel_authors[0]

    # 7. if there is an author-page URL
    if author_page:
        if not fetch_mf2_func:
            return {'url': author_page}

        # 7.1 get the author-page from that URL and parse it for microformats2
        parsed = fetch_mf2_func(author_page)
        hcards = find_all_entries(parsed, ['h-card'])

        # 7.2 if author-page has 1+ h-card with url == uid ==
        #     author-page's URL, then use first such h-card, exit.
        for hcard in hcards:
            hcard_url = get_plain_text(hcard['properties'].get('url'))
            hcard_uid = get_plain_text(hcard['properties'].get('uid'))
            if (hcard_url and hcard_uid and hcard_url == hcard_uid
                    and hcard_url == author_page):
                return parse_author(hcard)

        # 7.3 else if author-page has 1+ h-card with url property
        #     which matches the href of a rel-me link on the author-page
        #     (perhaps the same hyperlink element as the u-url, though not
        #     required to be), use first such h-card, exit.
        rel_mes = parsed.get('rels', {}).get('me', [])
        for hcard in hcards:
            hcard_url = get_plain_text(hcard['properties'].get('url'))
            if hcard_url and hcard_url in rel_mes:
                return parse_author(hcard)

        # 7.4 if the h-entry's page has 1+ h-card with url ==
        #     author-page URL, use first such h-card, exit.
        for hcard in hcards:
            hcard_url = get_plain_text(hcard['properties'].get('url'))
            if hcard_url and hcard_url == author_page:
                return parse_author(hcard)

    # 8. otherwise no deterministic author can be found.
    return None

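
# Usage sketch: fetch_mf2_func is what lets step 7 fetch and parse the
# author-page. A typical implementation wraps mf2py (assumed to be
# installed; this module does not require it):
#
#   import mf2py
#
#   def fetch_mf2(url):
#       return mf2py.parse(url=url)
#
#   author = find_author(parsed, 'https://example.com/post',
#                        fetch_mf2_func=fetch_mf2)
#   # => e.g. {'name': ..., 'url': ..., 'photo': ...} or None
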
def representative_hcard(parsed, source_url):
    """Find the representative h-card for a URL
    http://microformats.org/wiki/representative-h-card-parsing

    :param dict parsed: an mf2 parsed dict
    :param str source_url: the source of the parsed document.
    :return: the representative h-card if one is found
    """
    hcards = find_all_entries(parsed, ['h-card'], include_properties=True)
    # uid and url both match source_url
    for hcard in hcards:
        if (source_url in hcard['properties'].get('uid', [])
                and source_url in hcard['properties'].get('url', [])):
            return hcard
    # url that is also a rel=me
    for hcard in hcards:
        if any(url in parsed.get('rels', {}).get('me', [])
               for url in hcard['properties'].get('url', [])):
            return hcard
    # single hcard with matching url
    found = None
    count = 0
    for hcard in hcards:
        if source_url in hcard['properties'].get('url', []):
            found = hcard
            count += 1
    if count == 1:
        return found

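
# Usage sketch (hypothetical page whose h-card uid and url both match the
# page URL, satisfying the first representative h-card rule):
#
#   parsed = {'items': [{'type': ['h-card'],
#                        'properties': {'uid': ['https://example.com/'],
#                                       'url': ['https://example.com/'],
#                                       'name': ['Jane Doe']}}],
#             'rels': {}}
#   hcard = representative_hcard(parsed, 'https://example.com/')
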
def convert_relative_paths_to_absolute(source_url, base_href, html):
    """Attempt to convert relative paths in foreign content
    to absolute based on the source url of the document. Useful for
    displaying images or links in reply contexts and comments.

    Gets the list of tags/attributes from `URL_ATTRIBUTES`. Note that this
    function uses a regular expression to avoid adding a library
    dependency on a proper parser.

    :param str source_url: the source of the parsed document.
    :param str base_href: (optional) the href value of the base tag
    :param str html: the text of the source document
    :return: the document with relative urls replaced with absolute ones
    """
    def do_convert(match):
        base_url = urljoin(source_url, base_href) if base_href else source_url
        return (match.string[match.start(0):match.start(1)] +
                urljoin(base_url, match.group(1)) +
                match.string[match.end(1):match.end(0)])

    if source_url:
        for tagname, attributes in URL_ATTRIBUTES.items():
            for attribute in attributes:
                pattern = re.compile(
                    r'<%s[^>]*?%s\s*=\s*[\'"](.*?)[\'"]' % (tagname, attribute),
                    flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
                html = pattern.sub(do_convert, html)

    return html

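
# Behavior sketch for the regex-based rewriting (no base tag here):
#
#   convert_relative_paths_to_absolute(
#       'https://example.com/2024/post', None,
#       '<img src="photo.jpg"><a href="/about">about</a>')
#   # => '<img src="https://example.com/2024/photo.jpg">'
#   #    '<a href="https://example.com/about">about</a>'
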
def is_name_a_title(name, content):
    """Determine whether the name property represents an explicit title.

    Typically when parsing an h-entry, we check whether p-name ==
    e-content (value). If they are non-equal, then p-name likely
    represents a title.

    However, occasionally we come across an h-entry that does not
    provide an explicit p-name. In this case, the name is
    automatically generated by converting the entire h-entry content
    to plain text. This definitely does not represent a title, and
    looks very bad when displayed as such.

    To handle this case, we broaden the equality check to see if
    content is a subset of name. We also strip out non-alphanumeric
    characters just to make the check a little more forgiving.

    :param str name: the p-name property that may represent a title
    :param str content: the plain-text version of an e-content property
    :return: True if the name likely represents a separate, explicit title
    """
    def normalize(s):
        if not isinstance(s, string_type):
            s = s.decode('utf-8')
        s = unicodedata.normalize('NFKD', s)
        s = s.lower()
        s = re.sub('[' + string.whitespace + string.punctuation + ']', '', s)
        return s

    if not content:
        return True
    if not name:
        return False
    return normalize(content) not in normalize(name)

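
# Behavior sketch: an auto-generated name (equal to the content) is not a
# title, while a distinct name is:
#
#   is_name_a_title('Went for a walk.', 'Went for a walk.')   # => False
#   is_name_a_title('My Great Post', 'Lots of body text...')  # => True
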
def post_type_discovery(hentry):
    """Implementation of the post-type discovery algorithm
    defined here https://indiewebcamp.com/post-type-discovery#Algorithm

    :param dict hentry: mf2 item representing the entry to test
    :return: string, one of: 'org', 'person', 'event', 'rsvp',
        'invite', 'reply', 'repost', 'like', 'photo', 'article',
        'note', 'follow'
    """
    props = hentry.get('properties', {})
    if 'h-card' in hentry.get('type', []):
        name = get_plain_text(props.get('name'))
        org = get_plain_text(props.get('org'))
        if name and org and name == org:
            return 'org'
        return 'person'

    if 'h-event' in hentry.get('type', []):
        return 'event'

    for prop, implied_type in [
            ('rsvp', 'rsvp'),
            ('invitee', 'invite'),
            ('in-reply-to', 'reply'),
            ('repost-of', 'repost'),
            ('like-of', 'like'),
            ('follow-of', 'follow'),
            ('photo', 'photo'),
    ]:
        if props.get(prop) is not None:
            return implied_type

    # check name ~= content
    name = get_plain_text(props.get('name'))
    content = get_plain_text(props.get('content'))
    if not content:
        content = get_plain_text(props.get('summary'))
    if content and name and is_name_a_title(name, content):
        return 'article'
    return 'note'

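
# Usage sketch with a hypothetical minimal like post:
#
#   hentry = {'type': ['h-entry'],
#             'properties': {'like-of': ['https://example.com/post']}}
#   post_type_discovery(hentry)  # => 'like'
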
def parse_datetime(s):
    """The microformats2 definition for dt-* properties is fairly
    lenient. This method converts an mf2 date string into either a
    datetime.date or datetime.datetime object. Datetimes will be naive
    unless a timezone is specified.

    :param str s: a mf2 string representation of a date or datetime
    :return: datetime.date or datetime.datetime
    :raises ValueError: if the string is not recognizable
    """
    if not s:
        return None

    s = re.sub(r'\s+', ' ', s)
    date_re = r"(?P<year>\d{4,})-(?P<month>\d{1,2})-(?P<day>\d{1,2})"
    time_re = r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(:(?P<second>\d{2})(\.(?P<microsecond>\d+))?)?"
    tz_re = r"(?P<tzz>Z)|(?P<tzsign>[+-])(?P<tzhour>\d{1,2}):?(?P<tzminute>\d{2})"
    dt_re = "%s((T| )%s ?(%s)?)?$" % (date_re, time_re, tz_re)

    m = re.match(dt_re, s)
    if not m:
        raise ValueError('unrecognized datetime %s' % s)

    year = m.group('year')
    month = m.group('month')
    day = m.group('day')

    hour = m.group('hour')
    if not hour:
        return date(int(year), int(month), int(day))

    minute = m.group('minute') or "00"
    second = m.group('second') or "00"
    dt = datetime(int(year), int(month), int(day), int(hour),
                  int(minute), int(second))
    if m.group('tzz'):
        dt = dt.replace(tzinfo=utc)
    else:
        tzsign = m.group('tzsign')
        tzhour = m.group('tzhour')
        tzminute = m.group('tzminute') or "00"
        if tzsign and tzhour:
            offset = timedelta(hours=int(tzhour),
                               minutes=int(tzminute))
            if tzsign == '-':
                offset = -offset
            dt = dt.replace(tzinfo=timezone_from_offset(
                offset, '%s%s:%s' % (tzsign, tzhour, tzminute)))
    return dt

parse_dt = parse_datetime  # backcompat
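
# Behavior sketch of the accepted formats (dates, naive datetimes, and
# tz-aware datetimes with 'Z' or a numeric offset):
#
#   parse_datetime('2015-04-01')                 # => datetime.date
#   parse_datetime('2015-04-01 12:30')           # => naive datetime.datetime
#   parse_datetime('2015-04-01T12:30:00Z')       # => datetime in UTC
#   parse_datetime('2015-04-01T12:30:00-08:00')  # => datetime at UTC-08:00
#   parse_datetime('not a date')                 # raises ValueError
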
def _interpret_common_properties(
        parsed, source_url, base_href, hentry, use_rel_syndication,
        want_json, fetch_mf2_func):
    result = {}
    props = hentry['properties']

    for prop in ('url', 'uid', 'photo', 'featured', 'logo'):
        value = get_plain_text(props.get(prop))
        if value:
            result[prop] = value

    for prop in ('start', 'end', 'published', 'updated', 'deleted'):
        date_str = get_plain_text(props.get(prop))
        if date_str:
            if want_json:
                result[prop] = date_str
            else:
                result[prop + '-str'] = date_str
                try:
                    # avoid shadowing the datetime.date class imported above
                    parsed_date = parse_datetime(date_str)
                    if parsed_date:
                        result[prop] = parsed_date
                except ValueError:
                    logging.warning('Failed to parse datetime %s', date_str)

    author = find_author(parsed, source_url, hentry, fetch_mf2_func)
    if author:
        result['author'] = author

    content_prop = props.get('content')
    content_value = None
    if content_prop:
        if isinstance(content_prop[0], dict):
            content_html = content_prop[0].get('html', '').strip()
            content_value = content_prop[0].get('value', '').strip()
        else:
            content_value = content_html = content_prop[0]
        result['content'] = convert_relative_paths_to_absolute(
            source_url, base_href, content_html)
        result['content-plain'] = content_value

    summary_prop = props.get('summary')
    if summary_prop:
        if isinstance(summary_prop[0], dict):
            result['summary'] = summary_prop[0]['value']
        else:
            result['summary'] = summary_prop[0]

    # Collect location objects, then follow this algorithm to consolidate
    # their properties:
    # https://indieweb.org/location#How_to_determine_the_location_of_a_microformat
    location_stack = [props]

    for prop in ('location', 'adr'):
        vals = props.get(prop)
        if vals:
            if isinstance(vals[0], string_type):
                location_stack.append({'name': vals})
            else:
                location_stack.append(vals[0].get('properties', {}))

    geo = props.get('geo')
    if geo:
        if isinstance(geo[0], dict):
            location_stack.append(geo[0].get('properties', {}))
        else:
            if geo[0].startswith('geo:'):
                # a geo: URL. try to parse it. https://tools.ietf.org/html/rfc5870
                parts = geo[0][len('geo:'):].split(';')[0].split(',')
                if len(parts) >= 2:
                    location_stack.append({
                        'latitude': [parts[0]],
                        'longitude': [parts[1]],
                        'altitude': [parts[2]] if len(parts) >= 3 else [],
                    })

    for prop in LOCATION_PROPERTIES:
        for obj in location_stack:
            if obj and obj.get(prop) and not (obj == props and prop == 'name'):
                result.setdefault('location', {})[prop] = obj[prop][0]

    if use_rel_syndication:
        result['syndication'] = list(set(
            parsed.get('rels', {}).get('syndication', []) +
            hentry['properties'].get('syndication', [])))
    else:
        result['syndication'] = hentry['properties'].get('syndication', [])

    return result

def interpret_event(
        parsed, source_url, base_href=None, hevent=None,
        use_rel_syndication=True, want_json=False, fetch_mf2_func=None):
    """Given a document containing an h-event, return a dictionary::

        {
         'type': 'event',
         'url': the permalink url of the document (may be different than source_url),
         'start': datetime or date,
         'end': datetime or date,
         'name': plain-text event name,
         'content': body of event description (contains HTML)
        }

    :param dict parsed: the result of parsing a document containing mf2 markup
    :param str source_url: the URL of the parsed document, used for authorship
        discovery and converting relative paths
    :param str base_href: (optional) the href value of the base tag
    :param dict hevent: (optional) the item in the above document representing
        the h-event. if provided, we can avoid a redundant call to
        find_first_entry
    :param boolean use_rel_syndication: (optional, default True) Whether
        to include rel=syndication in the list of syndication sources.
        Sometimes useful to set this to False when parsing h-feeds that
        erroneously include rel=syndication on each entry.
    :param boolean want_json: (optional, default False) if true, the result
        will be pure json with datetimes as strings instead of python objects
    :param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
        output for a given URL.
    :return: a dict with some or all of the described properties
    """
    # find the h-event if it wasn't provided
    if not hevent:
        hevent = find_first_entry(parsed, ['h-event'])
        if not hevent:
            return {}

    result = _interpret_common_properties(
        parsed, source_url, base_href, hevent, use_rel_syndication, want_json,
        fetch_mf2_func)
    result['type'] = 'event'
    name_value = get_plain_text(hevent['properties'].get('name'))
    if name_value:
        result['name'] = name_value
    return result

def interpret_entry(
        parsed, source_url, base_href=None, hentry=None,
        use_rel_syndication=True, want_json=False, fetch_mf2_func=None):
    """Given a document containing an h-entry, return a dictionary::

        {
         'type': 'entry',
         'url': the permalink url of the document (may be different than source_url),
         'published': datetime or date,
         'updated': datetime or date,
         'name': title of the entry,
         'content': body of entry (contains HTML),
         'author': {
          'name': author name,
          'url': author url,
          'photo': author photo
         },
         'syndication': [
          'syndication url',
          ...
         ],
         'in-reply-to': [...],
         'like-of': [...],
         'repost-of': [...],
        }

    :param dict parsed: the result of parsing a document containing mf2 markup
    :param str source_url: the URL of the parsed document, used by the
        authorship algorithm
    :param str base_href: (optional) the href value of the base tag
    :param dict hentry: (optional) the item in the above document
        representing the h-entry. if provided, we can avoid a redundant
        call to find_first_entry
    :param boolean use_rel_syndication: (optional, default True) Whether
        to include rel=syndication in the list of syndication sources.
        Sometimes useful to set this to False when parsing h-feeds that
        erroneously include rel=syndication on each entry.
    :param boolean want_json: (optional, default False) if true, the result
        will be pure json with datetimes as strings instead of python objects
    :param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
        output for a given URL.
    :return: a dict with some or all of the described properties
    """
    # find the h-entry if it wasn't provided
    if not hentry:
        hentry = find_first_entry(parsed, ['h-entry'])
        if not hentry:
            return {}

    result = _interpret_common_properties(
        parsed, source_url, base_href, hentry, use_rel_syndication, want_json,
        fetch_mf2_func)
    if 'h-cite' in hentry.get('type', []):
        result['type'] = 'cite'
    else:
        result['type'] = 'entry'

    title = get_plain_text(hentry['properties'].get('name'))
    if title and is_name_a_title(title, result.get('content-plain')):
        result['name'] = title

    for prop in ('in-reply-to', 'like-of', 'repost-of', 'bookmark-of',
                 'comment', 'like', 'repost'):
        for url_val in hentry['properties'].get(prop, []):
            if isinstance(url_val, dict):
                result.setdefault(prop, []).append(
                    interpret(parsed, source_url, base_href, url_val,
                              use_rel_syndication=False,
                              want_json=want_json,
                              fetch_mf2_func=fetch_mf2_func))
            else:
                result.setdefault(prop, []).append({
                    'url': url_val,
                })
    return result

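
# Usage sketch (assumes mf2py for fetching and parsing; URL is hypothetical):
#
#   import mf2py
#   url = 'https://example.com/2024/hello-world'
#   entry = interpret_entry(mf2py.parse(url=url), url)
#   if entry:
#       print(entry['type'], entry.get('name'), entry.get('published'))
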
def interpret_feed(parsed, source_url, base_href=None, hfeed=None,
                   want_json=False, fetch_mf2_func=None):
    """Interpret a source page as an h-feed or as a top-level collection
    of h-entries.

    :param dict parsed: the result of parsing a mf2 document
    :param str source_url: the URL of the source document (used for authorship
        discovery)
    :param str base_href: (optional) the href value of the base tag
    :param dict hfeed: (optional) the h-feed to be parsed. If provided,
        this will be used instead of the first h-feed on the page.
    :param boolean want_json: (optional, default False) if true, the result
        will be pure json with datetimes as strings instead of python objects
    :param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
        output for a given URL.
    :return: a dict containing 'entries', a list of entries, and possibly other
        feed properties (like 'name').
    """
    result = {}
    # find the first feed if it wasn't provided
    if not hfeed:
        hfeed = find_first_entry(parsed, ['h-feed'])

    if hfeed:
        names = hfeed['properties'].get('name')
        if names:
            result['name'] = names[0]
        children = hfeed.get('children', [])
    # just use the top level 'items' as the feed children
    else:
        children = parsed.get('items', [])

    entries = []
    for child in children:
        entry = interpret(
            parsed, source_url, base_href, item=child,
            use_rel_syndication=False, want_json=want_json,
            fetch_mf2_func=fetch_mf2_func)
        if entry:
            entries.append(entry)
    result['entries'] = entries
    return result

def interpret(parsed, source_url, base_href=None, item=None,
              use_rel_syndication=True, want_json=False, fetch_mf2_func=None):
    """Interpret a permalink of unknown type. Finds the first interesting
    h-* element, and delegates to :func:`interpret_entry` if it is an
    h-entry or h-cite, or to :func:`interpret_event` for an h-event.

    :param dict parsed: the result of parsing a mf2 document
    :param str source_url: the URL of the source document (used for authorship
        discovery)
    :param str base_href: (optional) the href value of the base tag
    :param dict item: (optional) the item to be parsed. If provided,
        this will be used instead of the first element on the page.
    :param boolean use_rel_syndication: (optional, default True) Whether
        to include rel=syndication in the list of syndication sources.
        Sometimes useful to set this to False when parsing h-feeds that
        erroneously include rel=syndication on each entry.
    :param boolean want_json: (optional, default False) If true, the result
        will be pure json with datetimes as strings instead of python objects
    :param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
        output for a given URL.
    :return: a dict as described by interpret_entry or interpret_event, or None
    """
    if not item:
        item = find_first_entry(parsed, ['h-entry', 'h-event'])

    if item:
        types = item.get('type', [])
        if 'h-event' in types:
            return interpret_event(
                parsed, source_url, base_href=base_href, hevent=item,
                use_rel_syndication=use_rel_syndication, want_json=want_json,
                fetch_mf2_func=fetch_mf2_func)
        elif 'h-entry' in types or 'h-cite' in types:
            return interpret_entry(
                parsed, source_url, base_href=base_href, hentry=item,
                use_rel_syndication=use_rel_syndication, want_json=want_json,
                fetch_mf2_func=fetch_mf2_func)

def interpret_comment(parsed, source_url, target_urls, base_href=None,
                      want_json=False, fetch_mf2_func=None):
    """Interpret received webmentions, and classify as like, reply, or
    repost (or a combination thereof). Returns a dict as described
    in :func:`interpret_entry`, with the additional fields::

        {
         'comment_type': a list of strings, zero or more of
             'like', 'reply', or 'repost'
         'rsvp': a string containing the rsvp response (optional)
        }

    :param dict parsed: the result of parsing an mf2 document
    :param str source_url: the URL of the source document
    :param list target_urls: a collection containing the URL of the target
        document, and any alternate URLs (e.g., shortened links) that should
        be considered equivalent when looking for references
    :param str base_href: (optional) the href value of the base tag
    :param boolean want_json: (optional, default False) If true, the result
        will be pure json with datetimes as strings instead of python objects
    :param callable fetch_mf2_func: (optional) function to fetch mf2 parsed
        output for a given URL.
    :return: a dict as described above, or None
    """
    item = find_first_entry(parsed, ['h-entry'])
    if item:
        result = interpret_entry(parsed, source_url, base_href=base_href,
                                 hentry=item, want_json=want_json,
                                 fetch_mf2_func=fetch_mf2_func)
        if result:
            result['comment_type'] = classify_comment(parsed, target_urls)
            rsvp = get_plain_text(item['properties'].get('rsvp'))
            if rsvp:
                result['rsvp'] = rsvp.lower()

            invitees = item['properties'].get('invitee')
            if invitees:
                result['invitees'] = [
                    parse_author(inv) for inv in invitees]
        return result

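
# Usage sketch for a webmention receiver (hypothetical URLs; assumes mf2py):
#
#   import mf2py
#   source = 'https://commenter.example/reply-123'
#   comment = interpret_comment(
#       mf2py.parse(url=source), source,
#       target_urls=['https://example.com/post', 'https://ex.am/p1'])
#   if comment:
#       print(comment['comment_type'])  # e.g. ['reply']
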