Skip to content

Commit

Permalink
reduced implied urls (fixes microformats#117), added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Sven Knebel committed Jul 24, 2018
1 parent 769bd2a commit bfef837
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 19 deletions.
37 changes: 18 additions & 19 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,22 +165,22 @@ def handle_microformat(root_class_names, el, value_property=None,
properties = self.dict_class()
children = []
self._default_date = None
# flag for processing implied name
do_implied_name = True
# for processing implied properties: collects if property types (p, e, u, d(t)) or children (h) have been processed
parsed_types_aggregation = set()

if backcompat_mode:
el = backcompat.apply_rules(el, self.__html_parser__)
root_class_names = mf2_classes.root(el.get('class',[]))

# parse for properties and children
for child in get_children(el):
child_props, child_children, child_stops_implied_name = parse_props(child)
child_props, child_children, child_parsed_types_aggregation = parse_props(child)
for key, new_value in child_props.items():
prop_value = properties.get(key, [])
prop_value.extend(new_value)
properties[key] = prop_value
children.extend(child_children)
do_implied_name = do_implied_name and not child_stops_implied_name
parsed_types_aggregation.update(child_parsed_types_aggregation)

# complex h-* objects can take their "value" from the
# first explicit property ("name" for p-* or "url" for u-*)
Expand All @@ -190,7 +190,7 @@ def handle_microformat(root_class_names, el, value_property=None,
# if some properties not already found find in implied ways unless in backcompat mode
if not backcompat_mode:
# stop implied name if any p-*, e-*, h-* is already found
if "name" not in properties and do_implied_name:
if "name" not in properties and parsed_types_aggregation.isdisjoint("peh"):

properties["name"] = [implied_properties.name(el, base_url=self.__url__)]

Expand All @@ -199,7 +199,7 @@ def handle_microformat(root_class_names, el, value_property=None,
if x is not None:
properties["photo"] = [x]

if "url" not in properties:
if "url" not in properties and parsed_types_aggregation.isdisjoint("uh"):
x = implied_properties.url(el, base_url=self.__url__)
if x is not None:
properties["url"] = [x]
Expand Down Expand Up @@ -241,8 +241,8 @@ def parse_props(el):
"""
props = self.dict_class()
children = []
# Does this element stop implied name?
stops_implied_name = False
# for processing implied properties: collects if property types (p, e, u, d(t)) or children (h) have been processed
parsed_types_aggregation = set()

classes = el.get("class", [])
# Is this element a microformat2 root?
Expand All @@ -254,6 +254,9 @@ def parse_props(el):
root_class_names = backcompat.root(classes)
backcompat_mode = True

if root_class_names:
parsed_types_aggregation.add('h')

# Is this a property element (p-*, u-*, etc.) flag
# False is default
is_property_el = False
Expand All @@ -262,16 +265,14 @@ def parse_props(el):
p_value = None
for prop_name in mf2_classes.text(classes):
is_property_el = True
stops_implied_name = True
parsed_types_aggregation.add('p')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
if p_value is None:
p_value = text_type(parse_property.text(el, base_url=self.__url__))


if root_class_names:
stops_implied_name = True
prop_value.append(handle_microformat(
root_class_names, el, value_property="name",
simple_value=p_value, backcompat_mode=backcompat_mode))
Expand All @@ -282,14 +283,14 @@ def parse_props(el):
u_value = None
for prop_name in mf2_classes.url(classes):
is_property_el = True
parsed_types_aggregation.add('u')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
if u_value is None:
u_value = parse_property.url(el, self.dict_class, self.__img_with_alt__, base_url=self.__url__)

if root_class_names:
stops_implied_name = True
prop_value.append(handle_microformat(
root_class_names, el, value_property="url",
simple_value=u_value, backcompat_mode=backcompat_mode))
Expand All @@ -303,6 +304,7 @@ def parse_props(el):
dt_value = None
for prop_name in mf2_classes.datetime(classes):
is_property_el = True
parsed_types_aggregation.add('d')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
Expand All @@ -326,7 +328,7 @@ def parse_props(el):
e_value = None
for prop_name in mf2_classes.embedded(classes):
is_property_el = True
stops_implied_name = True
parsed_types_aggregation.add('e')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
Expand All @@ -347,21 +349,18 @@ def parse_props(el):
# if this is not a property element, but it is a h-* microformat,
# add it to our list of children
if not is_property_el and root_class_names:
stops_implied_name = True
children.append(handle_microformat(root_class_names, el, backcompat_mode=backcompat_mode))

# parse child tags, provided this isn't a microformat root-class
if not root_class_names:
for child in get_children(el):
child_properties, child_microformats, child_stops_implied_name = parse_props(child)
child_properties, child_microformats, child_parsed_types_aggregation = parse_props(child)
for prop_name in child_properties:
v = props.get(prop_name, [])
v.extend(child_properties[prop_name])
props[prop_name] = v
children.extend(child_microformats)
stops_implied_name = stops_implied_name or child_stops_implied_name

return props, children, stops_implied_name
parsed_types_aggregation.update(child_parsed_types_aggregation)
return props, children, parsed_types_aggregation

def parse_rels(el):
"""Parse an element for rel microformats
Expand Down
17 changes: 17 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,23 @@ def test_implied_url():
for i in range(12, 23):
assert_false("url" in result["items"][i]["properties"])

def test_stop_implied_url():
"""testing that explicit properties case implied url-parsing to be aborted"""

result = parse_fixture("implied_properties/stop_implied_url.html")

assert_false("url" in result["items"][0]["properties"])
assert_false("url" in result["items"][1]["properties"])
assert_false("url" in result["items"][2]["properties"])
assert_false("url" in result["items"][3]["properties"])
assert_false("url" in result["items"][4]["properties"])
assert_false("url" in result["items"][5]["properties"])

assert_equal(result["items"][6]["properties"]["url"], ["http://example.com/"])
assert_equal(result["items"][7]["properties"]["url"], ["http://example.com/"])
assert_equal(result["items"][8]["properties"]["url"], ["http://example.com/"])
assert_equal(result["items"][9]["properties"]["url"], ["http://example.com/"])

def test_implied_nested_photo():
result = parse_fixture("implied_properties/implied_properties.html", url="http://bar.org")
assert_equal(result["items"][2]["properties"]["photo"][0],
Expand Down

0 comments on commit bfef837

Please sign in to comment.