Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reduced implied urls (fixes #117), added tests #119

Merged
merged 2 commits into from
Jul 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 19 additions & 20 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,22 +165,22 @@ def handle_microformat(root_class_names, el, value_property=None,
properties = self.dict_class()
children = []
self._default_date = None
# flag for processing implied name
do_implied_name = True
# for processing implied properties: collects if property types (p, e, u, d(t)) or children (h) have been processed
parsed_types_aggregation = set()

if backcompat_mode:
el = backcompat.apply_rules(el, self.__html_parser__)
root_class_names = mf2_classes.root(el.get('class',[]))

# parse for properties and children
for child in get_children(el):
child_props, child_children, child_stops_implied_name = parse_props(child)
child_props, child_children, child_parsed_types_aggregation = parse_props(child)
for key, new_value in child_props.items():
prop_value = properties.get(key, [])
prop_value.extend(new_value)
properties[key] = prop_value
children.extend(child_children)
do_implied_name = do_implied_name and not child_stops_implied_name
parsed_types_aggregation.update(child_parsed_types_aggregation)

# complex h-* objects can take their "value" from the
# first explicit property ("name" for p-* or "url" for u-*)
Expand All @@ -190,16 +190,16 @@ def handle_microformat(root_class_names, el, value_property=None,
# if some properties not already found find in implied ways unless in backcompat mode
if not backcompat_mode:
# stop implied name if any p-*, e-*, h-* is already found
if "name" not in properties and do_implied_name:

if "name" not in properties and parsed_types_aggregation.isdisjoint("peh"):
properties["name"] = [implied_properties.name(el, base_url=self.__url__)]

if "photo" not in properties:
x = implied_properties.photo(el, self.dict_class, self.__img_with_alt__, base_url=self.__url__)
if x is not None:
properties["photo"] = [x]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be good to add a comment like in the implied name thing

if "url" not in properties:
# stop implied url if any u-* or h-* is already found
if "url" not in properties and parsed_types_aggregation.isdisjoint("uh"):
x = implied_properties.url(el, base_url=self.__url__)
if x is not None:
properties["url"] = [x]
Expand Down Expand Up @@ -241,8 +241,8 @@ def parse_props(el):
"""
props = self.dict_class()
children = []
# Does this element stop implied name?
stops_implied_name = False
# for processing implied properties: collects if property types (p, e, u, d(t)) or children (h) have been processed
parsed_types_aggregation = set()

classes = el.get("class", [])
# Is this element a microformat2 root?
Expand All @@ -254,6 +254,9 @@ def parse_props(el):
root_class_names = backcompat.root(classes)
backcompat_mode = True

if root_class_names:
parsed_types_aggregation.add('h')

# Is this a property element (p-*, u-*, etc.) flag
# False is default
is_property_el = False
Expand All @@ -262,16 +265,14 @@ def parse_props(el):
p_value = None
for prop_name in mf2_classes.text(classes):
is_property_el = True
stops_implied_name = True
parsed_types_aggregation.add('p')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
if p_value is None:
p_value = text_type(parse_property.text(el, base_url=self.__url__))


if root_class_names:
stops_implied_name = True
prop_value.append(handle_microformat(
root_class_names, el, value_property="name",
simple_value=p_value, backcompat_mode=backcompat_mode))
Expand All @@ -282,14 +283,14 @@ def parse_props(el):
u_value = None
for prop_name in mf2_classes.url(classes):
is_property_el = True
parsed_types_aggregation.add('u')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
if u_value is None:
u_value = parse_property.url(el, self.dict_class, self.__img_with_alt__, base_url=self.__url__)

if root_class_names:
stops_implied_name = True
prop_value.append(handle_microformat(
root_class_names, el, value_property="url",
simple_value=u_value, backcompat_mode=backcompat_mode))
Expand All @@ -303,6 +304,7 @@ def parse_props(el):
dt_value = None
for prop_name in mf2_classes.datetime(classes):
is_property_el = True
parsed_types_aggregation.add('d')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
Expand All @@ -326,7 +328,7 @@ def parse_props(el):
e_value = None
for prop_name in mf2_classes.embedded(classes):
is_property_el = True
stops_implied_name = True
parsed_types_aggregation.add('e')
prop_value = props.setdefault(prop_name, [])

# if value has not been parsed then parse it
Expand All @@ -347,21 +349,18 @@ def parse_props(el):
# if this is not a property element, but it is a h-* microformat,
# add it to our list of children
if not is_property_el and root_class_names:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this add h to the parsed_type_aggregation?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be covered in Line 258. Since it doesn't matter if it is a child or a nested property, I add it there already. Unless I missed a code path?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aah yes I missed that change

stops_implied_name = True
children.append(handle_microformat(root_class_names, el, backcompat_mode=backcompat_mode))

# parse child tags, provided this isn't a microformat root-class
if not root_class_names:
for child in get_children(el):
child_properties, child_microformats, child_stops_implied_name = parse_props(child)
child_properties, child_microformats, child_parsed_types_aggregation = parse_props(child)
for prop_name in child_properties:
v = props.get(prop_name, [])
v.extend(child_properties[prop_name])
props[prop_name] = v
children.extend(child_microformats)
stops_implied_name = stops_implied_name or child_stops_implied_name

return props, children, stops_implied_name
parsed_types_aggregation.update(child_parsed_types_aggregation)
return props, children, parsed_types_aggregation

def parse_rels(el):
"""Parse an element for rel microformats
Expand Down
59 changes: 59 additions & 0 deletions test/examples/implied_properties/stop_implied_url.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<h1>real world example</h1>
<article class="post h-entry">
<h1 class="p-name"><a href="/Waar-te-beginnen-met-Webmentions/">Waar te beginnen met Webmentions</a></h1>
<div class="entry e-content">
<p>Er zijn van die momenten dat ik het liefste de hele dag ga zitten puzzelen hoe ik nu de <a href="/webmentions/">webmentions</a> op deze site in orde moet maken. Het loopt allemaal nog steeds niet lekker, maar ik weet niet goed welke kant ik op moet denken en werken voor een oplossing.</p>
<p>Waar loop ik nog tegen aan?</p>
<p>...</p>
</div>
<a href="https://news.indieweb.org/nl" class="u-syndication"></a>
</article>

<h1>synthetic test cases</h1>

<article class="h-entry">
<h1>u- on only link stops implied url</h1>
<a href="http://example.com/" class="u-property"></a>
</article>

<article class="h-entry">
<h1><a href="http://example.com/post" class="u-property">u- on second link stops implied url</a></h1>
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<h1 class="p-property h-entry">nested object in property stops u-url parsing</h1>
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<h1 class="h-entry">nested child object stops u-url parsing</h1>
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<div><span><h1 class="h-entry">deeper nested child object stops u-url parsing</h1></span></div>
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<h1 class="p-name">p- property doesn't stop implied url parsing</h1>
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<p class="e-content">e-property doesn't stop implied url parsing</p>
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<h1>implied u-photo does not stop implied u-url parsing</h1>
<img src="http://example.com/avatar.png">
<a href="http://example.com/"></a>
</article>

<article class="h-entry">
<h1>implied u-photo does not stop implied u-url parsing</h1>
<img src="http://example.com/avatar.png">
<a href="http://example.com/"></a>
</article>
17 changes: 17 additions & 0 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,23 @@ def test_implied_url():
for i in range(12, 23):
assert_false("url" in result["items"][i]["properties"])

def test_stop_implied_url():
    """Explicit u-* properties and nested h-* objects must abort implied url parsing."""
    items = parse_fixture("implied_properties/stop_implied_url.html")["items"]

    # items 0-5: a u-* property or a nested microformat is present,
    # so no implied url may be generated
    for index in range(6):
        assert_false("url" in items[index]["properties"])

    # items 6-9: only p-*, e-* or an implied photo are present,
    # so the url is still implied from the remaining link
    for index in range(6, 10):
        assert_equal(items[index]["properties"]["url"], ["http://example.com/"])

def test_implied_nested_photo():
result = parse_fixture("implied_properties/implied_properties.html", url="http://bar.org")
assert_equal(result["items"][2]["properties"]["photo"][0],
Expand Down