Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add block list for known conflicting root class names #211

Merged
merged 4 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions mf2py/backcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def root(classes):
return unordered_list([c for c in classes if c in _CLASSIC_MAP])


def apply_rules(el, html_parser):
def apply_rules(el, html_parser, filtered_roots):
"""add modern classnames for older mf1 classnames

returns a copy of el and does not modify the original
Expand All @@ -164,7 +164,7 @@ def apply_prop_rules_to_children(parent, rules):
rule(child)

# recurse if it's not a nested mf1 or mf2 root
if not (mf2_classes.root(classes) or root(classes)):
if not (mf2_classes.root(classes, filtered_roots) or root(classes)):
apply_prop_rules_to_children(child, rules)

# add mf2 root equivalent
Expand Down
22 changes: 11 additions & 11 deletions mf2py/implied_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
)


def name(el, base_url=""):
def name(el, base_url, filtered_roots):
"""Find an implied name property

Args:
Expand Down Expand Up @@ -40,7 +40,7 @@ def non_empty(val):
poss_child = children[0]

# ignore if mf2 root
if mf2_classes.root(poss_child.get("class", [])):
if mf2_classes.root(poss_child.get("class", []), filtered_roots):
poss_child = None

# if it is not img, area, abbr then find grandchild
Expand All @@ -51,7 +51,7 @@ def non_empty(val):
poss_child = grandchildren[0]
# if it is not img, area, abbr or is mf2 root then no possible child
if poss_child.name not in ("img", "area", "abbr") or mf2_classes.root(
poss_child.get("class", [])
poss_child.get("class", []), filtered_roots
):
poss_child = None

Expand All @@ -73,7 +73,7 @@ def non_empty(val):
return get_textContent(el, replace_img=True, img_to_src=False, base_url=base_url)


def photo(el, base_url=""):
def photo(el, base_url, filtered_roots):
"""Find an implied photo property

Args:
Expand All @@ -92,15 +92,15 @@ def get_photo_child(children):
poss_imgs = [c for c in children if c.name == "img"]
if len(poss_imgs) == 1:
poss_img = poss_imgs[0]
if not mf2_classes.root(poss_img.get("class", [])):
if not mf2_classes.root(poss_img.get("class", []), filtered_roots):
return poss_img

# if element has one object child use data if exists and object is
# not root class
poss_objs = [c for c in children if c.name == "object"]
if len(poss_objs) == 1:
poss_obj = poss_objs[0]
if not mf2_classes.root(poss_obj.get("class", [])):
if not mf2_classes.root(poss_obj.get("class", []), filtered_roots):
return poss_obj

def resolve_relative_url(prop_value):
Expand Down Expand Up @@ -128,7 +128,7 @@ def resolve_relative_url(prop_value):
if (
poss_child is None
and len(children) == 1
and not mf2_classes.root(children[0].get("class", []))
and not mf2_classes.root(children[0].get("class", []), filtered_roots)
):
grandchildren = list(get_children(children[0]))
poss_child = get_photo_child(grandchildren)
Expand All @@ -144,7 +144,7 @@ def resolve_relative_url(prop_value):
return resolve_relative_url(prop_value)


def url(el, base_url=""):
def url(el, base_url, filtered_roots):
"""Find an implied url property

Args:
Expand All @@ -162,14 +162,14 @@ def get_url_child(children):
poss_as = [c for c in children if c.name == "a"]
if len(poss_as) == 1:
poss_a = poss_as[0]
if not mf2_classes.root(poss_a.get("class", [])):
if not mf2_classes.root(poss_a.get("class", []), filtered_roots):
return poss_a

# if element has one area child use if not root class
poss_areas = [c for c in children if c.name == "area"]
if len(poss_areas) == 1:
poss_area = poss_areas[0]
if not mf2_classes.root(poss_area.get("class", [])):
if not mf2_classes.root(poss_area.get("class", []), filtered_roots):
return poss_area

# if element is a <a> or area use its href if exists
Expand All @@ -187,7 +187,7 @@ def get_url_child(children):
if (
poss_child is None
and len(children) == 1
and not mf2_classes.root(children[0].get("class", []))
and not mf2_classes.root(children[0].get("class", []), filtered_roots)
):
grandchildren = list(get_children(children[0]))
poss_child = get_url_child(grandchildren)
Expand Down
8 changes: 6 additions & 2 deletions mf2py/mf2_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
_mf2_properties_re = re.compile("(p|e|u|dt)-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")
_mf2_e_properties_re = re.compile("e-(:?[a-z0-9]+-)?[a-z]+(:?-[a-z]+)*$")

# Root-name suffixes (the text after the leading "h-") that are treated as
# known conflicts rather than microformats roots -- presumably Tailwind CSS
# utility classes such as "h-auto" or "h-full" (TODO confirm against the
# Tailwind docs).  Used as the block list when the parser is created with
# filter_roots=True.
CONFLICTING_ROOTS_TAILWIND = {"auto", "fit", "full", "max", "min", "px", "screen"}


def filter_classes(classes, regex=_mf2_classes_re):
"""detect classes that are valid names for mf2, sort in dictionary by prefix"""
Expand All @@ -20,8 +22,10 @@ def filter_classes(classes, regex=_mf2_classes_re):
return types


def root(classes):
return {c for c in classes if _mf2_roots_re.match(c)}
def root(classes, filtered_roots=()):
    """Return the set of mf2 root class names found in ``classes``.

    Args:
        classes: iterable of class-name strings taken from an element.
        filtered_roots: collection of root names, written without their
            first two characters (the ``h-`` prefix), that must not be
            reported as roots.  Defaults to an empty tuple so the older
            one-argument call ``root(classes)`` keeps working with nothing
            filtered out.

    Returns:
        set of the class names that match the root pattern and whose
        suffix is not listed in ``filtered_roots``.
    """
    return {
        c
        for c in classes
        # c[2:] drops the leading "h-" before the block-list lookup
        if _mf2_roots_re.match(c) and c[2:] not in filtered_roots
    }


def is_property_class(class_):
Expand Down
29 changes: 20 additions & 9 deletions mf2py/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .version import __version__


def parse(doc=None, url=None, html_parser=None):
def parse(doc=None, url=None, html_parser=None, filter_roots=False):
"""
Parse a microformats2 document or url and return a json dictionary.

Expand All @@ -28,7 +28,7 @@ def parse(doc=None, url=None, html_parser=None):

Return: a json dict representing the structured data in this document.
"""
return Parser(doc, url, html_parser).to_dict()
return Parser(doc, url, html_parser, filter_roots).to_dict()


class Parser(object):
Expand All @@ -54,7 +54,7 @@ class Parser(object):
ua_url = "https://github.com/microformats/mf2py"
useragent = "{0} - version {1} - {2}".format(ua_desc, __version__, ua_url)

def __init__(self, doc=None, url=None, html_parser=None):
def __init__(self, doc=None, url=None, html_parser=None, filter_roots=False):
self.__url__ = None
self.__doc__ = None
self._preserve_doc = False
Expand All @@ -68,6 +68,13 @@ def __init__(self, doc=None, url=None, html_parser=None):
"version": __version__,
},
}
try:
self.filtered_roots = set(filter_roots)
except TypeError:
if filter_roots:
self.filtered_roots = mf2_classes.CONFLICTING_ROOTS_TAILWIND
else:
self.filtered_roots = []

# use default parser if none specified
self.__html_parser__ = html_parser or "html5lib"
Expand Down Expand Up @@ -158,8 +165,12 @@ def handle_microformat(
parsed_types_aggregation = set()

if backcompat_mode:
el = backcompat.apply_rules(el, self.__html_parser__)
root_class_names = mf2_classes.root(el.get("class", []))
el = backcompat.apply_rules(
el, self.__html_parser__, self.filtered_roots
)
root_class_names = mf2_classes.root(
el.get("class", []), self.filtered_roots
)

# parse for properties and children
for child in get_children(el):
Expand Down Expand Up @@ -187,21 +198,21 @@ def handle_microformat(
"peh"
):
properties["name"] = [
implied_properties.name(el, base_url=self.__url__)
implied_properties.name(el, self.__url__, self.filtered_roots)
]

if "photo" not in properties and parsed_types_aggregation.isdisjoint(
"uh"
):
x = implied_properties.photo(el, base_url=self.__url__)
x = implied_properties.photo(el, self.__url__, self.filtered_roots)
if x is not None:
properties["photo"] = [x]

# stop implied url if any u-* or h-* is already found
if "url" not in properties and parsed_types_aggregation.isdisjoint(
"uh"
):
x = implied_properties.url(el, base_url=self.__url__)
x = implied_properties.url(el, self.__url__, self.filtered_roots)
if x is not None:
properties["url"] = [x]

Expand Down Expand Up @@ -455,7 +466,7 @@ def parse_el(el, ctx):
classes = el.get("class", [])

# find potential microformats in root classnames h-*
potential_microformats = mf2_classes.root(classes)
potential_microformats = mf2_classes.root(classes, self.filtered_roots)

# if potential microformats found parse them
if potential_microformats:
Expand Down
9 changes: 9 additions & 0 deletions test/examples/filter_roots.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<h2>Tailwind root filter</h2>
<div class=h-card>fnord</div>
<div class=h-auto>fnord</div>
<div class=h-fit>fnord</div>
<div class=h-full>fnord</div>
<div class=h-max>fnord</div>
<div class=h-min>fnord</div>
<div class=h-px>fnord</div>
<div class=h-screen>fnord</div>
6 changes: 6 additions & 0 deletions test/examples/filter_roots_custom.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<h2>Custom root filter</h2>
<div class=h-card>fnord</div>
<div class=h-foo>fnord</div>
<div class=h-bar>fnord</div>
<div class=h-bat>fnord</div>
<div class=h-baz>fnord</div>
18 changes: 16 additions & 2 deletions test/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
TEST_DIR = "test/examples/"


def parse_fixture(path, url=None):
def parse_fixture(path, url=None, filter_roots=False):
with open(os.path.join(TEST_DIR, path)) as f:
p = Parser(doc=f, url=url, html_parser="html5lib")
p = Parser(doc=f, url=url, html_parser="html5lib", filter_roots=filter_roots)
return p.to_dict()


Expand Down Expand Up @@ -1040,3 +1040,17 @@ def test_all_u_cases():
make_labelled_cmp("all_u_cases_" + str(i))(
"http://example.com/test", result["items"][0]["properties"]["url"][i]
)


def test_blocked_roots():
    """Root class names on the block list are not parsed as microformats."""
    # filtering is off by default: every h-* div in the fixture becomes an item
    result = parse_fixture("filter_roots.html")
    assert len(result["items"]) == 8

    # filter_roots=True applies the built-in block list; only h-card survives
    result = parse_fixture("filter_roots.html", filter_roots=True)
    assert len(result["items"]) == 1
    assert result["items"][0]["type"] == ["h-card"]

    # an explicit collection of names is used as the block list instead
    result = parse_fixture(
        "filter_roots_custom.html", filter_roots={"foo", "bar", "bat", "baz"}
    )
    assert len(result["items"]) == 1
    assert result["items"][0]["type"] == ["h-card"]
Loading