Merge pull request #106 from kartikprabhu/master

new version 1.1.1 - looks good to me
microformats · Jul 7, 2018 · 9a84d56 · 9a84d56
2 parents dda3a59 + 71997eb
commit 9a84d56
Show file tree

Hide file tree

Showing 35 changed files with 1,387 additions and 263 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,26 @@
 # Change Log
 All notable changes to this project will be documented in this file.
 
+## 1.1.1 - 2018-06-15
+
+- streamline backcompat to use JSON only.
+- fix multiple mf1 root rel-tag parsing 
+- correct url and photo for hreview.
+- add rules for nested hreview. update backcompat to use multiple matches in old properties.
+- fix `rel-tag` to `p-category` conversion so that other classes are not lost.
+- use original authored html for `e-*` parsing in backcompat
+- make classes and rels into deduped arrays whose order is not significant (they are sorted alphabetically).
+- only use class names for mf2 which follow the naming rules
+- fix `parse` method to use default html parser.
+- always use the first value for attributes for rels.
+- correct AM/PM conversion in datetime value class pattern.
+- add ordinal date parsing to the datetime value class pattern. ordinal dates are normalised to YYYY-MM-DD
+- remove hack for html tag classes since that is fixed in newer versions of BeautifulSoup
+- better whitespace algorithm for `name` and `html.value` parsing
+- experimental flag for including `alt` in `u-photo` parsing
+- make a copy of the BeautifulSoup given by user to work on for parsing to prevent changes to original doc
+- bump version to 1.1.1 
+
 ## 1.1.0 - 2018-03-16
 
 - bump version to 1.1.0 since it is a "major" change 

diff --git a/README.md b/README.md
@@ -54,6 +54,10 @@ Filter by microformat type
 
     p.to_dict(filter_by_type="h-entry")
     p.to_json(filter_by_type="h-entry")
+
+Experimental features
+---------------------
+- pass the optional argument `img_with_alt=True` to either the `Parser` object or to the `parse` method to enable parsing of the `alt` attribute of `<img>` tags according to [issue: image alt text is lost during parsing](https://github.com/microformats/microformats2-parsing/issues/2). By default this is `False` to be backwards compatible.
 
 Frontends
 -------------
@@ -68,3 +72,4 @@ Contributions
 We welcome contributions and bug reports via Github, and on the microformats wiki.
 
 We try to follow the [IndieWebCamp code of conduct](http://indiewebcamp.com/code-of-conduct). Please be respectful of other contributors, and forge a spirit of positive co-operation without discrimination or disrespect.
+
diff --git a/mf2py/backcompat-rules/hentry.json b/mf2py/backcompat-rules/hentry.json
@@ -35,5 +35,13 @@
         "longitude": [
             "p-longitude"
         ]
+    },
+    "rels": {
+        "bookmark": [
+            "u-url"
+        ],
+        "tag": [
+            "p-category"
+        ]
     }
-}
+}
diff --git a/mf2py/backcompat-rules/hfeed.json b/mf2py/backcompat-rules/hfeed.json
@@ -18,5 +18,10 @@
         "title": [
             "p-name"
         ]
+    },
+    "rels": {
+        "tag": [
+            "p-category"
+        ]
     }
-}
+}
diff --git a/mf2py/backcompat-rules/hproduct.json b/mf2py/backcompat-rules/hproduct.json
@@ -26,11 +26,10 @@
         ], 
         "review": [
             "p-review", 
-            "h-review", 
-            "e-description"
+            "h-review" 
         ], 
         "fn": [
             "p-name"
         ]
     }
-}
+}
diff --git a/mf2py/backcompat-rules/hrecipe.json b/mf2py/backcompat-rules/hrecipe.json
@@ -30,6 +30,14 @@
         ], 
         "ingredient": [
             "p-ingredient"
+        ],
+        "category": [
+            "p-category"
+        ]
+    },
+    "rels": {
+        "tag": [
+            "p-category"
         ]
     }
-}
+}
diff --git a/mf2py/backcompat-rules/hreview.json b/mf2py/backcompat-rules/hreview.json
@@ -17,9 +17,13 @@
             "h-card"
         ], 
         "url": [
+            "p-item", 
+            "h-item", 
             "u-url"
         ], 
         "photo": [
+            "p-item", 
+            "h-item", 
             "u-photo"
         ], 
         "best": [
@@ -35,6 +39,26 @@
         ], 
         "summary": [
             "p-name"
+        ],
+        "item vcard": [
+            "p-item",
+            "vcard"
+        ],
+        "item vevent": [
+            "p-item",
+            "vevent"
+        ],
+        "item hproduct": [
+            "p-item",
+            "hproduct"
+        ]
+    },
+    "rels": {
+        "self bookmark": [
+            "u-url"
+        ],
+        "tag": [
+            "p-category"
         ]
     }
-}
+}
diff --git a/mf2py/backcompat-rules/vcard.json b/mf2py/backcompat-rules/vcard.json
@@ -97,6 +97,12 @@
         ], 
         "organization-name": [
             "p-organization-name"
+        ],
+        "title": [
+            "p-job-title"
+        ],
+        "role": [
+            "p-role"
         ]
     }
-}
+}
diff --git a/mf2py/backcompat.py b/mf2py/backcompat.py
@@ -4,7 +4,8 @@
 """
 
 from __future__ import unicode_literals, print_function
-from .dom_helpers import get_descendents
+from .dom_helpers import get_children
+from .mf_helpers import unordered_list
 from . import mf2_classes
 import bs4
 import copy
@@ -18,11 +19,8 @@
 else:
     from urllib.parse import unquote
 
-# Classic Root Classname map
-CLASSIC_ROOT_MAP = {}
-
-# Classic Root properties map
-CLASSIC_PROPERTY_MAP = {}
+# Classic map
+_CLASSIC_MAP = {}
 
 # populate backcompat rules from JSON files
 
@@ -34,86 +32,93 @@
     with codecs.open(file_path, 'r', 'utf-8') as f:
         rules = json.load(f)
 
-    CLASSIC_ROOT_MAP[root] = rules['type'][0]
-    CLASSIC_PROPERTY_MAP[root] = rules['properties']
-
-
-
-def root(classes):
-    """get all backcompat root classnames
-    """
-    return [c for c in classes if c in CLASSIC_ROOT_MAP]
+    _CLASSIC_MAP[root] = rules 
 
 
-def make_classes_rule(old_class, new_classes):
+def _make_classes_rule(old_classes, new_classes):
     """Builds a rule for augmenting an mf1 class with its mf2
     equivalent(s).
     """
     def f(child, **kwargs):
+        child_original = child.original or copy.copy(child)
         child_classes = child.get('class', [])
-        if old_class in child_classes:
-            child_classes += [c for c in new_classes
-                              if c not in child_classes]
+        if all(cl in child_classes for cl in old_classes):
+            child_classes.extend([cl for cl in new_classes if cl not in child_classes])
             child['class'] = child_classes
-    return f
-
-
-# The RULES map has a list of rules for each root class type.
-# We'll build the vast majority of it from the CLASSIC_PROPERTY_MAP
-RULES = dict(
-    (old_root, [make_classes_rule(old_class, new_classes)
-                for old_class, new_classes in properties.items()])
-    for old_root, properties in CLASSIC_PROPERTY_MAP.items())
-
-
-def rel_bookmark_to_url_rule(child, **kwargs):
-    """rel=bookmark gets augmented with class="u-url
-    """
-    child_classes = child.get('class', [])
-    if ('bookmark' in child.get('rel', [])
-            and 'u-url' not in child_classes):
-        child_classes.append('u-url')
-        child['class'] = child_classes
 
+            # if any new class is e-* attach original to parse originally authored HTML
+            if mf2_classes.embedded(child_classes) and child.original is None:
+                child.original = child_original
+    return f
 
-def rel_tag_to_category_rule(child, **kwargs):
+def _rel_tag_to_category_rule(child, html_parser, **kwargs):
     """rel=tag converts to p-category using a special transformation (the
-    category becomes the tag href's last path segment). This rule adds a new
-    data tag so that
-    <a rel="tag" href="http://example.com/tags/cat"></a> gets augmented with
+    category becomes the tag href's last path segment). This rule adds a new data tag so that
+    <a rel="tag" href="http://example.com/tags/cat"></a> gets replaced with
     <data class="p-category" value="cat"></data>
     """
+
+    href = child.get('href', '')
     rels = child.get('rel', [])
-    classes = child.get('class', [])
-    if ('tag' in rels and child.get('href')
-            and 'p-category' not in classes
-            and 'u-category' not in classes):
-        segments = [seg for seg in child.get('href').split('/') if seg]
+    if 'tag' in rels and href:
+        segments = [seg for seg in href.split('/') if seg]
         if segments:
-            data = bs4.BeautifulSoup('<data></data>').data
-            # use mf1 class here so it doesn't get removed later
-            data['class'] = ['category']
+            if html_parser:
+                soup = bs4.BeautifulSoup('', features=html_parser)
+            else:
+                soup = bs4.BeautifulSoup('')
+
+            data = soup.new_tag('data')
+            # this does not use what's given in the JSON
+            # but that is not a problem currently
+            # use mf1 class so it doesn't get removed later
+            data['class'] = ['p-category']
             data['value'] = unquote(segments[-1])
-            child.parent.append(data)
+            child.insert_before(data)
+            # remove tag from rels to avoid repeat
+            child['rel'] = [r for r in rels if r != 'tag']
 
 
-# Augment with special rules
-RULES['hentry'] += [
-    rel_bookmark_to_url_rule,
-    rel_tag_to_category_rule,
-]
+def _make_rels_rule(old_rels, new_classes, html_parser):
+    """Builds a rule for augmenting an mf1 rel with its mf2 class equivalent(s).
+    """
+
+    # need to special case rel=tag as it operates differently
+
+    def f(child, **kwargs):
+        child_rels = child.get('rel', [])
+        child_classes = child.get('class', [])
+        if all(r in child_rels for r in old_rels):
+            if 'tag' in old_rels:
+                _rel_tag_to_category_rule(child, html_parser, **kwargs)
+            else:
+                child_classes.extend([cl for cl in new_classes if cl not in child_classes])
+                child['class'] = child_classes
+    return f
 
-def apply_rules(el):
-    """add modern classnames for older mf1 classnames
 
-    returns a copy of el and does not modify the original
+def _get_rules(old_root, html_parser):
+    """ for given mf1 root get the rules as a list of functions to act on children """
+
+    class_rules = [_make_classes_rule(old_classes.split(), new_classes)
+                for old_classes, new_classes in _CLASSIC_MAP[old_root].get('properties', {}).items()]
+    rel_rules = [_make_rels_rule(old_rels.split(), new_classes, html_parser)
+                for old_rels, new_classes in _CLASSIC_MAP[old_root].get('rels', {}).items()]
+
+    return class_rules + rel_rules
+
+def root(classes):
+    """get all backcompat root classnames
     """
+    return unordered_list([c for c in classes if c in _CLASSIC_MAP])
 
-    el_copy = copy.copy(el)
+def apply_rules(el, html_parser):
+    """add modern classnames for older mf1 classnames
+    """
 
     def apply_prop_rules_to_children(parent, rules):
 
-        for child in (c for c in parent.children if isinstance(c, bs4.Tag)):
+        for child in get_children(parent):
             classes = child.get('class',[])
             # find existing mf2 properties if any and delete them
             mf2_props = mf2_classes.property_classes(classes)
@@ -129,19 +134,19 @@ def apply_prop_rules_to_children(parent, rules):
 
 
     # add mf2 root equivalent
-    classes = el_copy.get('class', [])
+    classes = el.get('class', [])
     old_roots = root(classes)
     for old_root in old_roots:
-        new_root = CLASSIC_ROOT_MAP[old_root]
-        if new_root not in classes:
-            el_copy['class'].append(new_root)
+        new_roots = _CLASSIC_MAP[old_root]['type']
+        classes.extend(new_roots)
+    el['class'] = classes
 
 
     # add mf2 prop equivalent to descendents and remove existing mf2 props
     rules = []
     for old_root in old_roots:
-        rules.extend(RULES.get(old_root,[]))
+        rules.extend(_get_rules(old_root, html_parser))
 
-    apply_prop_rules_to_children(el_copy, rules)
+    apply_prop_rules_to_children(el, rules)
 
-    return el_copy
+    return el