diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..7d5c014
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,10 @@
+# Exclude files from deployment:
+.gitattributes export-ignore
+.gitignore export-ignore
+docs export-ignore
+/screenshots export-ignore
+/tools export-ignore
+html_cleaner/test.py export-ignore
+obsolete export-ignore
+# Adjust GitHub linguist settings:
+ANKIWEB.md linguist-documentation
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..28d21a9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,12 @@
+*.pyo
+*.pyc
+*.sublime-project
+*.sublime-workspace
+.hidden
+.directory
+obsolete
+html_cleaner/forms
+docs/todo.md
+docs/description.html
+html-cleaner-*.zip
+.gitold
\ No newline at end of file
diff --git a/HTML Cleaner.py b/HTML Cleaner.py
new file mode 100644
index 0000000..455655d
--- /dev/null
+++ b/HTML Cleaner.py	
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+"""
+Anki Add-on: HTML Cleaner
+
+Entry point for the add-on into Anki
+
+Please don't edit this if you don't know what you're doing.
+
+Copyright: (c) Glutanimate 2017
+License: GNU AGPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
+"""
+
+import html_cleaner.main
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e42c952
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,33 @@
+# builds zip file for AnkiWeb (among other things)
+
+VERSION = `git describe HEAD --tags --abbrev=0`
+ADDON = "html-cleaner"
+ADDONDIR = "html_cleaner"
+
+all: ui zip
+
+clean:
+	rm -rf dist
+	rm $(ADDON)-*.zip
+
+ui:
+	rm -rf $(ADDON)/forms
+	./tools/build_ui.sh
+
+zip:
+	rm -rf dist
+	mkdir -p dist
+	find . -name '*.pyc' -delete
+	cp *.py dist/
+	cp -r $(ADDONDIR) dist/
+	cd dist && zip -r ../$(ADDON)-current.zip *
+	rm -rf dist
+
+release:
+	rm -rf dist
+	mkdir -p dist
+	find . -name '*.pyc' -delete
+	git archive --format tar $(VERSION) | tar -x -C dist/
+	cd dist &&  \
+		zip -r ../$(ADDON)-release-$(VERSION).zip $(ADDONDIR) *.py
+	rm -rf dist
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e9b2329
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+## HTML Cleaner Add-on for Anki
+
+Cleans and minifies HTML content of the current field, removing extraneous tags and attributes copied over from apps like Word, Chrome, etc.
+
+## Usage
+
+The add-on comes with a button and two hotkeys:
+
+- Clicking on the *cH* button in the editor will clean the HTML code of the active field. The same functionality can also be invoked via <kbd>Alt</kbd> + <kbd>H</kbd>
+- Shift-clicking the button or combining the aforementioned hotkey with Shift will undo the changes to the current field. (Anki's inbuilt undo functionality does not work with the add-on. This is a limitation that can't be solved trivially, I'm afraid.)
+- <kbd>Alt</kbd> + <kbd>V</kbd> will clean the clipboard selection and paste the processed text into the current field
+
+## Configuration
+
+The add-on's HTML processing is highly configurable. All options can be accessed by editing the configuration section of `html_cleaner/main.py`.
+
+## License and Credits
+
+*Cloze Overlapper* is *Copyright © 2016-2017 [Aristotelis P.](https://github.com/Glutanimate)*
+
+Licensed under the [GNU AGPL v3](https://www.gnu.org/licenses/agpl.html).
+
+This add-on would not not have been possible without the following open-source libraries:
+
+- [Bleach](https://github.com/mozilla/bleach) 2.0.0. Copyright (c) 2014-2017 Mozilla Foundation. Licensed under the Apache License 2.0
+- [html5lib](https://github.com/html5lib/) 0.999999999. Copyright (c) 2006-2013 James Graham and other contributors. Licensed under the MIT license.
+- [webencodings](https://github.com/gsnedders/python-webencodings) 0.5.1. Copyright (c) 2012-2017 Geoffrey Sneddon. Licensed under the BSD license.
+- [six](https://github.com/benjaminp/six) 1.10.0. Copyright (c) 2010-2015 Benjamin Peterson. Licensed under the MIT license
\ No newline at end of file
diff --git a/docs/description.md b/docs/description.md
new file mode 100644
index 0000000..e69de29
diff --git a/html_cleaner/LICENSES/LICENSE_BLEACH b/html_cleaner/LICENSES/LICENSE_BLEACH
new file mode 100644
index 0000000..467c38e
--- /dev/null
+++ b/html_cleaner/LICENSES/LICENSE_BLEACH
@@ -0,0 +1,13 @@
+Copyright (c) 2014-2017, Mozilla Foundation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/html_cleaner/LICENSES/LICENSE_HTML5LIB b/html_cleaner/LICENSES/LICENSE_HTML5LIB
new file mode 100644
index 0000000..c87fa7a
--- /dev/null
+++ b/html_cleaner/LICENSES/LICENSE_HTML5LIB
@@ -0,0 +1,20 @@
+Copyright (c) 2006-2013 James Graham and other contributors
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/html_cleaner/LICENSES/LICENSE_SIX b/html_cleaner/LICENSES/LICENSE_SIX
new file mode 100644
index 0000000..e558f9d
--- /dev/null
+++ b/html_cleaner/LICENSES/LICENSE_SIX
@@ -0,0 +1,18 @@
+Copyright (c) 2010-2015 Benjamin Peterson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/html_cleaner/LICENSES/LICENSE_WEBENCODINGS b/html_cleaner/LICENSES/LICENSE_WEBENCODINGS
new file mode 100644
index 0000000..8ff5876
--- /dev/null
+++ b/html_cleaner/LICENSES/LICENSE_WEBENCODINGS
@@ -0,0 +1,18 @@
+Copyright (c) 2012-2017 Geoffrey Sneddon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/html_cleaner/__init__.py b/html_cleaner/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/html_cleaner/bleach/__init__.py b/html_cleaner/bleach/__init__.py
new file mode 100644
index 0000000..c9a7fe4
--- /dev/null
+++ b/html_cleaner/bleach/__init__.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from bleach.linkifier import (
+    DEFAULT_CALLBACKS,
+    Linker,
+    LinkifyFilter,
+)
+from bleach.sanitizer import (
+    ALLOWED_ATTRIBUTES,
+    ALLOWED_PROTOCOLS,
+    ALLOWED_STYLES,
+    ALLOWED_TAGS,
+    BleachSanitizerFilter,
+    Cleaner,
+)
+from bleach.version import __version__, VERSION # flake8: noqa
+
+__all__ = ['clean', 'linkify']
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+          strip_comments=True):
+    """Clean an HTML fragment of malicious content and return it
+
+    This function is a security-focused function whose sole purpose is to
+    remove malicious content from a string such that it can be displayed as
+    content in a web page.
+
+    This function is not designed to use to transform content to be used in
+    non-web-page contexts.
+
+    Example::
+
+        import bleach
+
+        better_text = bleach.clean(yucky_text)
+
+
+    .. Note::
+
+       If you're cleaning a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.sanitizer.Cleaner` instance.
+
+    :arg str text: the text to clean
+
+    :arg list tags: allowed list of tags; defaults to
+        ``bleach.sanitizer.ALLOWED_TAGS``
+
+    :arg dict attributes: allowed attributes; can be a callable, list or dict;
+        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+    :arg list styles: allowed list of css styles; defaults to
+        ``bleach.sanitizer.ALLOWED_STYLES``
+
+    :arg list protocols: allowed list of protocols for links; defaults
+        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+    :arg bool strip: whether or not to strip disallowed elements
+
+    :arg bool strip_comments: whether or not to strip HTML comments
+
+    :returns: cleaned text as unicode
+
+    """
+    cleaner = Cleaner(
+        tags=tags,
+        attributes=attributes,
+        styles=styles,
+        protocols=protocols,
+        strip=strip,
+        strip_comments=strip_comments,
+    )
+    return cleaner.clean(text)
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    .. Note::
+
+       If you're linking a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.linkifier.Linker` instance.
+
+    .. Note::
+
+       If you have text that you want to clean and then linkify, consider using
+       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+       pass. That way you're not parsing the HTML twice.
+
+    :arg str text: the text to linkify
+
+    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+    :arg list skip_tags: list of tags that you don't want to linkify the
+        contents of; for example, you could set this to ``['pre']`` to skip
+        linkifying contents of ``pre`` tags
+
+    :arg bool parse_email: whether or not to linkify email addresses
+
+    :returns: linkified text as unicode
+
+    """
+    linker = Linker(
+        callbacks=callbacks,
+        skip_tags=skip_tags,
+        parse_email=parse_email
+    )
+    return linker.linkify(text)
diff --git a/html_cleaner/bleach/callbacks.py b/html_cleaner/bleach/callbacks.py
new file mode 100644
index 0000000..d2ba101
--- /dev/null
+++ b/html_cleaner/bleach/callbacks.py
@@ -0,0 +1,25 @@
+"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
+
+
+def nofollow(attrs, new=False):
+    href_key = (None, u'href')
+    if href_key not in attrs or attrs[href_key].startswith(u'mailto:'):
+        return attrs
+
+    rel_key = (None, u'rel')
+    rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]
+    if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]:
+        rel_values.append(u'nofollow')
+    attrs[rel_key] = u' '.join(rel_values)
+
+    return attrs
+
+
+def target_blank(attrs, new=False):
+    href_key = (None, u'href')
+    if attrs[href_key].startswith(u'mailto:'):
+        return attrs
+
+    attrs[(None, u'target')] = u'_blank'
+    return attrs
diff --git a/html_cleaner/bleach/encoding.py b/html_cleaner/bleach/encoding.py
new file mode 100644
index 0000000..707adaa
--- /dev/null
+++ b/html_cleaner/bleach/encoding.py
@@ -0,0 +1,62 @@
+import datetime
+from decimal import Decimal
+import types
+import six
+
+
+def is_protected_type(obj):
+    """Determine if the object instance is of a protected type.
+
+    Objects of protected types are preserved as-is when passed to
+    force_unicode(strings_only=True).
+    """
+    return isinstance(obj, (
+        six.integer_types +
+        (types.NoneType,
+         datetime.datetime, datetime.date, datetime.time,
+         float, Decimal))
+    )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Similar to smart_text, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    # Handle the common case first, saves 30-40% when s is an instance of
+    # six.text_type. This function gets called often in that setting.
+    if isinstance(s, six.text_type):
+        return s
+    if strings_only and is_protected_type(s):
+        return s
+    try:
+        if not isinstance(s, six.string_types):
+            if hasattr(s, '__unicode__'):
+                s = s.__unicode__()
+            else:
+                if six.PY3:
+                    if isinstance(s, bytes):
+                        s = six.text_type(s, encoding, errors)
+                    else:
+                        s = six.text_type(s)
+                else:
+                    s = six.text_type(bytes(s), encoding, errors)
+        else:
+            # Note: We use .decode() here, instead of six.text_type(s,
+            # encoding, errors), so that if s is a SafeBytes, it ends up being
+            # a SafeText at the end.
+            s = s.decode(encoding, errors)
+    except UnicodeDecodeError as e:
+        if not isinstance(s, Exception):
+            raise UnicodeDecodeError(*e.args)
+        else:
+            # If we get to here, the caller has passed in an Exception
+            # subclass populated with non-ASCII bytestring data without a
+            # working unicode method. Try to handle this without raising a
+            # further exception by individually forcing the exception args
+            # to unicode.
+            s = ' '.join([force_unicode(arg, encoding, strings_only,
+                          errors) for arg in s])
+    return s
diff --git a/html_cleaner/bleach/linkifier.py b/html_cleaner/bleach/linkifier.py
new file mode 100644
index 0000000..fc346c3
--- /dev/null
+++ b/html_cleaner/bleach/linkifier.py
@@ -0,0 +1,526 @@
+from __future__ import unicode_literals
+import re
+
+import html5lib
+from html5lib.filters.base import Filter
+from html5lib.filters.sanitizer import allowed_protocols
+from html5lib.serializer import HTMLSerializer
+
+from bleach import callbacks as linkify_callbacks
+from bleach.encoding import force_unicode
+from bleach.utils import alphabetize_attributes
+
+
+#: List of default callbacks
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+       xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first
+TLDS.reverse()
+
+
+def build_url_re(tlds=TLDS, protocols=allowed_protocols):
+    """Builds the url regex used by linkifier
+
+   If you want a different set of tlds or allowed protocols, pass those in
+   and stomp on the existing ``url_re``::
+
+       from bleach import linkifier
+
+       my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
+
+       linker = LinkifyFilter(url_re=my_url_re)
+
+    """
+    return re.compile(
+        r"""\(*  # Match any opening parentheses.
+        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
+        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
+            # /path/zz (excluding "unsafe" chars from RFC 1738,
+            # except for # and ~, which happen in practice)
+        """.format('|'.join(protocols), '|'.join(tlds)),
+        re.IGNORECASE | re.VERBOSE | re.UNICODE)
+
+
+URL_RE = build_url_re()
+
+
+PROTO_RE = re.compile(r'^[\w-]+:/{0,3}', re.IGNORECASE)
+
+
+EMAIL_RE = re.compile(
+    r"""(?<!//)
+    (([-!#$%&'*+/=?^_`{}|~0-9A-Z]+
+        (\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*  # dot-atom
+    |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+        |\\[\001-\011\013\014\016-\177])*"  # quoted-string
+    )@(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6})  # domain
+    """,
+    re.IGNORECASE | re.MULTILINE | re.VERBOSE)
+
+
+class Linker(object):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    """
+    def __init__(self, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False,
+                 url_re=URL_RE, email_re=EMAIL_RE):
+        """Creates a Linker instance
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+        :arg list skip_tags: list of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``['pre']`` to skip
+            linkifying contents of ``pre`` tags
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg re url_re: url matching regex
+
+        :arg re email_re: email matching regex
+
+        :returns: linkified text as unicode
+
+        """
+        self.callbacks = callbacks
+        self.skip_tags = skip_tags
+        self.parse_email = parse_email
+        self.url_re = url_re
+        self.email_re = email_re
+
+        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+        self.walker = html5lib.getTreeWalker('etree')
+        self.serializer = HTMLSerializer(
+            quote_attr_values='always',
+            omit_optional_tags=False,
+
+            # linkify does not sanitize
+            sanitize=False,
+
+            # linkify alphabetizes
+            alphabetical_attributes=False,
+        )
+
+    def linkify(self, text):
+        """Linkify specified text
+
+        :arg str text: the text to add links to
+
+        :returns: linkified text as unicode
+
+        """
+        text = force_unicode(text)
+
+        if not text:
+            return u''
+
+        dom = self.parser.parseFragment(text)
+        filtered = LinkifyFilter(
+            source=self.walker(dom),
+            callbacks=self.callbacks,
+            skip_tags=self.skip_tags,
+            parse_email=self.parse_email,
+            url_re=self.url_re,
+            email_re=self.email_re,
+        )
+        return self.serializer.render(filtered)
+
+
+class LinkifyFilter(Filter):
+    """html5lib filter that linkifies text
+
+    This will do the following:
+
+    * convert email addresses into links
+    * convert urls into links
+    * edit existing links by running them through callbacks--the default is to
+      add a ``rel="nofollow"``
+
+    This filter can be used anywhere html5lib filters can be used.
+
+    """
+    def __init__(self, source, callbacks=None, skip_tags=None, parse_email=False,
+                 url_re=URL_RE, email_re=EMAIL_RE):
+        """Creates a LinkifyFilter instance
+
+        :arg TreeWalker source: stream
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+        :arg list skip_tags: list of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``['pre']`` to skip
+            linkifying contents of ``pre`` tags
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg re url_re: url matching regex
+
+        :arg re email_re: email matching regex
+
+        """
+        super(LinkifyFilter, self).__init__(source)
+
+        self.callbacks = callbacks or []
+        self.skip_tags = skip_tags or []
+        self.parse_email = parse_email
+
+        self.url_re = url_re
+        self.email_re = email_re
+
+    def apply_callbacks(self, attrs, is_new):
+        """Given an attrs dict and an is_new bool, runs through callbacks
+
+        Callbacks can return an adjusted attrs dict or ``None``. In the case of
+        ``None``, we stop going through callbacks and return that and the link
+        gets dropped.
+
+        :arg dict attrs: map of ``(namespace, name)`` -> ``value``
+
+        :arg bool is_new: whether or not this link was added by linkify
+
+        :returns: adjusted attrs dict or ``None``
+
+        """
+        for cb in self.callbacks:
+            attrs = cb(attrs, is_new)
+            if attrs is None:
+                return None
+        return attrs
+
+    def extract_character_data(self, token_list):
+        """Extracts and squashes character sequences in a token stream"""
+        # FIXME(willkg): This is a terrible idea. What it does is drop all the
+        # tags from the token list and merge the Characters and SpaceCharacters
+        # tokens into a single text.
+        #
+        # So something like this::
+        #
+        #     "<span>" "<b>" "some text" "</b>" "</span>"
+        #
+        # gets converted to "some text".
+        #
+        # This gets used to figure out the ``_text`` fauxttribute value for
+        # linkify callables.
+        #
+        # I'm not really sure how else to support that ``_text`` fauxttribute and
+        # maintain some modicum of backwards compatability with previous versions
+        # of Bleach.
+
+        out = []
+        for token in token_list:
+            token_type = token['type']
+            if token_type in ['Characters', 'SpaceCharacters']:
+                out.append(token['data'])
+
+        return u''.join(out)
+
+    def handle_email_addresses(self, src_iter):
+        """Handle email addresses in character tokens"""
+        for token in src_iter:
+            if token['type'] == 'Characters':
+                text = token['data']
+                new_tokens = []
+                end = 0
+
+                # For each email address we find in the text
+                for match in self.email_re.finditer(text):
+                    if match.start() > end:
+                        new_tokens.append(
+                            {u'type': u'Characters', u'data': text[end:match.start()]}
+                        )
+
+                    # Run attributes through the callbacks to see what we
+                    # should do with this match
+                    attrs = {
+                        (None, u'href'): u'mailto:%s' % match.group(0),
+                        u'_text': match.group(0)
+                    }
+                    attrs = self.apply_callbacks(attrs, True)
+
+                    if attrs is None:
+                        # Just add the text--but not as a link
+                        new_tokens.append(
+                            {u'type': u'Characters', u'data': match.group(0)}
+                        )
+
+                    else:
+                        # Add an "a" tag for the new link
+                        _text = attrs.pop(u'_text', '')
+                        attrs = alphabetize_attributes(attrs)
+                        new_tokens.extend([
+                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
+                            {u'type': u'Characters', u'data': force_unicode(_text)},
+                            {u'type': u'EndTag', u'name': 'a'}
+                        ])
+                    end = match.end()
+
+                if new_tokens:
+                    # Yield the adjusted set of tokens and then continue
+                    # through the loop
+                    if end < len(text):
+                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})
+
+                    for new_token in new_tokens:
+                        yield new_token
+
+                    continue
+
+            yield token
+
+    def strip_non_url_bits(self, fragment):
+        """Strips non-url bits from the url
+
+        This accounts for over-eager matching by the regex.
+
+        """
+        prefix = suffix = ''
+
+        while fragment:
+            # Try removing ( from the beginning and, if it's balanced, from the
+            # end, too
+            if fragment.startswith(u'('):
+                prefix = prefix + u'('
+                fragment = fragment[1:]
+
+                if fragment.endswith(u')'):
+                    suffix = u')' + suffix
+                    fragment = fragment[:-1]
+                continue
+
+            # Now try extraneous things from the end. For example, sometimes we
+            # pick up ) at the end of a url, but the url is in a parenthesized
+            # phrase like:
+            #
+            #     "i looked at the site (at http://example.com)"
+
+            if fragment.endswith(u')') and u'(' not in fragment:
+                fragment = fragment[:-1]
+                suffix = u')' + suffix
+                continue
+
+            # Handle commas
+            if fragment.endswith(u','):
+                fragment = fragment[:-1]
+                suffix = u',' + suffix
+                continue
+
+            # Handle periods
+            if fragment.endswith(u'.'):
+                fragment = fragment[:-1]
+                suffix = u'.' + suffix
+                continue
+
+            # Nothing matched, so we're done
+            break
+
+        return fragment, prefix, suffix
+
+    def handle_links(self, src_iter):
+        """Handle links in character tokens"""
+        for token in src_iter:
+            if token['type'] == 'Characters':
+                text = token['data']
+                new_tokens = []
+                end = 0
+
+                for match in self.url_re.finditer(text):
+                    if match.start() > end:
+                        new_tokens.append(
+                            {u'type': u'Characters', u'data': text[end:match.start()]}
+                        )
+
+                    url = match.group(0)
+                    prefix = suffix = ''
+
+                    # Sometimes we pick up too much in the url match, so look for
+                    # bits we should drop and remove them from the match
+                    url, prefix, suffix = self.strip_non_url_bits(url)
+
+                    # If there's no protocol, add one
+                    if PROTO_RE.search(url):
+                        href = url
+                    else:
+                        href = u'http://%s' % url
+
+                    attrs = {
+                        (None, u'href'): href,
+                        u'_text': url
+                    }
+                    attrs = self.apply_callbacks(attrs, True)
+
+                    if attrs is None:
+                        # Just add the text
+                        new_tokens.append(
+                            {u'type': u'Characters', u'data': prefix + url + suffix}
+                        )
+
+                    else:
+                        # Add the "a" tag!
+                        if prefix:
+                            new_tokens.append(
+                                {u'type': u'Characters', u'data': prefix}
+                            )
+
+                        _text = attrs.pop(u'_text', '')
+                        attrs = alphabetize_attributes(attrs)
+
+                        new_tokens.extend([
+                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
+                            {u'type': u'Characters', u'data': force_unicode(_text)},
+                            {u'type': u'EndTag', u'name': 'a'},
+                        ])
+
+                        if suffix:
+                            new_tokens.append(
+                                {u'type': u'Characters', u'data': suffix}
+                            )
+
+                    end = match.end()
+
+                if new_tokens:
+                    # Yield the adjusted set of tokens and then continue
+                    # through the loop
+                    if end < len(text):
+                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})
+
+                    for new_token in new_tokens:
+                        yield new_token
+
+                    continue
+
+            yield token
+
+    def handle_a_tag(self, token_buffer):
+        """Handle the "a" tag
+
+        This could adjust the link or drop it altogether depending on what the
+        callbacks return.
+
+        This yields the new set of tokens.
+
+        """
+        a_token = token_buffer[0]
+        if a_token['data']:
+            attrs = a_token['data']
+        else:
+            attrs = {}
+        text = self.extract_character_data(token_buffer)
+        attrs['_text'] = text
+
+        attrs = self.apply_callbacks(attrs, False)
+
+        if attrs is None:
+            # We're dropping the "a" tag and everything else and replacing
+            # it with character data. So emit that token.
+            yield {'type': 'Characters', 'data': text}
+
+        else:
+            new_text = attrs.pop('_text', '')
+            a_token['data'] = alphabetize_attributes(attrs)
+
+            if text == new_text:
+                # The callbacks didn't change the text, so we yield the new "a"
+                # token, then whatever else was there, then the end "a" token
+                yield a_token
+                for mem in token_buffer[1:]:
+                    yield mem
+
+            else:
+                # If the callbacks changed the text, then we're going to drop
+                # all the tokens between the start and end "a" tags and replace
+                # it with the new text
+                yield a_token
+                yield {'type': 'Characters', 'data': force_unicode(new_text)}
+                yield token_buffer[-1]
+
+    def __iter__(self):
+        in_a = False
+        in_skip_tag = None
+
+        token_buffer = []
+
+        for token in super(LinkifyFilter, self).__iter__():
+            if in_a:
+                # Handle the case where we're in an "a" tag--we want to buffer tokens
+                # until we hit an end "a" tag.
+                if token['type'] == 'EndTag' and token['name'] == 'a':
+                    # Add the end tag to the token buffer and then handle them
+                    # and yield anything returned
+                    token_buffer.append(token)
+                    for new_token in self.handle_a_tag(token_buffer):
+                        yield new_token
+
+                    # Clear "a" related state and continue since we've yielded all
+                    # the tokens we're going to yield
+                    in_a = False
+                    token_buffer = []
+                    continue
+
+                else:
+                    token_buffer.append(token)
+                    continue
+
+            elif token['type'] in ['StartTag', 'EmptyTag']:
+                if token['name'] in self.skip_tags:
+                    # Skip tags start a "special mode" where we don't linkify
+                    # anything until the end tag.
+                    in_skip_tag = token['name']
+
+                elif token['name'] == 'a':
+                    # The "a" tag is special--we switch to a slurp mode and
+                    # slurp all the tokens until the end "a" tag and then
+                    # figure out what to do with them there.
+                    in_a = True
+                    token_buffer.append(token)
+
+                    # We buffer the start tag, so we don't want to yield it,
+                    # yet
+                    continue
+
+            elif in_skip_tag and self.skip_tags:
+                # NOTE(willkg): We put this clause here since in_a and
+                # switching in and out of in_a takes precedence.
+                if token['type'] == 'EndTag' and token['name'] == in_skip_tag:
+                    in_skip_tag = None
+
+            elif not in_a and not in_skip_tag and token['type'] == 'Characters':
+                new_stream = iter([token])
+                if self.parse_email:
+                    new_stream = self.handle_email_addresses(new_stream)
+
+                new_stream = self.handle_links(new_stream)
+
+                for token in new_stream:
+                    yield token
+
+                # We've already yielded this token, so continue
+                continue
+
+            yield token
diff --git a/html_cleaner/bleach/sanitizer.py b/html_cleaner/bleach/sanitizer.py
new file mode 100644
index 0000000..539711a
--- /dev/null
+++ b/html_cleaner/bleach/sanitizer.py
@@ -0,0 +1,368 @@
+from __future__ import unicode_literals
+import re
+from xml.sax.saxutils import unescape
+
+import html5lib
+from html5lib.constants import namespaces
+from html5lib.filters import sanitizer
+from html5lib.serializer import HTMLSerializer
+
+from bleach.encoding import force_unicode
+from bleach.utils import alphabetize_attributes
+
+
+#: List of allowed tags
+ALLOWED_TAGS = [
+    'a',
+    'abbr',
+    'acronym',
+    'b',
+    'blockquote',
+    'code',
+    'em',
+    'i',
+    'li',
+    'ol',
+    'strong',
+    'ul',
+]
+
+
+#: Map of allowed attributes by tag
+ALLOWED_ATTRIBUTES = {
+    'a': ['href', 'title'],
+    'abbr': ['title'],
+    'acronym': ['title'],
+}
+
+
+#: List of allowed styles
+ALLOWED_STYLES = []
+
+
+#: List of allowed protocols
+ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']
+
+
+class Cleaner(object):
+    """Cleaner for cleaning HTML fragments of malicious content
+
+    This cleaner is a security-focused function whose sole purpose is to remove
+    malicious content from a string such that it can be displayed as content in
+    a web page.
+
+    This cleaner is not designed to use to transform content to be used in
+    non-web-page contexts.
+
+    To use::
+
+        from bleach.sanitizer import Cleaner
+
+        cleaner = Cleaner()
+
+        for text in all_the_yucky_things:
+            sanitized = cleaner.clean(text)
+
+    """
+
+    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+                 strip_comments=True, filters=None):
+        """Initializes a Cleaner
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip: whether or not to strip disallowed elements
+
+        :arg bool strip_comments: whether or not to strip HTML comments
+
+        :arg list filters: list of html5lib Filter classes to pass streamed content through
+
+            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters
+
+            .. Warning::
+
+               Using filters changes the output of ``bleach.Cleaner.clean``.
+               Make sure the way the filters change the output are secure.
+
+        """
+        self.tags = tags
+        self.attributes = attributes
+        self.styles = styles
+        self.protocols = protocols
+        self.strip = strip
+        self.strip_comments = strip_comments
+        self.filters = filters or []
+
+        self.parser = html5lib.HTMLParser(namespaceHTMLElements=False)
+        self.walker = html5lib.getTreeWalker('etree')
+        self.serializer = HTMLSerializer(
+            quote_attr_values='always',
+            omit_optional_tags=False,
+
+            # Bleach has its own sanitizer, so don't use the html5lib one
+            sanitize=False,
+
+            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
+            alphabetical_attributes=False,
+        )
+
+    def clean(self, text):
+        """Cleans text and returns sanitized result as unicode
+
+        :arg str text: text to be cleaned
+
+        :returns: sanitized text as unicode
+
+        """
+        if not text:
+            return u''
+
+        text = force_unicode(text)
+
+        dom = self.parser.parseFragment(text)
+        filtered = BleachSanitizerFilter(
+            source=self.walker(dom),
+
+            # Bleach-sanitizer-specific things
+            attributes=self.attributes,
+            strip_disallowed_elements=self.strip,
+            strip_html_comments=self.strip_comments,
+
+            # html5lib-sanitizer things
+            allowed_elements=self.tags,
+            allowed_css_properties=self.styles,
+            allowed_protocols=self.protocols,
+            allowed_svg_properties=[],
+        )
+
+        # Apply any filters after the BleachSanitizerFilter
+        for filter_class in self.filters:
+            filtered = filter_class(source=filtered)
+
+        return self.serializer.render(filtered)
+
+
+def attribute_filter_factory(attributes):
+    """Generates attribute filter function for the given attributes value
+
+    The attributes value can take one of several shapes. This returns a filter
+    function appropriate to the attributes value. One nice thing about this is
+    that there's less if/then shenanigans in the ``allow_token`` method.
+
+    """
+    if callable(attributes):
+        return attributes
+
+    if isinstance(attributes, dict):
+        def _attr_filter(tag, attr, value):
+            if tag in attributes:
+                attr_val = attributes[tag]
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                if attr in attr_val:
+                    return True
+
+            if '*' in attributes:
+                attr_val = attributes['*']
+                if callable(attr_val):
+                    return attr_val(tag, attr, value)
+
+                return attr in attr_val
+
+            return False
+
+        return _attr_filter
+
+    if isinstance(attributes, list):
+        def _attr_filter(tag, attr, value):
+            return attr in attributes
+
+        return _attr_filter
+
+    raise ValueError('attributes needs to be a callable, a list or a dict')
+
+
+class BleachSanitizerFilter(sanitizer.Filter):
+    """html5lib Filter that sanitizes text
+
+    This filter can be used anywhere html5lib filters can be used.
+
+    """
+    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
+                 strip_disallowed_elements=False, strip_html_comments=True,
+                 **kwargs):
+        """Creates a BleachSanitizerFilter instance
+
+        :arg Treewalker source: stream
+
+        :arg list tags: allowed list of tags; defaults to
+            ``bleach.sanitizer.ALLOWED_TAGS``
+
+        :arg dict attributes: allowed attributes; can be a callable, list or dict;
+            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+        :arg list styles: allowed list of css styles; defaults to
+            ``bleach.sanitizer.ALLOWED_STYLES``
+
+        :arg list protocols: allowed list of protocols for links; defaults
+            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+        :arg bool strip_disallowed_elements: whether or not to strip disallowed
+            elements
+
+        :arg bool strip_html_comments: whether or not to strip HTML comments
+
+        """
+        self.attr_filter = attribute_filter_factory(attributes)
+
+        self.strip_disallowed_elements = strip_disallowed_elements
+        self.strip_html_comments = strip_html_comments
+
+        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)
+
+    def sanitize_token(self, token):
+        """Sanitize a token either by HTML-encoding or dropping.
+
+        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
+        ['attribute', 'pairs'], 'tag': callable}.
+
+        Here callable is a function with two arguments of attribute name and
+        value. It should return true of false.
+
+        Also gives the option to strip tags instead of encoding.
+
+        """
+        token_type = token['type']
+        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
+            if token['name'] in self.allowed_elements:
+                return self.allow_token(token)
+
+            elif self.strip_disallowed_elements:
+                pass
+
+            else:
+                if 'data' in token:
+                    # Alphabetize the attributes before calling .disallowed_token()
+                    # so that the resulting string is stable
+                    token['data'] = alphabetize_attributes(token['data'])
+                return self.disallowed_token(token)
+
+        elif token_type == 'Comment':
+            if not self.strip_html_comments:
+                return token
+
+        else:
+            return token
+
+    def allow_token(self, token):
+        """Handles the case where we're allowing the tag"""
+        if 'data' in token:
+            # Loop through all the attributes and drop the ones that are not
+            # allowed, are unsafe or break other rules. Additionally, fix
+            # attribute values that need fixing.
+            #
+            # At the end of this loop, we have the final set of attributes
+            # we're keeping.
+            attrs = {}
+            for namespaced_name, val in token['data'].items():
+                namespace, name = namespaced_name
+
+                # Drop attributes that are not explicitly allowed
+                #
+                # NOTE(willkg): We pass in the attribute name--not a namespaced
+                # name.
+                if not self.attr_filter(token['name'], name, val):
+                    continue
+
+                # Look at attributes that have uri values
+                if namespaced_name in self.attr_val_is_uri:
+                    val_unescaped = re.sub(
+                        "[`\000-\040\177-\240\s]+",
+                        '',
+                        unescape(val)).lower()
+
+                    # Remove replacement characters from unescaped characters.
+                    val_unescaped = val_unescaped.replace("\ufffd", "")
+
+                    # Drop attributes with uri values that have protocols that
+                    # aren't allowed
+                    if (re.match(r'^[a-z0-9][-+.a-z0-9]*:', val_unescaped) and
+                            (val_unescaped.split(':')[0] not in self.allowed_protocols)):
+                        continue
+
+                # Drop values in svg attrs with non-local IRIs
+                if namespaced_name in self.svg_attr_val_allows_ref:
+                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                     ' ',
+                                     unescape(val))
+                    new_val = new_val.strip()
+                    if not new_val:
+                        continue
+
+                    else:
+                        # Replace the val with the unescaped version because
+                        # it's a iri
+                        val = new_val
+
+                # Drop href and xlink:href attr for svg elements with non-local IRIs
+                if (None, token['name']) in self.svg_allow_local_href:
+                    if namespaced_name in [(None, 'href'), (namespaces['xlink'], 'href')]:
+                        if re.search(r'^\s*[^#\s]', val):
+                            continue
+
+                # If it's a style attribute, sanitize it
+                if namespaced_name == (None, u'style'):
+                    val = self.sanitize_css(val)
+
+                # At this point, we want to keep the attribute, so add it in
+                attrs[namespaced_name] = val
+
+            token['data'] = alphabetize_attributes(attrs)
+
+        return token
+
+    def sanitize_css(self, style):
+        """Sanitizes css in style tags"""
+        # disallow urls
+        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+        # gauntlet
+
+        # Validate the css in the style tag and if it's not valid, then drop
+        # the whole thing.
+        parts = style.split(';')
+        gauntlet = re.compile(
+            r"""^([-/:,#%.'"\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'\s*|"[\s\w]+"|\([\d,%\.\s]+\))*$"""
+        )
+
+        for part in parts:
+            if not gauntlet.match(part):
+                return ''
+
+        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall('([-\w]+)\s*:\s*([^:;]*)', style):
+            if not value:
+                continue
+
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
diff --git a/html_cleaner/bleach/utils.py b/html_cleaner/bleach/utils.py
new file mode 100644
index 0000000..d9c211f
--- /dev/null
+++ b/html_cleaner/bleach/utils.py
@@ -0,0 +1,23 @@
+from collections import OrderedDict
+
+
+def _attr_key(attr):
+    """Returns appropriate key for sorting attribute names
+
+    Attribute names are a tuple of ``(namespace, name)`` where namespace can be
+    ``None`` or a string. These can't be compared in Python 3, so we conver the
+    ``None`` to an empty string.
+
+    """
+    key = (attr[0][0] or ''), attr[0][1]
+    return key
+
+
+def alphabetize_attributes(attrs):
+    """Takes a dict of attributes (or None) and returns them alphabetized"""
+    if not attrs:
+        return attrs
+
+    return OrderedDict(
+        [(k, v) for k, v in sorted(attrs.items(), key=_attr_key)]
+    )
diff --git a/html_cleaner/bleach/version.py b/html_cleaner/bleach/version.py
new file mode 100644
index 0000000..bcd8aff
--- /dev/null
+++ b/html_cleaner/bleach/version.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+VERSION = (2, 0, 0)
+__version__ = '.'.join([str(n) for n in VERSION])
diff --git a/html_cleaner/html5lib/__init__.py b/html_cleaner/html5lib/__init__.py
new file mode 100644
index 0000000..8ee9b53
--- /dev/null
+++ b/html_cleaner/html5lib/__init__.py
@@ -0,0 +1,25 @@
+"""
+HTML parsing library based on the WHATWG "HTML5"
+specification. The parser is designed to be compatible with existing
+HTML found in the wild and implements well-defined error recovery that
+is largely compatible with modern desktop web browsers.
+
+Example usage:
+
+import html5lib
+f = open("my_document.html")
+tree = html5lib.parse(f)
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .html5parser import HTMLParser, parse, parseFragment
+from .treebuilders import getTreeBuilder
+from .treewalkers import getTreeWalker
+from .serializer import serialize
+
+__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
+           "getTreeWalker", "serialize"]
+
+# this has to be at the top level, see how setup.py parses this
+__version__ = "0.999999999"
diff --git a/html_cleaner/html5lib/_ihatexml.py b/html_cleaner/html5lib/_ihatexml.py
new file mode 100644
index 0000000..d6d1d6f
--- /dev/null
+++ b/html_cleaner/html5lib/_ihatexml.py
@@ -0,0 +1,288 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+import warnings
+
+from .constants import DataLossWarning
+
+baseChar = """
+[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
+[#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
+[#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
+[#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
+[#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
+[#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
+[#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
+[#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
+[#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
+[#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
+[#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
+[#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
+[#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
+[#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
+[#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
+[#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
+[#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
+[#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
+[#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
+[#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
+[#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
+[#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
+[#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
+[#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
+[#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
+[#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
+[#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
+[#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
+[#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
+[#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
+#x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
+#x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
+#x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
+[#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
+[#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
+#x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
+[#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
+[#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
+[#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
+[#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
+[#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
+#x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
+[#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
+[#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
+[#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
+[#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
+
+ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
+
+combiningCharacter = """
+[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
+[#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
+[#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
+[#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
+#x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
+[#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
+[#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
+#x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
+[#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
+[#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
+#x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
+[#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
+[#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
+[#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
+[#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
+[#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
+#x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
+[#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
+#x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
+[#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
+[#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
+#x3099 | #x309A"""
+
+digit = """
+[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
+[#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
+[#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
+[#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
+
+extender = """
+#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
+#[#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
+
+letter = " | ".join([baseChar, ideographic])
+
+# Without the
+name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
+                   extender])
+nameFirst = " | ".join([letter, "_"])
+
+reChar = re.compile(r"#x([\d|A-F]{4,4})")
+reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
+
+
+def charStringToList(chars):
+    charRanges = [item.strip() for item in chars.split(" | ")]
+    rv = []
+    for item in charRanges:
+        foundMatch = False
+        for regexp in (reChar, reCharRange):
+            match = regexp.match(item)
+            if match is not None:
+                rv.append([hexToInt(item) for item in match.groups()])
+                if len(rv[-1]) == 1:
+                    rv[-1] = rv[-1] * 2
+                foundMatch = True
+                break
+        if not foundMatch:
+            assert len(item) == 1
+
+            rv.append([ord(item)] * 2)
+    rv = normaliseCharList(rv)
+    return rv
+
+
+def normaliseCharList(charList):
+    charList = sorted(charList)
+    for item in charList:
+        assert item[1] >= item[0]
+    rv = []
+    i = 0
+    while i < len(charList):
+        j = 1
+        rv.append(charList[i])
+        while i + j < len(charList) and charList[i + j][0] <= rv[-1][1] + 1:
+            rv[-1][1] = charList[i + j][1]
+            j += 1
+        i += j
+    return rv
+
+# We don't really support characters above the BMP :(
+max_unicode = int("FFFF", 16)
+
+
+def missingRanges(charList):
+    rv = []
+    if charList[0] != 0:
+        rv.append([0, charList[0][0] - 1])
+    for i, item in enumerate(charList[:-1]):
+        rv.append([item[1] + 1, charList[i + 1][0] - 1])
+    if charList[-1][1] != max_unicode:
+        rv.append([charList[-1][1] + 1, max_unicode])
+    return rv
+
+
+def listToRegexpStr(charList):
+    rv = []
+    for item in charList:
+        if item[0] == item[1]:
+            rv.append(escapeRegexp(chr(item[0])))
+        else:
+            rv.append(escapeRegexp(chr(item[0])) + "-" +
+                      escapeRegexp(chr(item[1])))
+    return "[%s]" % "".join(rv)
+
+
+def hexToInt(hex_str):
+    return int(hex_str, 16)
+
+
+def escapeRegexp(string):
+    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
+                         "[", "]", "|", "(", ")", "-")
+    for char in specialCharacters:
+        string = string.replace(char, "\\" + char)
+
+    return string
+
+# output from the above
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
+
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
+
+# Simpler things
+nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
+
+
+class InfosetFilter(object):
+    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
+
+    def __init__(self,
+                 dropXmlnsLocalName=False,
+                 dropXmlnsAttrNs=False,
+                 preventDoubleDashComments=False,
+                 preventDashAtCommentEnd=False,
+                 replaceFormFeedCharacters=True,
+                 preventSingleQuotePubid=False):
+
+        self.dropXmlnsLocalName = dropXmlnsLocalName
+        self.dropXmlnsAttrNs = dropXmlnsAttrNs
+
+        self.preventDoubleDashComments = preventDoubleDashComments
+        self.preventDashAtCommentEnd = preventDashAtCommentEnd
+
+        self.replaceFormFeedCharacters = replaceFormFeedCharacters
+
+        self.preventSingleQuotePubid = preventSingleQuotePubid
+
+        self.replaceCache = {}
+
+    def coerceAttribute(self, name, namespace=None):
+        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
+            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
+            return None
+        elif (self.dropXmlnsAttrNs and
+              namespace == "http://www.w3.org/2000/xmlns/"):
+            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
+            return None
+        else:
+            return self.toXmlName(name)
+
+    def coerceElement(self, name):
+        return self.toXmlName(name)
+
+    def coerceComment(self, data):
+        if self.preventDoubleDashComments:
+            while "--" in data:
+                warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
+                data = data.replace("--", "- -")
+            if data.endswith("-"):
+                warnings.warn("Comments cannot end in a dash", DataLossWarning)
+                data += " "
+        return data
+
+    def coerceCharacters(self, data):
+        if self.replaceFormFeedCharacters:
+            for _ in range(data.count("\x0C")):
+                warnings.warn("Text cannot contain U+000C", DataLossWarning)
+            data = data.replace("\x0C", " ")
+        # Other non-xml characters
+        return data
+
+    def coercePubid(self, data):
+        dataOutput = data
+        for char in nonPubidCharRegexp.findall(data):
+            warnings.warn("Coercing non-XML pubid", DataLossWarning)
+            replacement = self.getReplacementCharacter(char)
+            dataOutput = dataOutput.replace(char, replacement)
+        if self.preventSingleQuotePubid and dataOutput.find("'") >= 0:
+            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
+            dataOutput = dataOutput.replace("'", self.getReplacementCharacter("'"))
+        return dataOutput
+
+    def toXmlName(self, name):
+        nameFirst = name[0]
+        nameRest = name[1:]
+        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
+        if m:
+            warnings.warn("Coercing non-XML name", DataLossWarning)
+            nameFirstOutput = self.getReplacementCharacter(nameFirst)
+        else:
+            nameFirstOutput = nameFirst
+
+        nameRestOutput = nameRest
+        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
+        for char in replaceChars:
+            warnings.warn("Coercing non-XML name", DataLossWarning)
+            replacement = self.getReplacementCharacter(char)
+            nameRestOutput = nameRestOutput.replace(char, replacement)
+        return nameFirstOutput + nameRestOutput
+
+    def getReplacementCharacter(self, char):
+        if char in self.replaceCache:
+            replacement = self.replaceCache[char]
+        else:
+            replacement = self.escapeChar(char)
+        return replacement
+
+    def fromXmlName(self, name):
+        for item in set(self.replacementRegexp.findall(name)):
+            name = name.replace(item, self.unescapeChar(item))
+        return name
+
+    def escapeChar(self, char):
+        replacement = "U%05X" % ord(char)
+        self.replaceCache[char] = replacement
+        return replacement
+
+    def unescapeChar(self, charcode):
+        return chr(int(charcode[1:], 16))
diff --git a/html_cleaner/html5lib/_inputstream.py b/html_cleaner/html5lib/_inputstream.py
new file mode 100644
index 0000000..79f2331
--- /dev/null
+++ b/html_cleaner/html5lib/_inputstream.py
@@ -0,0 +1,923 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from six import text_type, binary_type
+from six.moves import http_client, urllib
+
+import codecs
+import re
+
+import webencodings
+
+from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
+from .constants import ReparseException
+from . import _utils
+
+from io import StringIO
+
+try:
+    from io import BytesIO
+except ImportError:
+    BytesIO = StringIO
+
+# Non-unicode versions of constants for use in the pre-parser
+spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
+asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
+asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
+spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
+
+
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa
+
+if _utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # eval. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
+                                    "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
+
+non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
+                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
+                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
+                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
+                                  0x10FFFE, 0x10FFFF])
+
+ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+
+# Cache for charsUntil()
+charsUntilRegEx = {}
+
+
+class BufferedStream(object):
+    """Buffering for streams that do not have buffering of their own
+
+    The buffer is implemented as a list of chunks on the assumption that
+    joining many strings will be slow since it is O(n**2)
+    """
+
+    def __init__(self, stream):
+        self.stream = stream
+        self.buffer = []
+        self.position = [-1, 0]  # chunk number, offset
+
+    def tell(self):
+        pos = 0
+        for chunk in self.buffer[:self.position[0]]:
+            pos += len(chunk)
+        pos += self.position[1]
+        return pos
+
+    def seek(self, pos):
+        assert pos <= self._bufferedBytes()
+        offset = pos
+        i = 0
+        while len(self.buffer[i]) < offset:
+            offset -= len(self.buffer[i])
+            i += 1
+        self.position = [i, offset]
+
+    def read(self, bytes):
+        if not self.buffer:
+            return self._readStream(bytes)
+        elif (self.position[0] == len(self.buffer) and
+              self.position[1] == len(self.buffer[-1])):
+            return self._readStream(bytes)
+        else:
+            return self._readFromBuffer(bytes)
+
+    def _bufferedBytes(self):
+        return sum([len(item) for item in self.buffer])
+
+    def _readStream(self, bytes):
+        data = self.stream.read(bytes)
+        self.buffer.append(data)
+        self.position[0] += 1
+        self.position[1] = len(data)
+        return data
+
+    def _readFromBuffer(self, bytes):
+        remainingBytes = bytes
+        rv = []
+        bufferIndex = self.position[0]
+        bufferOffset = self.position[1]
+        while bufferIndex < len(self.buffer) and remainingBytes != 0:
+            assert remainingBytes > 0
+            bufferedData = self.buffer[bufferIndex]
+
+            if remainingBytes <= len(bufferedData) - bufferOffset:
+                bytesToRead = remainingBytes
+                self.position = [bufferIndex, bufferOffset + bytesToRead]
+            else:
+                bytesToRead = len(bufferedData) - bufferOffset
+                self.position = [bufferIndex, len(bufferedData)]
+                bufferIndex += 1
+            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
+            remainingBytes -= bytesToRead
+
+            bufferOffset = 0
+
+        if remainingBytes:
+            rv.append(self._readStream(remainingBytes))
+
+        return b"".join(rv)
+
+
+def HTMLInputStream(source, **kwargs):
+    # Work around Python bug #20007: read(0) closes the connection.
+    # http://bugs.python.org/issue20007
+    if (isinstance(source, http_client.HTTPResponse) or
+        # Also check for addinfourl wrapping HTTPResponse
+        (isinstance(source, urllib.response.addbase) and
+         isinstance(source.fp, http_client.HTTPResponse))):
+        isUnicode = False
+    elif hasattr(source, "read"):
+        isUnicode = isinstance(source.read(0), text_type)
+    else:
+        isUnicode = isinstance(source, text_type)
+
+    if isUnicode:
+        encodings = [x for x in kwargs if x.endswith("_encoding")]
+        if encodings:
+            raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
+
+        return HTMLUnicodeInputStream(source, **kwargs)
+    else:
+        return HTMLBinaryInputStream(source, **kwargs)
+
+
+class HTMLUnicodeInputStream(object):
+    """Provides a unicode stream of characters to the HTMLTokenizer.
+
+    This class takes care of character encoding and removing or replacing
+    incorrect byte-sequences and also provides column and line tracking.
+
+    """
+
+    _defaultChunkSize = 10240
+
+    def __init__(self, source):
+        """Initialises the HTMLInputStream.
+
+        HTMLInputStream(source, [encoding]) -> Normalized stream from source
+        for use by html5lib.
+
+        source can be either a file-object, local filename or a string.
+
+        The optional encoding parameter must be a string that indicates
+        the encoding.  If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+
+        """
+
+        if not _utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+        elif len("\U0010FFFF") == 1:
+            self.reportCharacterErrors = self.characterErrorsUCS4
+        else:
+            self.reportCharacterErrors = self.characterErrorsUCS2
+
+        # List of where new lines occur
+        self.newLines = [0]
+
+        self.charEncoding = (lookupEncoding("utf-8"), "certain")
+        self.dataStream = self.openStream(source)
+
+        self.reset()
+
+    def reset(self):
+        self.chunk = ""
+        self.chunkSize = 0
+        self.chunkOffset = 0
+        self.errors = []
+
+        # number of (complete) lines in previous chunks
+        self.prevNumLines = 0
+        # number of columns in the last line of the previous chunk
+        self.prevNumCols = 0
+
+        # Deal with CR LF and surrogates split over chunk boundaries
+        self._bufferedCharacter = None
+
+    def openStream(self, source):
+        """Produces a file object from source.
+
+        source can be either a file object, local filename or a string.
+
+        """
+        # Already a file object
+        if hasattr(source, 'read'):
+            stream = source
+        else:
+            stream = StringIO(source)
+
+        return stream
+
+    def _position(self, offset):
+        chunk = self.chunk
+        nLines = chunk.count('\n', 0, offset)
+        positionLine = self.prevNumLines + nLines
+        lastLinePos = chunk.rfind('\n', 0, offset)
+        if lastLinePos == -1:
+            positionColumn = self.prevNumCols + offset
+        else:
+            positionColumn = offset - (lastLinePos + 1)
+        return (positionLine, positionColumn)
+
+    def position(self):
+        """Returns (line, col) of the current position in the stream."""
+        line, col = self._position(self.chunkOffset)
+        return (line + 1, col)
+
+    def char(self):
+        """ Read one character from the stream or queue if available. Return
+            EOF when EOF is reached.
+        """
+        # Read a new chunk from the input stream if necessary
+        if self.chunkOffset >= self.chunkSize:
+            if not self.readChunk():
+                return EOF
+
+        chunkOffset = self.chunkOffset
+        char = self.chunk[chunkOffset]
+        self.chunkOffset = chunkOffset + 1
+
+        return char
+
+    def readChunk(self, chunkSize=None):
+        if chunkSize is None:
+            chunkSize = self._defaultChunkSize
+
+        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
+
+        self.chunk = ""
+        self.chunkSize = 0
+        self.chunkOffset = 0
+
+        data = self.dataStream.read(chunkSize)
+
+        # Deal with CR LF and surrogates broken across chunks
+        if self._bufferedCharacter:
+            data = self._bufferedCharacter + data
+            self._bufferedCharacter = None
+        elif not data:
+            # We have no more data, bye-bye stream
+            return False
+
+        if len(data) > 1:
+            lastv = ord(data[-1])
+            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
+                self._bufferedCharacter = data[-1]
+                data = data[:-1]
+
+        if self.reportCharacterErrors:
+            self.reportCharacterErrors(data)
+
+        # Replace invalid characters
+        data = data.replace("\r\n", "\n")
+        data = data.replace("\r", "\n")
+
+        self.chunk = data
+        self.chunkSize = len(data)
+
+        return True
+
+    def characterErrorsUCS4(self, data):
+        for _ in range(len(invalid_unicode_re.findall(data))):
+            self.errors.append("invalid-codepoint")
+
+    def characterErrorsUCS2(self, data):
+        # Someone picked the wrong compile option
+        # You lose
+        skip = False
+        for match in invalid_unicode_re.finditer(data):
+            if skip:
+                continue
+            codepoint = ord(match.group())
+            pos = match.start()
+            # Pretty sure there should be endianness issues here
+            if _utils.isSurrogatePair(data[pos:pos + 2]):
+                # We have a surrogate pair!
+                char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
+                if char_val in non_bmp_invalid_codepoints:
+                    self.errors.append("invalid-codepoint")
+                skip = True
+            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
+                  pos == len(data) - 1):
+                self.errors.append("invalid-codepoint")
+            else:
+                skip = False
+                self.errors.append("invalid-codepoint")
+
+    def charsUntil(self, characters, opposite=False):
+        """ Returns a string of characters from the stream up to but not
+        including any character in 'characters' or EOF. 'characters' must be
+        a container that supports the 'in' method and iteration over its
+        characters.
+        """
+
+        # Use a cache of regexps to find the required characters
+        try:
+            chars = charsUntilRegEx[(characters, opposite)]
+        except KeyError:
+            if __debug__:
+                for c in characters:
+                    assert(ord(c) < 128)
+            regex = "".join(["\\x%02x" % ord(c) for c in characters])
+            if not opposite:
+                regex = "^%s" % regex
+            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
+
+        rv = []
+
+        while True:
+            # Find the longest matching prefix
+            m = chars.match(self.chunk, self.chunkOffset)
+            if m is None:
+                # If nothing matched, and it wasn't because we ran out of chunk,
+                # then stop
+                if self.chunkOffset != self.chunkSize:
+                    break
+            else:
+                end = m.end()
+                # If not the whole chunk matched, return everything
+                # up to the part that didn't match
+                if end != self.chunkSize:
+                    rv.append(self.chunk[self.chunkOffset:end])
+                    self.chunkOffset = end
+                    break
+            # If the whole remainder of the chunk matched,
+            # use it all and read the next chunk
+            rv.append(self.chunk[self.chunkOffset:])
+            if not self.readChunk():
+                # Reached EOF
+                break
+
+        r = "".join(rv)
+        return r
+
+    def unget(self, char):
+        # Only one character is allowed to be ungotten at once - it must
+        # be consumed again before any further call to unget
+        if char is not None:
+            if self.chunkOffset == 0:
+                # unget is called quite rarely, so it's a good idea to do
+                # more work here if it saves a bit of work in the frequently
+                # called char and charsUntil.
+                # So, just prepend the ungotten character onto the current
+                # chunk:
+                self.chunk = char + self.chunk
+                self.chunkSize += 1
+            else:
+                self.chunkOffset -= 1
+                assert self.chunk[self.chunkOffset] == char
+
+
+class HTMLBinaryInputStream(HTMLUnicodeInputStream):
+    """Provides a unicode stream of characters to the HTMLTokenizer.
+
+    This class takes care of character encoding and removing or replacing
+    incorrect byte-sequences and also provides column and line tracking.
+
+    """
+
+    def __init__(self, source, override_encoding=None, transport_encoding=None,
+                 same_origin_parent_encoding=None, likely_encoding=None,
+                 default_encoding="windows-1252", useChardet=True):
+        """Initialises the HTMLInputStream.
+
+        HTMLInputStream(source, [encoding]) -> Normalized stream from source
+        for use by html5lib.
+
+        source can be either a file-object, local filename or a string.
+
+        The optional encoding parameter must be a string that indicates
+        the encoding.  If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+
+        """
+        # Raw Stream - for unicode objects this will encode to utf-8 and set
+        #              self.charEncoding as appropriate
+        self.rawStream = self.openStream(source)
+
+        HTMLUnicodeInputStream.__init__(self, self.rawStream)
+
+        # Encoding Information
+        # Number of bytes to use when looking for a meta element with
+        # encoding information
+        self.numBytesMeta = 1024
+        # Number of bytes to use when using detecting encoding using chardet
+        self.numBytesChardet = 100
+        # Things from args
+        self.override_encoding = override_encoding
+        self.transport_encoding = transport_encoding
+        self.same_origin_parent_encoding = same_origin_parent_encoding
+        self.likely_encoding = likely_encoding
+        self.default_encoding = default_encoding
+
+        # Determine encoding
+        self.charEncoding = self.determineEncoding(useChardet)
+        assert self.charEncoding[0] is not None
+
+        # Call superclass
+        self.reset()
+
+    def reset(self):
+        self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
+        HTMLUnicodeInputStream.reset(self)
+
+    def openStream(self, source):
+        """Produces a file object from source.
+
+        source can be either a file object, local filename or a string.
+
+        """
+        # Already a file object
+        if hasattr(source, 'read'):
+            stream = source
+        else:
+            stream = BytesIO(source)
+
+        try:
+            stream.seek(stream.tell())
+        except:  # pylint:disable=bare-except
+            stream = BufferedStream(stream)
+
+        return stream
+
+    def determineEncoding(self, chardet=True):
+        # BOMs take precedence over everything
+        # This will also read past the BOM if present
+        charEncoding = self.detectBOM(), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # If we've been overriden, we've been overriden
+        charEncoding = lookupEncoding(self.override_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Now check the transport layer
+        charEncoding = lookupEncoding(self.transport_encoding), "certain"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Look for meta elements with encoding information
+        charEncoding = self.detectEncodingMeta(), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Parent document encoding
+        charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
+        if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
+            return charEncoding
+
+        # "likely" encoding
+        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Guess with chardet, if available
+        if chardet:
+            try:
+                from chardet.universaldetector import UniversalDetector
+            except ImportError:
+                pass
+            else:
+                buffers = []
+                detector = UniversalDetector()
+                while not detector.done:
+                    buffer = self.rawStream.read(self.numBytesChardet)
+                    assert isinstance(buffer, bytes)
+                    if not buffer:
+                        break
+                    buffers.append(buffer)
+                    detector.feed(buffer)
+                detector.close()
+                encoding = lookupEncoding(detector.result['encoding'])
+                self.rawStream.seek(0)
+                if encoding is not None:
+                    return encoding, "tentative"
+
+        # Try the default encoding
+        charEncoding = lookupEncoding(self.default_encoding), "tentative"
+        if charEncoding[0] is not None:
+            return charEncoding
+
+        # Fallback to html5lib's default if even that hasn't worked
+        return lookupEncoding("windows-1252"), "tentative"
+
+    def changeEncoding(self, newEncoding):
+        assert self.charEncoding[1] != "certain"
+        newEncoding = lookupEncoding(newEncoding)
+        if newEncoding is None:
+            return
+        if newEncoding.name in ("utf-16be", "utf-16le"):
+            newEncoding = lookupEncoding("utf-8")
+            assert newEncoding is not None
+        elif newEncoding == self.charEncoding[0]:
+            self.charEncoding = (self.charEncoding[0], "certain")
+        else:
+            self.rawStream.seek(0)
+            self.charEncoding = (newEncoding, "certain")
+            self.reset()
+            raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
+
+    def detectBOM(self):
+        """Attempts to detect at BOM at the start of the stream. If
+        an encoding can be determined from the BOM return the name of the
+        encoding otherwise return None"""
+        bomDict = {
+            codecs.BOM_UTF8: 'utf-8',
+            codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
+            codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
+        }
+
+        # Go to beginning of file and read in 4 bytes
+        string = self.rawStream.read(4)
+        assert isinstance(string, bytes)
+
+        # Try detecting the BOM using bytes from the string
+        encoding = bomDict.get(string[:3])         # UTF-8
+        seek = 3
+        if not encoding:
+            # Need to detect UTF-32 before UTF-16
+            encoding = bomDict.get(string)         # UTF-32
+            seek = 4
+            if not encoding:
+                encoding = bomDict.get(string[:2])  # UTF-16
+                seek = 2
+
+        # Set the read position past the BOM if one was found, otherwise
+        # set it to the start of the stream
+        if encoding:
+            self.rawStream.seek(seek)
+            return lookupEncoding(encoding)
+        else:
+            self.rawStream.seek(0)
+            return None
+
+    def detectEncodingMeta(self):
+        """Report the encoding declared by the meta element
+        """
+        buffer = self.rawStream.read(self.numBytesMeta)
+        assert isinstance(buffer, bytes)
+        parser = EncodingParser(buffer)
+        self.rawStream.seek(0)
+        encoding = parser.getEncoding()
+
+        if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
+            encoding = lookupEncoding("utf-8")
+
+        return encoding
+
+
+class EncodingBytes(bytes):
+    """String-like object with an associated position and various extra methods
+    If the position is ever greater than the string length then an exception is
+    raised"""
+    def __new__(self, value):
+        assert isinstance(value, bytes)
+        return bytes.__new__(self, value.lower())
+
+    def __init__(self, value):
+        # pylint:disable=unused-argument
+        self._position = -1
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        p = self._position = self._position + 1
+        if p >= len(self):
+            raise StopIteration
+        elif p < 0:
+            raise TypeError
+        return self[p:p + 1]
+
+    def next(self):
+        # Py2 compat
+        return self.__next__()
+
+    def previous(self):
+        p = self._position
+        if p >= len(self):
+            raise StopIteration
+        elif p < 0:
+            raise TypeError
+        self._position = p = p - 1
+        return self[p:p + 1]
+
+    def setPosition(self, position):
+        if self._position >= len(self):
+            raise StopIteration
+        self._position = position
+
+    def getPosition(self):
+        if self._position >= len(self):
+            raise StopIteration
+        if self._position >= 0:
+            return self._position
+        else:
+            return None
+
+    position = property(getPosition, setPosition)
+
+    def getCurrentByte(self):
+        return self[self.position:self.position + 1]
+
+    currentByte = property(getCurrentByte)
+
+    def skip(self, chars=spaceCharactersBytes):
+        """Skip past a list of characters"""
+        p = self.position               # use property for the error-checking
+        while p < len(self):
+            c = self[p:p + 1]
+            if c not in chars:
+                self._position = p
+                return c
+            p += 1
+        self._position = p
+        return None
+
+    def skipUntil(self, chars):
+        p = self.position
+        while p < len(self):
+            c = self[p:p + 1]
+            if c in chars:
+                self._position = p
+                return c
+            p += 1
+        self._position = p
+        return None
+
+    def matchBytes(self, bytes):
+        """Look for a sequence of bytes at the start of a string. If the bytes
+        are found return True and advance the position to the byte after the
+        match. Otherwise return False and leave the position alone"""
+        p = self.position
+        data = self[p:p + len(bytes)]
+        rv = data.startswith(bytes)
+        if rv:
+            self.position += len(bytes)
+        return rv
+
+    def jumpTo(self, bytes):
+        """Look for the next sequence of bytes matching a given sequence. If
+        a match is found advance the position to the last byte of the match"""
+        newPosition = self[self.position:].find(bytes)
+        if newPosition > -1:
+            # XXX: This is ugly, but I can't see a nicer way to fix this.
+            if self._position == -1:
+                self._position = 0
+            self._position += (newPosition + len(bytes) - 1)
+            return True
+        else:
+            raise StopIteration
+
+
+class EncodingParser(object):
+    """Mini parser for detecting character encoding from meta elements"""
+
+    def __init__(self, data):
+        """string - the data to work on for encoding detection"""
+        self.data = EncodingBytes(data)
+        self.encoding = None
+
+    def getEncoding(self):
+        methodDispatch = (
+            (b"<!--", self.handleComment),
+            (b"<meta", self.handleMeta),
+            (b"</", self.handlePossibleEndTag),
+            (b"<!", self.handleOther),
+            (b"<?", self.handleOther),
+            (b"<", self.handlePossibleStartTag))
+        for _ in self.data:
+            keepParsing = True
+            for key, method in methodDispatch:
+                if self.data.matchBytes(key):
+                    try:
+                        keepParsing = method()
+                        break
+                    except StopIteration:
+                        keepParsing = False
+                        break
+            if not keepParsing:
+                break
+
+        return self.encoding
+
+    def handleComment(self):
+        """Skip over comments"""
+        return self.data.jumpTo(b"-->")
+
+    def handleMeta(self):
+        if self.data.currentByte not in spaceCharactersBytes:
+            # if we have <meta not followed by a space so just keep going
+            return True
+        # We have a valid meta element we want to search for attributes
+        hasPragma = False
+        pendingEncoding = None
+        while True:
+            # Try to find the next attribute after the current position
+            attr = self.getAttribute()
+            if attr is None:
+                return True
+            else:
+                if attr[0] == b"http-equiv":
+                    hasPragma = attr[1] == b"content-type"
+                    if hasPragma and pendingEncoding is not None:
+                        self.encoding = pendingEncoding
+                        return False
+                elif attr[0] == b"charset":
+                    tentativeEncoding = attr[1]
+                    codec = lookupEncoding(tentativeEncoding)
+                    if codec is not None:
+                        self.encoding = codec
+                        return False
+                elif attr[0] == b"content":
+                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
+                    tentativeEncoding = contentParser.parse()
+                    if tentativeEncoding is not None:
+                        codec = lookupEncoding(tentativeEncoding)
+                        if codec is not None:
+                            if hasPragma:
+                                self.encoding = codec
+                                return False
+                            else:
+                                pendingEncoding = codec
+
+    def handlePossibleStartTag(self):
+        return self.handlePossibleTag(False)
+
+    def handlePossibleEndTag(self):
+        next(self.data)
+        return self.handlePossibleTag(True)
+
+    def handlePossibleTag(self, endTag):
+        data = self.data
+        if data.currentByte not in asciiLettersBytes:
+            # If the next byte is not an ascii letter either ignore this
+            # fragment (possible start tag case) or treat it according to
+            # handleOther
+            if endTag:
+                data.previous()
+                self.handleOther()
+            return True
+
+        c = data.skipUntil(spacesAngleBrackets)
+        if c == b"<":
+            # return to the first step in the overall "two step" algorithm
+            # reprocessing the < byte
+            data.previous()
+        else:
+            # Read all attributes
+            attr = self.getAttribute()
+            while attr is not None:
+                attr = self.getAttribute()
+        return True
+
+    def handleOther(self):
+        return self.data.jumpTo(b">")
+
+    def getAttribute(self):
+        """Return a name,value pair for the next attribute in the stream,
+        if one is found, or None"""
+        data = self.data
+        # Step 1 (skip chars)
+        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
+        assert c is None or len(c) == 1
+        # Step 2
+        if c in (b">", None):
+            return None
+        # Step 3
+        attrName = []
+        attrValue = []
+        # Step 4 attribute name
+        while True:
+            if c == b"=" and attrName:
+                break
+            elif c in spaceCharactersBytes:
+                # Step 6!
+                c = data.skip()
+                break
+            elif c in (b"/", b">"):
+                return b"".join(attrName), b""
+            elif c in asciiUppercaseBytes:
+                attrName.append(c.lower())
+            elif c is None:
+                return None
+            else:
+                attrName.append(c)
+            # Step 5
+            c = next(data)
+        # Step 7
+        if c != b"=":
+            data.previous()
+            return b"".join(attrName), b""
+        # Step 8
+        next(data)
+        # Step 9
+        c = data.skip()
+        # Step 10
+        if c in (b"'", b'"'):
+            # 10.1
+            quoteChar = c
+            while True:
+                # 10.2
+                c = next(data)
+                # 10.3
+                if c == quoteChar:
+                    next(data)
+                    return b"".join(attrName), b"".join(attrValue)
+                # 10.4
+                elif c in asciiUppercaseBytes:
+                    attrValue.append(c.lower())
+                # 10.5
+                else:
+                    attrValue.append(c)
+        elif c == b">":
+            return b"".join(attrName), b""
+        elif c in asciiUppercaseBytes:
+            attrValue.append(c.lower())
+        elif c is None:
+            return None
+        else:
+            attrValue.append(c)
+        # Step 11
+        while True:
+            c = next(data)
+            if c in spacesAngleBrackets:
+                return b"".join(attrName), b"".join(attrValue)
+            elif c in asciiUppercaseBytes:
+                attrValue.append(c.lower())
+            elif c is None:
+                return None
+            else:
+                attrValue.append(c)
+
+
+class ContentAttrParser(object):
+    def __init__(self, data):
+        assert isinstance(data, bytes)
+        self.data = data
+
+    def parse(self):
+        try:
+            # Check if the attr name is charset
+            # otherwise return
+            self.data.jumpTo(b"charset")
+            self.data.position += 1
+            self.data.skip()
+            if not self.data.currentByte == b"=":
+                # If there is no = sign keep looking for attrs
+                return None
+            self.data.position += 1
+            self.data.skip()
+            # Look for an encoding between matching quote marks
+            if self.data.currentByte in (b'"', b"'"):
+                quoteMark = self.data.currentByte
+                self.data.position += 1
+                oldPosition = self.data.position
+                if self.data.jumpTo(quoteMark):
+                    return self.data[oldPosition:self.data.position]
+                else:
+                    return None
+            else:
+                # Unquoted value
+                oldPosition = self.data.position
+                try:
+                    self.data.skipUntil(spaceCharactersBytes)
+                    return self.data[oldPosition:self.data.position]
+                except StopIteration:
+                    # Return the whole remaining value
+                    return self.data[oldPosition:]
+        except StopIteration:
+            return None
+
+
+def lookupEncoding(encoding):
+    """Return the python codec name corresponding to an encoding or None if the
+    string doesn't correspond to a valid encoding."""
+    if isinstance(encoding, binary_type):
+        try:
+            encoding = encoding.decode("ascii")
+        except UnicodeDecodeError:
+            return None
+
+    if encoding is not None:
+        try:
+            return webencodings.lookup(encoding)
+        except AttributeError:
+            return None
+    else:
+        return None
diff --git a/html_cleaner/html5lib/_tokenizer.py b/html_cleaner/html5lib/_tokenizer.py
new file mode 100644
index 0000000..6078f66
--- /dev/null
+++ b/html_cleaner/html5lib/_tokenizer.py
@@ -0,0 +1,1721 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from six import unichr as chr
+
+from collections import deque
+
+from .constants import spaceCharacters
+from .constants import entities
+from .constants import asciiLetters, asciiUpper2Lower
+from .constants import digits, hexDigits, EOF
+from .constants import tokenTypes, tagTokenTypes
+from .constants import replacementCharacters
+
+from ._inputstream import HTMLInputStream
+
+from ._trie import Trie
+
+entitiesTrie = Trie(entities)
+
+
+class HTMLTokenizer(object):
+    """ This class takes care of tokenizing HTML.
+
+    * self.currentToken
+      Holds the token that is currently being processed.
+
+    * self.state
+      Holds a reference to the method to be invoked... XXX
+
+    * self.stream
+      Points to HTMLInputStream object.
+    """
+
+    def __init__(self, stream, parser=None, **kwargs):
+
+        self.stream = HTMLInputStream(stream, **kwargs)
+        self.parser = parser
+
+        # Setup the initial tokenizer state
+        self.escapeFlag = False
+        self.lastFourChars = []
+        self.state = self.dataState
+        self.escape = False
+
+        # The current token being created
+        self.currentToken = None
+        super(HTMLTokenizer, self).__init__()
+
+    def __iter__(self):
+        """ This is where the magic happens.
+
+        We do our usually processing through the states and when we have a token
+        to return we yield the token which pauses processing until the next token
+        is requested.
+        """
+        self.tokenQueue = deque([])
+        # Start processing. When EOF is reached self.state will return False
+        # instead of True and the loop will terminate.
+        while self.state():
+            while self.stream.errors:
+                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
+            while self.tokenQueue:
+                yield self.tokenQueue.popleft()
+
+    def consumeNumberEntity(self, isHex):
+        """This function returns either U+FFFD or the character based on the
+        decimal or hexadecimal representation. It also discards ";" if present.
+        If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
+        """
+
+        allowed = digits
+        radix = 10
+        if isHex:
+            allowed = hexDigits
+            radix = 16
+
+        charStack = []
+
+        # Consume all the characters that are in range while making sure we
+        # don't hit an EOF.
+        c = self.stream.char()
+        while c in allowed and c is not EOF:
+            charStack.append(c)
+            c = self.stream.char()
+
+        # Convert the set of characters consumed to an int.
+        charAsInt = int("".join(charStack), radix)
+
+        # Certain characters get replaced with others
+        if charAsInt in replacementCharacters:
+            char = replacementCharacters[charAsInt]
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "illegal-codepoint-for-numeric-entity",
+                                    "datavars": {"charAsInt": charAsInt}})
+        elif ((0xD800 <= charAsInt <= 0xDFFF) or
+              (charAsInt > 0x10FFFF)):
+            char = "\uFFFD"
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "illegal-codepoint-for-numeric-entity",
+                                    "datavars": {"charAsInt": charAsInt}})
+        else:
+            # Should speed up this check somehow (e.g. move the set to a constant)
+            if ((0x0001 <= charAsInt <= 0x0008) or
+                (0x000E <= charAsInt <= 0x001F) or
+                (0x007F <= charAsInt <= 0x009F) or
+                (0xFDD0 <= charAsInt <= 0xFDEF) or
+                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
+                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
+                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
+                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
+                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
+                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
+                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data":
+                                        "illegal-codepoint-for-numeric-entity",
+                                        "datavars": {"charAsInt": charAsInt}})
+            try:
+                # Try/except needed as UCS-2 Python builds' unichar only works
+                # within the BMP.
+                char = chr(charAsInt)
+            except ValueError:
+                v = charAsInt - 0x10000
+                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))
+
+        # Discard the ; if present. Otherwise, put it back on the queue and
+        # invoke parseError on parser.
+        if c != ";":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "numeric-entity-without-semicolon"})
+            self.stream.unget(c)
+
+        return char
+
+    def consumeEntity(self, allowedChar=None, fromAttribute=False):
+        # Initialise to the default output for when no entity is matched
+        output = "&"
+
+        charStack = [self.stream.char()]
+        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
+                (allowedChar is not None and allowedChar == charStack[0])):
+            self.stream.unget(charStack[0])
+
+        elif charStack[0] == "#":
+            # Read the next character to see if it's hex or decimal
+            hex = False
+            charStack.append(self.stream.char())
+            if charStack[-1] in ("x", "X"):
+                hex = True
+                charStack.append(self.stream.char())
+
+            # charStack[-1] should be the first digit
+            if (hex and charStack[-1] in hexDigits) \
+                    or (not hex and charStack[-1] in digits):
+                # At least one digit found, so consume the whole number
+                self.stream.unget(charStack[-1])
+                output = self.consumeNumberEntity(hex)
+            else:
+                # No digits found
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data": "expected-numeric-entity"})
+                self.stream.unget(charStack.pop())
+                output = "&" + "".join(charStack)
+
+        else:
+            # At this point in the process might have named entity. Entities
+            # are stored in the global variable "entities".
+            #
+            # Consume characters and compare to these to a substring of the
+            # entity names in the list until the substring no longer matches.
+            while (charStack[-1] is not EOF):
+                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
+                    break
+                charStack.append(self.stream.char())
+
+            # At this point we have a string that starts with some characters
+            # that may match an entity
+            # Try to find the longest entity the string will match to take care
+            # of &noti for instance.
+            try:
+                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
+                entityLength = len(entityName)
+            except KeyError:
+                entityName = None
+
+            if entityName is not None:
+                if entityName[-1] != ";":
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                            "named-entity-without-semicolon"})
+                if (entityName[-1] != ";" and fromAttribute and
+                    (charStack[entityLength] in asciiLetters or
+                     charStack[entityLength] in digits or
+                     charStack[entityLength] == "=")):
+                    self.stream.unget(charStack.pop())
+                    output = "&" + "".join(charStack)
+                else:
+                    output = entities[entityName]
+                    self.stream.unget(charStack.pop())
+                    output += "".join(charStack[entityLength:])
+            else:
+                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                        "expected-named-entity"})
+                self.stream.unget(charStack.pop())
+                output = "&" + "".join(charStack)
+
+        if fromAttribute:
+            self.currentToken["data"][-1][1] += output
+        else:
+            if output in spaceCharacters:
+                tokenType = "SpaceCharacters"
+            else:
+                tokenType = "Characters"
+            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
+
+    def processEntityInAttribute(self, allowedChar):
+        """This method replaces the need for "entityInAttributeValueState".
+        """
+        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
+
+    def emitCurrentToken(self):
+        """This method is a generic handler for emitting the tags. It also sets
+        the state to "data" because that's what's needed after a token has been
+        emitted.
+        """
+        token = self.currentToken
+        # Add token to the queue to be yielded
+        if (token["type"] in tagTokenTypes):
+            token["name"] = token["name"].translate(asciiUpper2Lower)
+            if token["type"] == tokenTypes["EndTag"]:
+                if token["data"]:
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                            "data": "attributes-in-end-tag"})
+                if token["selfClosing"]:
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                            "data": "self-closing-flag-on-end-tag"})
+        self.tokenQueue.append(token)
+        self.state = self.dataState
+
+    # Below are the various tokenizer states worked out.
+    def dataState(self):
+        data = self.stream.char()
+        if data == "&":
+            self.state = self.entityDataState
+        elif data == "<":
+            self.state = self.tagOpenState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\u0000"})
+        elif data is EOF:
+            # Tokenization ends.
+            return False
+        elif data in spaceCharacters:
+            # Directly after emitting a token you switch back to the "data
+            # state". At that point spaceCharacters are important so they are
+            # emitted separately.
+            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
+                                    data + self.stream.charsUntil(spaceCharacters, True)})
+            # No need to update lastFourChars here, since the first space will
+            # have already been appended to lastFourChars and will have broken
+            # any <!-- or --> sequences
+        else:
+            chars = self.stream.charsUntil(("&", "<", "\u0000"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + chars})
+        return True
+
+    def entityDataState(self):
+        self.consumeEntity()
+        self.state = self.dataState
+        return True
+
+    def rcdataState(self):
+        data = self.stream.char()
+        if data == "&":
+            self.state = self.characterReferenceInRcdata
+        elif data == "<":
+            self.state = self.rcdataLessThanSignState
+        elif data == EOF:
+            # Tokenization ends.
+            return False
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+        elif data in spaceCharacters:
+            # Directly after emitting a token you switch back to the "data
+            # state". At that point spaceCharacters are important so they are
+            # emitted separately.
+            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
+                                    data + self.stream.charsUntil(spaceCharacters, True)})
+            # No need to update lastFourChars here, since the first space will
+            # have already been appended to lastFourChars and will have broken
+            # any <!-- or --> sequences
+        else:
+            chars = self.stream.charsUntil(("&", "<", "\u0000"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + chars})
+        return True
+
+    def characterReferenceInRcdata(self):
+        self.consumeEntity()
+        self.state = self.rcdataState
+        return True
+
+    def rawtextState(self):
+        data = self.stream.char()
+        if data == "<":
+            self.state = self.rawtextLessThanSignState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+        elif data == EOF:
+            # Tokenization ends.
+            return False
+        else:
+            chars = self.stream.charsUntil(("<", "\u0000"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + chars})
+        return True
+
+    def scriptDataState(self):
+        data = self.stream.char()
+        if data == "<":
+            self.state = self.scriptDataLessThanSignState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+        elif data == EOF:
+            # Tokenization ends.
+            return False
+        else:
+            chars = self.stream.charsUntil(("<", "\u0000"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + chars})
+        return True
+
+    def plaintextState(self):
+        data = self.stream.char()
+        if data == EOF:
+            # Tokenization ends.
+            return False
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + self.stream.charsUntil("\u0000")})
+        return True
+
+    def tagOpenState(self):
+        data = self.stream.char()
+        if data == "!":
+            self.state = self.markupDeclarationOpenState
+        elif data == "/":
+            self.state = self.closeTagOpenState
+        elif data in asciiLetters:
+            self.currentToken = {"type": tokenTypes["StartTag"],
+                                 "name": data, "data": [],
+                                 "selfClosing": False,
+                                 "selfClosingAcknowledged": False}
+            self.state = self.tagNameState
+        elif data == ">":
+            # XXX In theory it could be something besides a tag name. But
+            # do we really care?
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-tag-name-but-got-right-bracket"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
+            self.state = self.dataState
+        elif data == "?":
+            # XXX In theory it could be something besides a tag name. But
+            # do we really care?
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-tag-name-but-got-question-mark"})
+            self.stream.unget(data)
+            self.state = self.bogusCommentState
+        else:
+            # XXX
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-tag-name"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.stream.unget(data)
+            self.state = self.dataState
+        return True
+
+    def closeTagOpenState(self):
+        data = self.stream.char()
+        if data in asciiLetters:
+            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
+                                 "data": [], "selfClosing": False}
+            self.state = self.tagNameState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-closing-tag-but-got-right-bracket"})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-closing-tag-but-got-eof"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+            self.state = self.dataState
+        else:
+            # XXX data can be _'_...
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-closing-tag-but-got-char",
+                                    "datavars": {"data": data}})
+            self.stream.unget(data)
+            self.state = self.bogusCommentState
+        return True
+
+    def tagNameState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeAttributeNameState
+        elif data == ">":
+            self.emitCurrentToken()
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-tag-name"})
+            self.state = self.dataState
+        elif data == "/":
+            self.state = self.selfClosingStartTagState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["name"] += "\uFFFD"
+        else:
+            self.currentToken["name"] += data
+            # (Don't use charsUntil here, because tag names are
+            # very short and it's faster to not do anything fancy)
+        return True
+
+    def rcdataLessThanSignState(self):
+        data = self.stream.char()
+        if data == "/":
+            self.temporaryBuffer = ""
+            self.state = self.rcdataEndTagOpenState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.stream.unget(data)
+            self.state = self.rcdataState
+        return True
+
+    def rcdataEndTagOpenState(self):
+        data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            self.state = self.rcdataEndTagNameState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+            self.stream.unget(data)
+            self.state = self.rcdataState
+        return True
+
+    def rcdataEndTagNameState(self):
+        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+        data = self.stream.char()
+        if data in spaceCharacters and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.beforeAttributeNameState
+        elif data == "/" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.selfClosingStartTagState
+        elif data == ">" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.emitCurrentToken()
+            self.state = self.dataState
+        elif data in asciiLetters:
+            self.temporaryBuffer += data
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "</" + self.temporaryBuffer})
+            self.stream.unget(data)
+            self.state = self.rcdataState
+        return True
+
+    def rawtextLessThanSignState(self):
+        data = self.stream.char()
+        if data == "/":
+            self.temporaryBuffer = ""
+            self.state = self.rawtextEndTagOpenState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.stream.unget(data)
+            self.state = self.rawtextState
+        return True
+
+    def rawtextEndTagOpenState(self):
+        data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            self.state = self.rawtextEndTagNameState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+            self.stream.unget(data)
+            self.state = self.rawtextState
+        return True
+
+    def rawtextEndTagNameState(self):
+        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+        data = self.stream.char()
+        if data in spaceCharacters and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.beforeAttributeNameState
+        elif data == "/" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.selfClosingStartTagState
+        elif data == ">" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.emitCurrentToken()
+            self.state = self.dataState
+        elif data in asciiLetters:
+            self.temporaryBuffer += data
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "</" + self.temporaryBuffer})
+            self.stream.unget(data)
+            self.state = self.rawtextState
+        return True
+
+    def scriptDataLessThanSignState(self):
+        data = self.stream.char()
+        if data == "/":
+            self.temporaryBuffer = ""
+            self.state = self.scriptDataEndTagOpenState
+        elif data == "!":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
+            self.state = self.scriptDataEscapeStartState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.stream.unget(data)
+            self.state = self.scriptDataState
+        return True
+
+    def scriptDataEndTagOpenState(self):
+        data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer += data
+            self.state = self.scriptDataEndTagNameState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+            self.stream.unget(data)
+            self.state = self.scriptDataState
+        return True
+
+    def scriptDataEndTagNameState(self):
+        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+        data = self.stream.char()
+        if data in spaceCharacters and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.beforeAttributeNameState
+        elif data == "/" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.selfClosingStartTagState
+        elif data == ">" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.emitCurrentToken()
+            self.state = self.dataState
+        elif data in asciiLetters:
+            self.temporaryBuffer += data
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "</" + self.temporaryBuffer})
+            self.stream.unget(data)
+            self.state = self.scriptDataState
+        return True
+
+    def scriptDataEscapeStartState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+            self.state = self.scriptDataEscapeStartDashState
+        else:
+            self.stream.unget(data)
+            self.state = self.scriptDataState
+        return True
+
+    def scriptDataEscapeStartDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+            self.state = self.scriptDataEscapedDashDashState
+        else:
+            self.stream.unget(data)
+            self.state = self.scriptDataState
+        return True
+
+    def scriptDataEscapedState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+            self.state = self.scriptDataEscapedDashState
+        elif data == "<":
+            self.state = self.scriptDataEscapedLessThanSignState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+        elif data == EOF:
+            self.state = self.dataState
+        else:
+            chars = self.stream.charsUntil(("<", "-", "\u0000"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
+                                    data + chars})
+        return True
+
+    def scriptDataEscapedDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+            self.state = self.scriptDataEscapedDashDashState
+        elif data == "<":
+            self.state = self.scriptDataEscapedLessThanSignState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+            self.state = self.scriptDataEscapedState
+        elif data == EOF:
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            self.state = self.scriptDataEscapedState
+        return True
+
+    def scriptDataEscapedDashDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+        elif data == "<":
+            self.state = self.scriptDataEscapedLessThanSignState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
+            self.state = self.scriptDataState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+            self.state = self.scriptDataEscapedState
+        elif data == EOF:
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            self.state = self.scriptDataEscapedState
+        return True
+
+    def scriptDataEscapedLessThanSignState(self):
+        data = self.stream.char()
+        if data == "/":
+            self.temporaryBuffer = ""
+            self.state = self.scriptDataEscapedEndTagOpenState
+        elif data in asciiLetters:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
+            self.temporaryBuffer = data
+            self.state = self.scriptDataDoubleEscapeStartState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.stream.unget(data)
+            self.state = self.scriptDataEscapedState
+        return True
+
+    def scriptDataEscapedEndTagOpenState(self):
+        data = self.stream.char()
+        if data in asciiLetters:
+            self.temporaryBuffer = data
+            self.state = self.scriptDataEscapedEndTagNameState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
+            self.stream.unget(data)
+            self.state = self.scriptDataEscapedState
+        return True
+
+    def scriptDataEscapedEndTagNameState(self):
+        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
+        data = self.stream.char()
+        if data in spaceCharacters and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.beforeAttributeNameState
+        elif data == "/" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.state = self.selfClosingStartTagState
+        elif data == ">" and appropriate:
+            self.currentToken = {"type": tokenTypes["EndTag"],
+                                 "name": self.temporaryBuffer,
+                                 "data": [], "selfClosing": False}
+            self.emitCurrentToken()
+            self.state = self.dataState
+        elif data in asciiLetters:
+            self.temporaryBuffer += data
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "</" + self.temporaryBuffer})
+            self.stream.unget(data)
+            self.state = self.scriptDataEscapedState
+        return True
+
+    def scriptDataDoubleEscapeStartState(self):
+        data = self.stream.char()
+        if data in (spaceCharacters | frozenset(("/", ">"))):
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            if self.temporaryBuffer.lower() == "script":
+                self.state = self.scriptDataDoubleEscapedState
+            else:
+                self.state = self.scriptDataEscapedState
+        elif data in asciiLetters:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            self.temporaryBuffer += data
+        else:
+            self.stream.unget(data)
+            self.state = self.scriptDataEscapedState
+        return True
+
+    def scriptDataDoubleEscapedState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+            self.state = self.scriptDataDoubleEscapedDashState
+        elif data == "<":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.state = self.scriptDataDoubleEscapedLessThanSignState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+        elif data == EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-script-in-script"})
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+        return True
+
+    def scriptDataDoubleEscapedDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+            self.state = self.scriptDataDoubleEscapedDashDashState
+        elif data == "<":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.state = self.scriptDataDoubleEscapedLessThanSignState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+            self.state = self.scriptDataDoubleEscapedState
+        elif data == EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-script-in-script"})
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            self.state = self.scriptDataDoubleEscapedState
+        return True
+
+    def scriptDataDoubleEscapedDashDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
+        elif data == "<":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
+            self.state = self.scriptDataDoubleEscapedLessThanSignState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
+            self.state = self.scriptDataState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": "\uFFFD"})
+            self.state = self.scriptDataDoubleEscapedState
+        elif data == EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-script-in-script"})
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            self.state = self.scriptDataDoubleEscapedState
+        return True
+
+    def scriptDataDoubleEscapedLessThanSignState(self):
+        data = self.stream.char()
+        if data == "/":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
+            self.temporaryBuffer = ""
+            self.state = self.scriptDataDoubleEscapeEndState
+        else:
+            self.stream.unget(data)
+            self.state = self.scriptDataDoubleEscapedState
+        return True
+
+    def scriptDataDoubleEscapeEndState(self):
+        data = self.stream.char()
+        if data in (spaceCharacters | frozenset(("/", ">"))):
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            if self.temporaryBuffer.lower() == "script":
+                self.state = self.scriptDataEscapedState
+            else:
+                self.state = self.scriptDataDoubleEscapedState
+        elif data in asciiLetters:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
+            self.temporaryBuffer += data
+        else:
+            self.stream.unget(data)
+            self.state = self.scriptDataDoubleEscapedState
+        return True
+
+    def beforeAttributeNameState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.stream.charsUntil(spaceCharacters, True)
+        elif data in asciiLetters:
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        elif data == ">":
+            self.emitCurrentToken()
+        elif data == "/":
+            self.state = self.selfClosingStartTagState
+        elif data in ("'", '"', "=", "<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "invalid-character-in-attribute-name"})
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"].append(["\uFFFD", ""])
+            self.state = self.attributeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-attribute-name-but-got-eof"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        return True
+
+    def attributeNameState(self):
+        data = self.stream.char()
+        leavingThisState = True
+        emitToken = False
+        if data == "=":
+            self.state = self.beforeAttributeValueState
+        elif data in asciiLetters:
+            self.currentToken["data"][-1][0] += data +\
+                self.stream.charsUntil(asciiLetters, True)
+            leavingThisState = False
+        elif data == ">":
+            # XXX If we emit here the attributes are converted to a dict
+            # without being checked and when the code below runs we error
+            # because data is a dict not a list
+            emitToken = True
+        elif data in spaceCharacters:
+            self.state = self.afterAttributeNameState
+        elif data == "/":
+            self.state = self.selfClosingStartTagState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"][-1][0] += "\uFFFD"
+            leavingThisState = False
+        elif data in ("'", '"', "<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data":
+                                    "invalid-character-in-attribute-name"})
+            self.currentToken["data"][-1][0] += data
+            leavingThisState = False
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "eof-in-attribute-name"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"][-1][0] += data
+            leavingThisState = False
+
+        if leavingThisState:
+            # Attributes are not dropped at this stage. That happens when the
+            # start tag token is emitted so values can still be safely appended
+            # to attributes, but we do want to report the parse error in time.
+            self.currentToken["data"][-1][0] = (
+                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
+            for name, _ in self.currentToken["data"][:-1]:
+                if self.currentToken["data"][-1][0] == name:
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                            "duplicate-attribute"})
+                    break
+            # XXX Fix for above XXX
+            if emitToken:
+                self.emitCurrentToken()
+        return True
+
+    def afterAttributeNameState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.stream.charsUntil(spaceCharacters, True)
+        elif data == "=":
+            self.state = self.beforeAttributeValueState
+        elif data == ">":
+            self.emitCurrentToken()
+        elif data in asciiLetters:
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        elif data == "/":
+            self.state = self.selfClosingStartTagState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"].append(["\uFFFD", ""])
+            self.state = self.attributeNameState
+        elif data in ("'", '"', "<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "invalid-character-after-attribute-name"})
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-end-of-tag-but-got-eof"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        return True
+
+    def beforeAttributeValueState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.stream.charsUntil(spaceCharacters, True)
+        elif data == "\"":
+            self.state = self.attributeValueDoubleQuotedState
+        elif data == "&":
+            self.state = self.attributeValueUnQuotedState
+            self.stream.unget(data)
+        elif data == "'":
+            self.state = self.attributeValueSingleQuotedState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-attribute-value-but-got-right-bracket"})
+            self.emitCurrentToken()
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"][-1][1] += "\uFFFD"
+            self.state = self.attributeValueUnQuotedState
+        elif data in ("=", "<", "`"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "equals-in-unquoted-attribute-value"})
+            self.currentToken["data"][-1][1] += data
+            self.state = self.attributeValueUnQuotedState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-attribute-value-but-got-eof"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"][-1][1] += data
+            self.state = self.attributeValueUnQuotedState
+        return True
+
+    def attributeValueDoubleQuotedState(self):
+        data = self.stream.char()
+        if data == "\"":
+            self.state = self.afterAttributeValueState
+        elif data == "&":
+            self.processEntityInAttribute('"')
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"][-1][1] += "\uFFFD"
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-attribute-value-double-quote"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"][-1][1] += data +\
+                self.stream.charsUntil(("\"", "&", "\u0000"))
+        return True
+
+    def attributeValueSingleQuotedState(self):
+        data = self.stream.char()
+        if data == "'":
+            self.state = self.afterAttributeValueState
+        elif data == "&":
+            self.processEntityInAttribute("'")
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"][-1][1] += "\uFFFD"
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-attribute-value-single-quote"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"][-1][1] += data +\
+                self.stream.charsUntil(("'", "&", "\u0000"))
+        return True
+
+    def attributeValueUnQuotedState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeAttributeNameState
+        elif data == "&":
+            self.processEntityInAttribute(">")
+        elif data == ">":
+            self.emitCurrentToken()
+        elif data in ('"', "'", "=", "<", "`"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-character-in-unquoted-attribute-value"})
+            self.currentToken["data"][-1][1] += data
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"][-1][1] += "\uFFFD"
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-attribute-value-no-quotes"})
+            self.state = self.dataState
+        else:
+            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
+                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
+        return True
+
+    def afterAttributeValueState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeAttributeNameState
+        elif data == ">":
+            self.emitCurrentToken()
+        elif data == "/":
+            self.state = self.selfClosingStartTagState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-EOF-after-attribute-value"})
+            self.stream.unget(data)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-character-after-attribute-value"})
+            self.stream.unget(data)
+            self.state = self.beforeAttributeNameState
+        return True
+
+    def selfClosingStartTagState(self):
+        data = self.stream.char()
+        if data == ">":
+            self.currentToken["selfClosing"] = True
+            self.emitCurrentToken()
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data":
+                                    "unexpected-EOF-after-solidus-in-tag"})
+            self.stream.unget(data)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-character-after-solidus-in-tag"})
+            self.stream.unget(data)
+            self.state = self.beforeAttributeNameState
+        return True
+
+    def bogusCommentState(self):
+        # Make a new comment token and give it as value all the characters
+        # until the first > or EOF (charsUntil checks for EOF automatically)
+        # and emit it.
+        data = self.stream.charsUntil(">")
+        data = data.replace("\u0000", "\uFFFD")
+        self.tokenQueue.append(
+            {"type": tokenTypes["Comment"], "data": data})
+
+        # Eat the character directly after the bogus comment which is either a
+        # ">" or an EOF.
+        self.stream.char()
+        self.state = self.dataState
+        return True
+
+    def markupDeclarationOpenState(self):
+        charStack = [self.stream.char()]
+        if charStack[-1] == "-":
+            charStack.append(self.stream.char())
+            if charStack[-1] == "-":
+                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
+                self.state = self.commentStartState
+                return True
+        elif charStack[-1] in ('d', 'D'):
+            matched = True
+            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
+                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
+                charStack.append(self.stream.char())
+                if charStack[-1] not in expected:
+                    matched = False
+                    break
+            if matched:
+                self.currentToken = {"type": tokenTypes["Doctype"],
+                                     "name": "",
+                                     "publicId": None, "systemId": None,
+                                     "correct": True}
+                self.state = self.doctypeState
+                return True
+        elif (charStack[-1] == "[" and
+              self.parser is not None and
+              self.parser.tree.openElements and
+              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
+            matched = True
+            for expected in ["C", "D", "A", "T", "A", "["]:
+                charStack.append(self.stream.char())
+                if charStack[-1] != expected:
+                    matched = False
+                    break
+            if matched:
+                self.state = self.cdataSectionState
+                return True
+
+        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                "expected-dashes-or-doctype"})
+
+        while charStack:
+            self.stream.unget(charStack.pop())
+        self.state = self.bogusCommentState
+        return True
+
+    def commentStartState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.state = self.commentStartDashState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"] += "\uFFFD"
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "incorrect-comment"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-comment"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += data
+            self.state = self.commentState
+        return True
+
+    def commentStartDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.state = self.commentEndState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"] += "-\uFFFD"
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "incorrect-comment"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-comment"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += "-" + data
+            self.state = self.commentState
+        return True
+
+    def commentState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.state = self.commentEndDashState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"] += "\uFFFD"
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "eof-in-comment"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += data + \
+                self.stream.charsUntil(("-", "\u0000"))
+        return True
+
+    def commentEndDashState(self):
+        data = self.stream.char()
+        if data == "-":
+            self.state = self.commentEndState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"] += "-\uFFFD"
+            self.state = self.commentState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-comment-end-dash"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += "-" + data
+            self.state = self.commentState
+        return True
+
+    def commentEndState(self):
+        data = self.stream.char()
+        if data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"] += "--\uFFFD"
+            self.state = self.commentState
+        elif data == "!":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-bang-after-double-dash-in-comment"})
+            self.state = self.commentEndBangState
+        elif data == "-":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-dash-after-double-dash-in-comment"})
+            self.currentToken["data"] += data
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-comment-double-dash"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            # XXX
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-comment"})
+            self.currentToken["data"] += "--" + data
+            self.state = self.commentState
+        return True
+
+    def commentEndBangState(self):
+        data = self.stream.char()
+        if data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == "-":
+            self.currentToken["data"] += "--!"
+            self.state = self.commentEndDashState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["data"] += "--!\uFFFD"
+            self.state = self.commentState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-comment-end-bang-state"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += "--!" + data
+            self.state = self.commentState
+        return True
+
+    def doctypeState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeDoctypeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-doctype-name-but-got-eof"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "need-space-after-doctype"})
+            self.stream.unget(data)
+            self.state = self.beforeDoctypeNameState
+        return True
+
+    def beforeDoctypeNameState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            pass
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-doctype-name-but-got-right-bracket"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["name"] = "\uFFFD"
+            self.state = self.doctypeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-doctype-name-but-got-eof"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["name"] = data
+            self.state = self.doctypeNameState
+        return True
+
+    def doctypeNameState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+            self.state = self.afterDoctypeNameState
+        elif data == ">":
+            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["name"] += "\uFFFD"
+            self.state = self.doctypeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype-name"})
+            self.currentToken["correct"] = False
+            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["name"] += data
+        return True
+
+    def afterDoctypeNameState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            pass
+        elif data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.currentToken["correct"] = False
+            self.stream.unget(data)
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            if data in ("p", "P"):
+                matched = True
+                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
+                                 ("i", "I"), ("c", "C")):
+                    data = self.stream.char()
+                    if data not in expected:
+                        matched = False
+                        break
+                if matched:
+                    self.state = self.afterDoctypePublicKeywordState
+                    return True
+            elif data in ("s", "S"):
+                matched = True
+                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
+                                 ("e", "E"), ("m", "M")):
+                    data = self.stream.char()
+                    if data not in expected:
+                        matched = False
+                        break
+                if matched:
+                    self.state = self.afterDoctypeSystemKeywordState
+                    return True
+
+            # All the characters read before the current 'data' will be
+            # [a-zA-Z], so they're garbage in the bogus doctype and can be
+            # discarded; only the latest character might be '>' or EOF
+            # and needs to be ungetted
+            self.stream.unget(data)
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "expected-space-or-right-bracket-in-doctype", "datavars":
+                                    {"data": data}})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
+
+        return True
+
+    def afterDoctypePublicKeywordState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeDoctypePublicIdentifierState
+        elif data in ("'", '"'):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.stream.unget(data)
+            self.state = self.beforeDoctypePublicIdentifierState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.stream.unget(data)
+            self.state = self.beforeDoctypePublicIdentifierState
+        return True
+
+    def beforeDoctypePublicIdentifierState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            pass
+        elif data == "\"":
+            self.currentToken["publicId"] = ""
+            self.state = self.doctypePublicIdentifierDoubleQuotedState
+        elif data == "'":
+            self.currentToken["publicId"] = ""
+            self.state = self.doctypePublicIdentifierSingleQuotedState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-end-of-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
+        return True
+
+    def doctypePublicIdentifierDoubleQuotedState(self):
+        data = self.stream.char()
+        if data == "\"":
+            self.state = self.afterDoctypePublicIdentifierState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["publicId"] += "\uFFFD"
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-end-of-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["publicId"] += data
+        return True
+
+    def doctypePublicIdentifierSingleQuotedState(self):
+        data = self.stream.char()
+        if data == "'":
+            self.state = self.afterDoctypePublicIdentifierState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["publicId"] += "\uFFFD"
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-end-of-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["publicId"] += data
+        return True
+
+    def afterDoctypePublicIdentifierState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
+        elif data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == '"':
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["systemId"] = ""
+            self.state = self.doctypeSystemIdentifierDoubleQuotedState
+        elif data == "'":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["systemId"] = ""
+            self.state = self.doctypeSystemIdentifierSingleQuotedState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
+        return True
+
+    def betweenDoctypePublicAndSystemIdentifiersState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            pass
+        elif data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == '"':
+            self.currentToken["systemId"] = ""
+            self.state = self.doctypeSystemIdentifierDoubleQuotedState
+        elif data == "'":
+            self.currentToken["systemId"] = ""
+            self.state = self.doctypeSystemIdentifierSingleQuotedState
+        elif data == EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
+        return True
+
+    def afterDoctypeSystemKeywordState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeDoctypeSystemIdentifierState
+        elif data in ("'", '"'):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.stream.unget(data)
+            self.state = self.beforeDoctypeSystemIdentifierState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.stream.unget(data)
+            self.state = self.beforeDoctypeSystemIdentifierState
+        return True
+
+    def beforeDoctypeSystemIdentifierState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            pass
+        elif data == "\"":
+            self.currentToken["systemId"] = ""
+            self.state = self.doctypeSystemIdentifierDoubleQuotedState
+        elif data == "'":
+            self.currentToken["systemId"] = ""
+            self.state = self.doctypeSystemIdentifierSingleQuotedState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
+        return True
+
+    def doctypeSystemIdentifierDoubleQuotedState(self):
+        data = self.stream.char()
+        if data == "\"":
+            self.state = self.afterDoctypeSystemIdentifierState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["systemId"] += "\uFFFD"
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-end-of-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["systemId"] += data
+        return True
+
+    def doctypeSystemIdentifierSingleQuotedState(self):
+        data = self.stream.char()
+        if data == "'":
+            self.state = self.afterDoctypeSystemIdentifierState
+        elif data == "\u0000":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                    "data": "invalid-codepoint"})
+            self.currentToken["systemId"] += "\uFFFD"
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-end-of-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["systemId"] += data
+        return True
+
+    def afterDoctypeSystemIdentifierState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            pass
+        elif data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                                    "unexpected-char-in-doctype"})
+            self.state = self.bogusDoctypeState
+        return True
+
+    def bogusDoctypeState(self):
+        data = self.stream.char()
+        if data == ">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data is EOF:
+            # XXX EMIT
+            self.stream.unget(data)
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            pass
+        return True
+
+    def cdataSectionState(self):
+        data = []
+        while True:
+            data.append(self.stream.charsUntil("]"))
+            data.append(self.stream.charsUntil(">"))
+            char = self.stream.char()
+            if char == EOF:
+                break
+            else:
+                assert char == ">"
+                if data[-1][-2:] == "]]":
+                    data[-1] = data[-1][:-2]
+                    break
+                else:
+                    data.append(char)
+
+        data = "".join(data)  # pylint:disable=redefined-variable-type
+        # Deal with null here rather than in the parser
+        nullCount = data.count("\u0000")
+        if nullCount > 0:
+            for _ in range(nullCount):
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                                        "data": "invalid-codepoint"})
+            data = data.replace("\u0000", "\uFFFD")
+        if data:
+            self.tokenQueue.append({"type": tokenTypes["Characters"],
+                                    "data": data})
+        self.state = self.dataState
+        return True
diff --git a/html_cleaner/html5lib/_trie/__init__.py b/html_cleaner/html5lib/_trie/__init__.py
new file mode 100644
index 0000000..a5ba4bf
--- /dev/null
+++ b/html_cleaner/html5lib/_trie/__init__.py
@@ -0,0 +1,14 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from .py import Trie as PyTrie
+
+Trie = PyTrie
+
+# pylint:disable=wrong-import-position
+try:
+    from .datrie import Trie as DATrie
+except ImportError:
+    pass
+else:
+    Trie = DATrie
+# pylint:enable=wrong-import-position
diff --git a/html_cleaner/html5lib/_trie/_base.py b/html_cleaner/html5lib/_trie/_base.py
new file mode 100644
index 0000000..25eece4
--- /dev/null
+++ b/html_cleaner/html5lib/_trie/_base.py
@@ -0,0 +1,38 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from collections import Mapping
+
+
+class Trie(Mapping):
+    """Abstract base class for tries"""
+
+    def keys(self, prefix=None):
+        # pylint:disable=arguments-differ
+        keys = super(Trie, self).keys()
+
+        if prefix is None:
+            return set(keys)
+
+        # Python 2.6: no set comprehensions
+        return set([x for x in keys if x.startswith(prefix)])
+
+    def has_keys_with_prefix(self, prefix):
+        for key in self.keys():
+            if key.startswith(prefix):
+                return True
+
+        return False
+
+    def longest_prefix(self, prefix):
+        if prefix in self:
+            return prefix
+
+        for i in range(1, len(prefix) + 1):
+            if prefix[:-i] in self:
+                return prefix[:-i]
+
+        raise KeyError(prefix)
+
+    def longest_prefix_item(self, prefix):
+        lprefix = self.longest_prefix(prefix)
+        return (lprefix, self[lprefix])
diff --git a/html_cleaner/html5lib/_trie/datrie.py b/html_cleaner/html5lib/_trie/datrie.py
new file mode 100644
index 0000000..51f3d04
--- /dev/null
+++ b/html_cleaner/html5lib/_trie/datrie.py
@@ -0,0 +1,44 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from datrie import Trie as DATrie
+from six import text_type
+
+from ._base import Trie as ABCTrie
+
+
+class Trie(ABCTrie):
+    def __init__(self, data):
+        chars = set()
+        for key in data.keys():
+            if not isinstance(key, text_type):
+                raise TypeError("All keys must be strings")
+            for char in key:
+                chars.add(char)
+
+        self._data = DATrie("".join(chars))
+        for key, value in data.items():
+            self._data[key] = value
+
+    def __contains__(self, key):
+        return key in self._data
+
+    def __len__(self):
+        return len(self._data)
+
+    def __iter__(self):
+        raise NotImplementedError()
+
+    def __getitem__(self, key):
+        return self._data[key]
+
+    def keys(self, prefix=None):
+        return self._data.keys(prefix)
+
+    def has_keys_with_prefix(self, prefix):
+        return self._data.has_keys_with_prefix(prefix)
+
+    def longest_prefix(self, prefix):
+        return self._data.longest_prefix(prefix)
+
+    def longest_prefix_item(self, prefix):
+        return self._data.longest_prefix_item(prefix)
diff --git a/html_cleaner/html5lib/_trie/py.py b/html_cleaner/html5lib/_trie/py.py
new file mode 100644
index 0000000..c2ba3da
--- /dev/null
+++ b/html_cleaner/html5lib/_trie/py.py
@@ -0,0 +1,67 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+from bisect import bisect_left
+
+from ._base import Trie as ABCTrie
+
+
+class Trie(ABCTrie):
+    def __init__(self, data):
+        if not all(isinstance(x, text_type) for x in data.keys()):
+            raise TypeError("All keys must be strings")
+
+        self._data = data
+        self._keys = sorted(data.keys())
+        self._cachestr = ""
+        self._cachepoints = (0, len(data))
+
+    def __contains__(self, key):
+        return key in self._data
+
+    def __len__(self):
+        return len(self._data)
+
+    def __iter__(self):
+        return iter(self._data)
+
+    def __getitem__(self, key):
+        return self._data[key]
+
+    def keys(self, prefix=None):
+        if prefix is None or prefix == "" or not self._keys:
+            return set(self._keys)
+
+        if prefix.startswith(self._cachestr):
+            lo, hi = self._cachepoints
+            start = i = bisect_left(self._keys, prefix, lo, hi)
+        else:
+            start = i = bisect_left(self._keys, prefix)
+
+        keys = set()
+        if start == len(self._keys):
+            return keys
+
+        while self._keys[i].startswith(prefix):
+            keys.add(self._keys[i])
+            i += 1
+
+        self._cachestr = prefix
+        self._cachepoints = (start, i)
+
+        return keys
+
+    def has_keys_with_prefix(self, prefix):
+        if prefix in self._data:
+            return True
+
+        if prefix.startswith(self._cachestr):
+            lo, hi = self._cachepoints
+            i = bisect_left(self._keys, prefix, lo, hi)
+        else:
+            i = bisect_left(self._keys, prefix)
+
+        if i == len(self._keys):
+            return False
+
+        return self._keys[i].startswith(prefix)
diff --git a/html_cleaner/html5lib/_utils.py b/html_cleaner/html5lib/_utils.py
new file mode 100644
index 0000000..03f0dab
--- /dev/null
+++ b/html_cleaner/html5lib/_utils.py
@@ -0,0 +1,127 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import sys
+from types import ModuleType
+
+from six import text_type
+
+try:
+    import xml.etree.cElementTree as default_etree
+except ImportError:
+    import xml.etree.ElementTree as default_etree
+
+
+__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates", "PY27"]
+
+
+PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
+        assert isinstance(_x, text_type)
+except:  # pylint:disable=bare-except
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True
+
+
+class MethodDispatcher(dict):
+    """Dict with 2 special properties:
+
+    On initiation, keys that are lists, sets or tuples are converted to
+    multiple keys so accessing any one of the items in the original
+    list-like object returns the matching value
+
+    md = MethodDispatcher({("foo", "bar"):"baz"})
+    md["foo"] == "baz"
+
+    A default value which can be set through the default attribute.
+    """
+
+    def __init__(self, items=()):
+        # Using _dictEntries instead of directly assigning to self is about
+        # twice as fast. Please do careful performance testing before changing
+        # anything here.
+        _dictEntries = []
+        for name, value in items:
+            if isinstance(name, (list, tuple, frozenset, set)):
+                for item in name:
+                    _dictEntries.append((item, value))
+            else:
+                _dictEntries.append((name, value))
+        dict.__init__(self, _dictEntries)
+        assert len(self) == len(_dictEntries)
+        self.default = None
+
+    def __getitem__(self, key):
+        return dict.get(self, key, self.default)
+
+
+# Some utility functions to deal with weirdness around UCS2 vs UCS4
+# python builds
+
+def isSurrogatePair(data):
+    return (len(data) == 2 and
+            ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
+            ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
+
+
+def surrogatePairToCodepoint(data):
+    char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
+                (ord(data[1]) - 0xDC00))
+    return char_val
+
+# Module Factory Factory (no, this isn't Java, I know)
+# Here to stop this being duplicated all over the place.
+
+
+def moduleFactoryFactory(factory):
+    moduleCache = {}
+
+    def moduleFactory(baseModule, *args, **kwargs):
+        if isinstance(ModuleType.__name__, type("")):
+            name = "_%s_factory" % baseModule.__name__
+        else:
+            name = b"_%s_factory" % baseModule.__name__
+
+        kwargs_tuple = tuple(kwargs.items())
+
+        try:
+            return moduleCache[name][args][kwargs_tuple]
+        except KeyError:
+            mod = ModuleType(name)
+            objs = factory(baseModule, *args, **kwargs)
+            mod.__dict__.update(objs)
+            if "name" not in moduleCache:
+                moduleCache[name] = {}
+            if "args" not in moduleCache[name]:
+                moduleCache[name][args] = {}
+            if "kwargs" not in moduleCache[name][args]:
+                moduleCache[name][args][kwargs_tuple] = {}
+            moduleCache[name][args][kwargs_tuple] = mod
+            return mod
+
+    return moduleFactory
+
+
+def memoize(func):
+    cache = {}
+
+    def wrapped(*args, **kwargs):
+        key = (tuple(args), tuple(kwargs.items()))
+        if key not in cache:
+            cache[key] = func(*args, **kwargs)
+        return cache[key]
+
+    return wrapped
diff --git a/html_cleaner/html5lib/constants.py b/html_cleaner/html5lib/constants.py
new file mode 100644
index 0000000..9e7541d
--- /dev/null
+++ b/html_cleaner/html5lib/constants.py
@@ -0,0 +1,2945 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import string
+
+EOF = None
+
+E = {
+    "null-character":
+        "Null character in input stream, replaced with U+FFFD.",
+    "invalid-codepoint":
+        "Invalid codepoint in stream.",
+    "incorrectly-placed-solidus":
+        "Solidus (/) incorrectly placed in tag.",
+    "incorrect-cr-newline-entity":
+        "Incorrect CR newline entity, replaced with LF.",
+    "illegal-windows-1252-entity":
+        "Entity used with illegal number (windows-1252 reference).",
+    "cant-convert-numeric-entity":
+        "Numeric entity couldn't be converted to character "
+        "(codepoint U+%(charAsInt)08x).",
+    "illegal-codepoint-for-numeric-entity":
+        "Numeric entity represents an illegal codepoint: "
+        "U+%(charAsInt)08x.",
+    "numeric-entity-without-semicolon":
+        "Numeric entity didn't end with ';'.",
+    "expected-numeric-entity-but-got-eof":
+        "Numeric entity expected. Got end of file instead.",
+    "expected-numeric-entity":
+        "Numeric entity expected but none found.",
+    "named-entity-without-semicolon":
+        "Named entity didn't end with ';'.",
+    "expected-named-entity":
+        "Named entity expected. Got none.",
+    "attributes-in-end-tag":
+        "End tag contains unexpected attributes.",
+    'self-closing-flag-on-end-tag':
+        "End tag contains unexpected self-closing flag.",
+    "expected-tag-name-but-got-right-bracket":
+        "Expected tag name. Got '>' instead.",
+    "expected-tag-name-but-got-question-mark":
+        "Expected tag name. Got '?' instead. (HTML doesn't "
+        "support processing instructions.)",
+    "expected-tag-name":
+        "Expected tag name. Got something else instead",
+    "expected-closing-tag-but-got-right-bracket":
+        "Expected closing tag. Got '>' instead. Ignoring '</>'.",
+    "expected-closing-tag-but-got-eof":
+        "Expected closing tag. Unexpected end of file.",
+    "expected-closing-tag-but-got-char":
+        "Expected closing tag. Unexpected character '%(data)s' found.",
+    "eof-in-tag-name":
+        "Unexpected end of file in the tag name.",
+    "expected-attribute-name-but-got-eof":
+        "Unexpected end of file. Expected attribute name instead.",
+    "eof-in-attribute-name":
+        "Unexpected end of file in attribute name.",
+    "invalid-character-in-attribute-name":
+        "Invalid character in attribute name",
+    "duplicate-attribute":
+        "Dropped duplicate attribute on tag.",
+    "expected-end-of-tag-name-but-got-eof":
+        "Unexpected end of file. Expected = or end of tag.",
+    "expected-attribute-value-but-got-eof":
+        "Unexpected end of file. Expected attribute value.",
+    "expected-attribute-value-but-got-right-bracket":
+        "Expected attribute value. Got '>' instead.",
+    'equals-in-unquoted-attribute-value':
+        "Unexpected = in unquoted attribute",
+    'unexpected-character-in-unquoted-attribute-value':
+        "Unexpected character in unquoted attribute",
+    "invalid-character-after-attribute-name":
+        "Unexpected character after attribute name.",
+    "unexpected-character-after-attribute-value":
+        "Unexpected character after attribute value.",
+    "eof-in-attribute-value-double-quote":
+        "Unexpected end of file in attribute value (\").",
+    "eof-in-attribute-value-single-quote":
+        "Unexpected end of file in attribute value (').",
+    "eof-in-attribute-value-no-quotes":
+        "Unexpected end of file in attribute value.",
+    "unexpected-EOF-after-solidus-in-tag":
+        "Unexpected end of file in tag. Expected >",
+    "unexpected-character-after-solidus-in-tag":
+        "Unexpected character after / in tag. Expected >",
+    "expected-dashes-or-doctype":
+        "Expected '--' or 'DOCTYPE'. Not found.",
+    "unexpected-bang-after-double-dash-in-comment":
+        "Unexpected ! after -- in comment",
+    "unexpected-space-after-double-dash-in-comment":
+        "Unexpected space after -- in comment",
+    "incorrect-comment":
+        "Incorrect comment.",
+    "eof-in-comment":
+        "Unexpected end of file in comment.",
+    "eof-in-comment-end-dash":
+        "Unexpected end of file in comment (-)",
+    "unexpected-dash-after-double-dash-in-comment":
+        "Unexpected '-' after '--' found in comment.",
+    "eof-in-comment-double-dash":
+        "Unexpected end of file in comment (--).",
+    "eof-in-comment-end-space-state":
+        "Unexpected end of file in comment.",
+    "eof-in-comment-end-bang-state":
+        "Unexpected end of file in comment.",
+    "unexpected-char-in-comment":
+        "Unexpected character in comment found.",
+    "need-space-after-doctype":
+        "No space after literal string 'DOCTYPE'.",
+    "expected-doctype-name-but-got-right-bracket":
+        "Unexpected > character. Expected DOCTYPE name.",
+    "expected-doctype-name-but-got-eof":
+        "Unexpected end of file. Expected DOCTYPE name.",
+    "eof-in-doctype-name":
+        "Unexpected end of file in DOCTYPE name.",
+    "eof-in-doctype":
+        "Unexpected end of file in DOCTYPE.",
+    "expected-space-or-right-bracket-in-doctype":
+        "Expected space or '>'. Got '%(data)s'",
+    "unexpected-end-of-doctype":
+        "Unexpected end of DOCTYPE.",
+    "unexpected-char-in-doctype":
+        "Unexpected character in DOCTYPE.",
+    "eof-in-innerhtml":
+        "XXX innerHTML EOF",
+    "unexpected-doctype":
+        "Unexpected DOCTYPE. Ignored.",
+    "non-html-root":
+        "html needs to be the first start tag.",
+    "expected-doctype-but-got-eof":
+        "Unexpected End of file. Expected DOCTYPE.",
+    "unknown-doctype":
+        "Erroneous DOCTYPE.",
+    "expected-doctype-but-got-chars":
+        "Unexpected non-space characters. Expected DOCTYPE.",
+    "expected-doctype-but-got-start-tag":
+        "Unexpected start tag (%(name)s). Expected DOCTYPE.",
+    "expected-doctype-but-got-end-tag":
+        "Unexpected end tag (%(name)s). Expected DOCTYPE.",
+    "end-tag-after-implied-root":
+        "Unexpected end tag (%(name)s) after the (implied) root element.",
+    "expected-named-closing-tag-but-got-eof":
+        "Unexpected end of file. Expected end tag (%(name)s).",
+    "two-heads-are-not-better-than-one":
+        "Unexpected start tag head in existing head. Ignored.",
+    "unexpected-end-tag":
+        "Unexpected end tag (%(name)s). Ignored.",
+    "unexpected-start-tag-out-of-my-head":
+        "Unexpected start tag (%(name)s) that can be in head. Moved.",
+    "unexpected-start-tag":
+        "Unexpected start tag (%(name)s).",
+    "missing-end-tag":
+        "Missing end tag (%(name)s).",
+    "missing-end-tags":
+        "Missing end tags (%(name)s).",
+    "unexpected-start-tag-implies-end-tag":
+        "Unexpected start tag (%(startName)s) "
+        "implies end tag (%(endName)s).",
+    "unexpected-start-tag-treated-as":
+        "Unexpected start tag (%(originalName)s). Treated as %(newName)s.",
+    "deprecated-tag":
+        "Unexpected start tag %(name)s. Don't use it!",
+    "unexpected-start-tag-ignored":
+        "Unexpected start tag %(name)s. Ignored.",
+    "expected-one-end-tag-but-got-another":
+        "Unexpected end tag (%(gotName)s). "
+        "Missing end tag (%(expectedName)s).",
+    "end-tag-too-early":
+        "End tag (%(name)s) seen too early. Expected other end tag.",
+    "end-tag-too-early-named":
+        "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).",
+    "end-tag-too-early-ignored":
+        "End tag (%(name)s) seen too early. Ignored.",
+    "adoption-agency-1.1":
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 1 of the adoption agency algorithm.",
+    "adoption-agency-1.2":
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 2 of the adoption agency algorithm.",
+    "adoption-agency-1.3":
+        "End tag (%(name)s) violates step 1, "
+        "paragraph 3 of the adoption agency algorithm.",
+    "adoption-agency-4.4":
+        "End tag (%(name)s) violates step 4, "
+        "paragraph 4 of the adoption agency algorithm.",
+    "unexpected-end-tag-treated-as":
+        "Unexpected end tag (%(originalName)s). Treated as %(newName)s.",
+    "no-end-tag":
+        "This element (%(name)s) has no end tag.",
+    "unexpected-implied-end-tag-in-table":
+        "Unexpected implied end tag (%(name)s) in the table phase.",
+    "unexpected-implied-end-tag-in-table-body":
+        "Unexpected implied end tag (%(name)s) in the table body phase.",
+    "unexpected-char-implies-table-voodoo":
+        "Unexpected non-space characters in "
+        "table context caused voodoo mode.",
+    "unexpected-hidden-input-in-table":
+        "Unexpected input with type hidden in table context.",
+    "unexpected-form-in-table":
+        "Unexpected form in table context.",
+    "unexpected-start-tag-implies-table-voodoo":
+        "Unexpected start tag (%(name)s) in "
+        "table context caused voodoo mode.",
+    "unexpected-end-tag-implies-table-voodoo":
+        "Unexpected end tag (%(name)s) in "
+        "table context caused voodoo mode.",
+    "unexpected-cell-in-table-body":
+        "Unexpected table cell start tag (%(name)s) "
+        "in the table body phase.",
+    "unexpected-cell-end-tag":
+        "Got table cell end tag (%(name)s) "
+        "while required end tags are missing.",
+    "unexpected-end-tag-in-table-body":
+        "Unexpected end tag (%(name)s) in the table body phase. Ignored.",
+    "unexpected-implied-end-tag-in-table-row":
+        "Unexpected implied end tag (%(name)s) in the table row phase.",
+    "unexpected-end-tag-in-table-row":
+        "Unexpected end tag (%(name)s) in the table row phase. Ignored.",
+    "unexpected-select-in-select":
+        "Unexpected select start tag in the select phase "
+        "treated as select end tag.",
+    "unexpected-input-in-select":
+        "Unexpected input start tag in the select phase.",
+    "unexpected-start-tag-in-select":
+        "Unexpected start tag token (%(name)s in the select phase. "
+        "Ignored.",
+    "unexpected-end-tag-in-select":
+        "Unexpected end tag (%(name)s) in the select phase. Ignored.",
+    "unexpected-table-element-start-tag-in-select-in-table":
+        "Unexpected table element start tag (%(name)s) in the select in table phase.",
+    "unexpected-table-element-end-tag-in-select-in-table":
+        "Unexpected table element end tag (%(name)s) in the select in table phase.",
+    "unexpected-char-after-body":
+        "Unexpected non-space characters in the after body phase.",
+    "unexpected-start-tag-after-body":
+        "Unexpected start tag token (%(name)s)"
+        " in the after body phase.",
+    "unexpected-end-tag-after-body":
+        "Unexpected end tag token (%(name)s)"
+        " in the after body phase.",
+    "unexpected-char-in-frameset":
+        "Unexpected characters in the frameset phase. Characters ignored.",
+    "unexpected-start-tag-in-frameset":
+        "Unexpected start tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
+    "unexpected-frameset-in-frameset-innerhtml":
+        "Unexpected end tag token (frameset) "
+        "in the frameset phase (innerHTML).",
+    "unexpected-end-tag-in-frameset":
+        "Unexpected end tag token (%(name)s)"
+        " in the frameset phase. Ignored.",
+    "unexpected-char-after-frameset":
+        "Unexpected non-space characters in the "
+        "after frameset phase. Ignored.",
+    "unexpected-start-tag-after-frameset":
+        "Unexpected start tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
+    "unexpected-end-tag-after-frameset":
+        "Unexpected end tag (%(name)s)"
+        " in the after frameset phase. Ignored.",
+    "unexpected-end-tag-after-body-innerhtml":
+        "Unexpected end tag after body(innerHtml)",
+    "expected-eof-but-got-char":
+        "Unexpected non-space characters. Expected end of file.",
+    "expected-eof-but-got-start-tag":
+        "Unexpected start tag (%(name)s)"
+        ". Expected end of file.",
+    "expected-eof-but-got-end-tag":
+        "Unexpected end tag (%(name)s)"
+        ". Expected end of file.",
+    "eof-in-table":
+        "Unexpected end of file. Expected table content.",
+    "eof-in-select":
+        "Unexpected end of file. Expected select content.",
+    "eof-in-frameset":
+        "Unexpected end of file. Expected frameset content.",
+    "eof-in-script-in-script":
+        "Unexpected end of file. Expected script content.",
+    "eof-in-foreign-lands":
+        "Unexpected end of file. Expected foreign content",
+    "non-void-element-with-trailing-solidus":
+        "Trailing solidus not allowed on element %(name)s",
+    "unexpected-html-element-in-foreign-content":
+        "Element %(name)s not allowed in a non-html context",
+    "unexpected-end-tag-before-html":
+        "Unexpected end tag (%(name)s) before html.",
+    "unexpected-inhead-noscript-tag":
+        "Element %(name)s not allowed in a inhead-noscript context",
+    "eof-in-head-noscript":
+        "Unexpected end of file. Expected inhead-noscript content",
+    "char-in-head-noscript":
+        "Unexpected non-space character. Expected inhead-noscript content",
+    "XXX-undefined-error":
+        "Undefined error (this sucks and should be fixed)",
+}
+
+namespaces = {
+    "html": "http://www.w3.org/1999/xhtml",
+    "mathml": "http://www.w3.org/1998/Math/MathML",
+    "svg": "http://www.w3.org/2000/svg",
+    "xlink": "http://www.w3.org/1999/xlink",
+    "xml": "http://www.w3.org/XML/1998/namespace",
+    "xmlns": "http://www.w3.org/2000/xmlns/"
+}
+
+scopingElements = frozenset([
+    (namespaces["html"], "applet"),
+    (namespaces["html"], "caption"),
+    (namespaces["html"], "html"),
+    (namespaces["html"], "marquee"),
+    (namespaces["html"], "object"),
+    (namespaces["html"], "table"),
+    (namespaces["html"], "td"),
+    (namespaces["html"], "th"),
+    (namespaces["mathml"], "mi"),
+    (namespaces["mathml"], "mo"),
+    (namespaces["mathml"], "mn"),
+    (namespaces["mathml"], "ms"),
+    (namespaces["mathml"], "mtext"),
+    (namespaces["mathml"], "annotation-xml"),
+    (namespaces["svg"], "foreignObject"),
+    (namespaces["svg"], "desc"),
+    (namespaces["svg"], "title"),
+])
+
+formattingElements = frozenset([
+    (namespaces["html"], "a"),
+    (namespaces["html"], "b"),
+    (namespaces["html"], "big"),
+    (namespaces["html"], "code"),
+    (namespaces["html"], "em"),
+    (namespaces["html"], "font"),
+    (namespaces["html"], "i"),
+    (namespaces["html"], "nobr"),
+    (namespaces["html"], "s"),
+    (namespaces["html"], "small"),
+    (namespaces["html"], "strike"),
+    (namespaces["html"], "strong"),
+    (namespaces["html"], "tt"),
+    (namespaces["html"], "u")
+])
+
+specialElements = frozenset([
+    (namespaces["html"], "address"),
+    (namespaces["html"], "applet"),
+    (namespaces["html"], "area"),
+    (namespaces["html"], "article"),
+    (namespaces["html"], "aside"),
+    (namespaces["html"], "base"),
+    (namespaces["html"], "basefont"),
+    (namespaces["html"], "bgsound"),
+    (namespaces["html"], "blockquote"),
+    (namespaces["html"], "body"),
+    (namespaces["html"], "br"),
+    (namespaces["html"], "button"),
+    (namespaces["html"], "caption"),
+    (namespaces["html"], "center"),
+    (namespaces["html"], "col"),
+    (namespaces["html"], "colgroup"),
+    (namespaces["html"], "command"),
+    (namespaces["html"], "dd"),
+    (namespaces["html"], "details"),
+    (namespaces["html"], "dir"),
+    (namespaces["html"], "div"),
+    (namespaces["html"], "dl"),
+    (namespaces["html"], "dt"),
+    (namespaces["html"], "embed"),
+    (namespaces["html"], "fieldset"),
+    (namespaces["html"], "figure"),
+    (namespaces["html"], "footer"),
+    (namespaces["html"], "form"),
+    (namespaces["html"], "frame"),
+    (namespaces["html"], "frameset"),
+    (namespaces["html"], "h1"),
+    (namespaces["html"], "h2"),
+    (namespaces["html"], "h3"),
+    (namespaces["html"], "h4"),
+    (namespaces["html"], "h5"),
+    (namespaces["html"], "h6"),
+    (namespaces["html"], "head"),
+    (namespaces["html"], "header"),
+    (namespaces["html"], "hr"),
+    (namespaces["html"], "html"),
+    (namespaces["html"], "iframe"),
+    # Note that image is commented out in the spec as "this isn't an
+    # element that can end up on the stack, so it doesn't matter,"
+    (namespaces["html"], "image"),
+    (namespaces["html"], "img"),
+    (namespaces["html"], "input"),
+    (namespaces["html"], "isindex"),
+    (namespaces["html"], "li"),
+    (namespaces["html"], "link"),
+    (namespaces["html"], "listing"),
+    (namespaces["html"], "marquee"),
+    (namespaces["html"], "menu"),
+    (namespaces["html"], "meta"),
+    (namespaces["html"], "nav"),
+    (namespaces["html"], "noembed"),
+    (namespaces["html"], "noframes"),
+    (namespaces["html"], "noscript"),
+    (namespaces["html"], "object"),
+    (namespaces["html"], "ol"),
+    (namespaces["html"], "p"),
+    (namespaces["html"], "param"),
+    (namespaces["html"], "plaintext"),
+    (namespaces["html"], "pre"),
+    (namespaces["html"], "script"),
+    (namespaces["html"], "section"),
+    (namespaces["html"], "select"),
+    (namespaces["html"], "style"),
+    (namespaces["html"], "table"),
+    (namespaces["html"], "tbody"),
+    (namespaces["html"], "td"),
+    (namespaces["html"], "textarea"),
+    (namespaces["html"], "tfoot"),
+    (namespaces["html"], "th"),
+    (namespaces["html"], "thead"),
+    (namespaces["html"], "title"),
+    (namespaces["html"], "tr"),
+    (namespaces["html"], "ul"),
+    (namespaces["html"], "wbr"),
+    (namespaces["html"], "xmp"),
+    (namespaces["svg"], "foreignObject")
+])
+
+htmlIntegrationPointElements = frozenset([
+    (namespaces["mathml"], "annotaion-xml"),
+    (namespaces["svg"], "foreignObject"),
+    (namespaces["svg"], "desc"),
+    (namespaces["svg"], "title")
+])
+
+mathmlTextIntegrationPointElements = frozenset([
+    (namespaces["mathml"], "mi"),
+    (namespaces["mathml"], "mo"),
+    (namespaces["mathml"], "mn"),
+    (namespaces["mathml"], "ms"),
+    (namespaces["mathml"], "mtext")
+])
+
+adjustSVGAttributes = {
+    "attributename": "attributeName",
+    "attributetype": "attributeType",
+    "basefrequency": "baseFrequency",
+    "baseprofile": "baseProfile",
+    "calcmode": "calcMode",
+    "clippathunits": "clipPathUnits",
+    "contentscripttype": "contentScriptType",
+    "contentstyletype": "contentStyleType",
+    "diffuseconstant": "diffuseConstant",
+    "edgemode": "edgeMode",
+    "externalresourcesrequired": "externalResourcesRequired",
+    "filterres": "filterRes",
+    "filterunits": "filterUnits",
+    "glyphref": "glyphRef",
+    "gradienttransform": "gradientTransform",
+    "gradientunits": "gradientUnits",
+    "kernelmatrix": "kernelMatrix",
+    "kernelunitlength": "kernelUnitLength",
+    "keypoints": "keyPoints",
+    "keysplines": "keySplines",
+    "keytimes": "keyTimes",
+    "lengthadjust": "lengthAdjust",
+    "limitingconeangle": "limitingConeAngle",
+    "markerheight": "markerHeight",
+    "markerunits": "markerUnits",
+    "markerwidth": "markerWidth",
+    "maskcontentunits": "maskContentUnits",
+    "maskunits": "maskUnits",
+    "numoctaves": "numOctaves",
+    "pathlength": "pathLength",
+    "patterncontentunits": "patternContentUnits",
+    "patterntransform": "patternTransform",
+    "patternunits": "patternUnits",
+    "pointsatx": "pointsAtX",
+    "pointsaty": "pointsAtY",
+    "pointsatz": "pointsAtZ",
+    "preservealpha": "preserveAlpha",
+    "preserveaspectratio": "preserveAspectRatio",
+    "primitiveunits": "primitiveUnits",
+    "refx": "refX",
+    "refy": "refY",
+    "repeatcount": "repeatCount",
+    "repeatdur": "repeatDur",
+    "requiredextensions": "requiredExtensions",
+    "requiredfeatures": "requiredFeatures",
+    "specularconstant": "specularConstant",
+    "specularexponent": "specularExponent",
+    "spreadmethod": "spreadMethod",
+    "startoffset": "startOffset",
+    "stddeviation": "stdDeviation",
+    "stitchtiles": "stitchTiles",
+    "surfacescale": "surfaceScale",
+    "systemlanguage": "systemLanguage",
+    "tablevalues": "tableValues",
+    "targetx": "targetX",
+    "targety": "targetY",
+    "textlength": "textLength",
+    "viewbox": "viewBox",
+    "viewtarget": "viewTarget",
+    "xchannelselector": "xChannelSelector",
+    "ychannelselector": "yChannelSelector",
+    "zoomandpan": "zoomAndPan"
+}
+
+adjustMathMLAttributes = {"definitionurl": "definitionURL"}
+
+adjustForeignAttributes = {
+    "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
+    "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
+    "xlink:href": ("xlink", "href", namespaces["xlink"]),
+    "xlink:role": ("xlink", "role", namespaces["xlink"]),
+    "xlink:show": ("xlink", "show", namespaces["xlink"]),
+    "xlink:title": ("xlink", "title", namespaces["xlink"]),
+    "xlink:type": ("xlink", "type", namespaces["xlink"]),
+    "xml:base": ("xml", "base", namespaces["xml"]),
+    "xml:lang": ("xml", "lang", namespaces["xml"]),
+    "xml:space": ("xml", "space", namespaces["xml"]),
+    "xmlns": (None, "xmlns", namespaces["xmlns"]),
+    "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
+}
+
+unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
+                                  adjustForeignAttributes.items()])
+
+spaceCharacters = frozenset([
+    "\t",
+    "\n",
+    "\u000C",
+    " ",
+    "\r"
+])
+
+tableInsertModeElements = frozenset([
+    "table",
+    "tbody",
+    "tfoot",
+    "thead",
+    "tr"
+])
+
+asciiLowercase = frozenset(string.ascii_lowercase)
+asciiUppercase = frozenset(string.ascii_uppercase)
+asciiLetters = frozenset(string.ascii_letters)
+digits = frozenset(string.digits)
+hexDigits = frozenset(string.hexdigits)
+
+asciiUpper2Lower = dict([(ord(c), ord(c.lower()))
+                         for c in string.ascii_uppercase])
+
+# Heading elements need to be ordered
+headingElements = (
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6"
+)
+
+voidElements = frozenset([
+    "base",
+    "command",
+    "event-source",
+    "link",
+    "meta",
+    "hr",
+    "br",
+    "img",
+    "embed",
+    "param",
+    "area",
+    "col",
+    "input",
+    "source",
+    "track"
+])
+
+cdataElements = frozenset(['title', 'textarea'])
+
+rcdataElements = frozenset([
+    'style',
+    'script',
+    'xmp',
+    'iframe',
+    'noembed',
+    'noframes',
+    'noscript'
+])
+
+booleanAttributes = {
+    "": frozenset(["irrelevant"]),
+    "style": frozenset(["scoped"]),
+    "img": frozenset(["ismap"]),
+    "audio": frozenset(["autoplay", "controls"]),
+    "video": frozenset(["autoplay", "controls"]),
+    "script": frozenset(["defer", "async"]),
+    "details": frozenset(["open"]),
+    "datagrid": frozenset(["multiple", "disabled"]),
+    "command": frozenset(["hidden", "disabled", "checked", "default"]),
+    "hr": frozenset(["noshade"]),
+    "menu": frozenset(["autosubmit"]),
+    "fieldset": frozenset(["disabled", "readonly"]),
+    "option": frozenset(["disabled", "readonly", "selected"]),
+    "optgroup": frozenset(["disabled", "readonly"]),
+    "button": frozenset(["disabled", "autofocus"]),
+    "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]),
+    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
+    "output": frozenset(["disabled", "readonly"]),
+}
+
+# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
+# therefore can't be a frozenset.
+entitiesWindows1252 = (
+    8364,   # 0x80  0x20AC  EURO SIGN
+    65533,  # 0x81          UNDEFINED
+    8218,   # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
+    402,    # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
+    8222,   # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
+    8230,   # 0x85  0x2026  HORIZONTAL ELLIPSIS
+    8224,   # 0x86  0x2020  DAGGER
+    8225,   # 0x87  0x2021  DOUBLE DAGGER
+    710,    # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
+    8240,   # 0x89  0x2030  PER MILLE SIGN
+    352,    # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
+    8249,   # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    338,    # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
+    65533,  # 0x8D          UNDEFINED
+    381,    # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
+    65533,  # 0x8F          UNDEFINED
+    65533,  # 0x90          UNDEFINED
+    8216,   # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
+    8217,   # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
+    8220,   # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
+    8221,   # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
+    8226,   # 0x95  0x2022  BULLET
+    8211,   # 0x96  0x2013  EN DASH
+    8212,   # 0x97  0x2014  EM DASH
+    732,    # 0x98  0x02DC  SMALL TILDE
+    8482,   # 0x99  0x2122  TRADE MARK SIGN
+    353,    # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
+    8250,   # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    339,    # 0x9C  0x0153  LATIN SMALL LIGATURE OE
+    65533,  # 0x9D          UNDEFINED
+    382,    # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
+    376     # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
+)
+
+xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;'])
+
+entities = {
+    "AElig": "\xc6",
+    "AElig;": "\xc6",
+    "AMP": "&",
+    "AMP;": "&",
+    "Aacute": "\xc1",
+    "Aacute;": "\xc1",
+    "Abreve;": "\u0102",
+    "Acirc": "\xc2",
+    "Acirc;": "\xc2",
+    "Acy;": "\u0410",
+    "Afr;": "\U0001d504",
+    "Agrave": "\xc0",
+    "Agrave;": "\xc0",
+    "Alpha;": "\u0391",
+    "Amacr;": "\u0100",
+    "And;": "\u2a53",
+    "Aogon;": "\u0104",
+    "Aopf;": "\U0001d538",
+    "ApplyFunction;": "\u2061",
+    "Aring": "\xc5",
+    "Aring;": "\xc5",
+    "Ascr;": "\U0001d49c",
+    "Assign;": "\u2254",
+    "Atilde": "\xc3",
+    "Atilde;": "\xc3",
+    "Auml": "\xc4",
+    "Auml;": "\xc4",
+    "Backslash;": "\u2216",
+    "Barv;": "\u2ae7",
+    "Barwed;": "\u2306",
+    "Bcy;": "\u0411",
+    "Because;": "\u2235",
+    "Bernoullis;": "\u212c",
+    "Beta;": "\u0392",
+    "Bfr;": "\U0001d505",
+    "Bopf;": "\U0001d539",
+    "Breve;": "\u02d8",
+    "Bscr;": "\u212c",
+    "Bumpeq;": "\u224e",
+    "CHcy;": "\u0427",
+    "COPY": "\xa9",
+    "COPY;": "\xa9",
+    "Cacute;": "\u0106",
+    "Cap;": "\u22d2",
+    "CapitalDifferentialD;": "\u2145",
+    "Cayleys;": "\u212d",
+    "Ccaron;": "\u010c",
+    "Ccedil": "\xc7",
+    "Ccedil;": "\xc7",
+    "Ccirc;": "\u0108",
+    "Cconint;": "\u2230",
+    "Cdot;": "\u010a",
+    "Cedilla;": "\xb8",
+    "CenterDot;": "\xb7",
+    "Cfr;": "\u212d",
+    "Chi;": "\u03a7",
+    "CircleDot;": "\u2299",
+    "CircleMinus;": "\u2296",
+    "CirclePlus;": "\u2295",
+    "CircleTimes;": "\u2297",
+    "ClockwiseContourIntegral;": "\u2232",
+    "CloseCurlyDoubleQuote;": "\u201d",
+    "CloseCurlyQuote;": "\u2019",
+    "Colon;": "\u2237",
+    "Colone;": "\u2a74",
+    "Congruent;": "\u2261",
+    "Conint;": "\u222f",
+    "ContourIntegral;": "\u222e",
+    "Copf;": "\u2102",
+    "Coproduct;": "\u2210",
+    "CounterClockwiseContourIntegral;": "\u2233",
+    "Cross;": "\u2a2f",
+    "Cscr;": "\U0001d49e",
+    "Cup;": "\u22d3",
+    "CupCap;": "\u224d",
+    "DD;": "\u2145",
+    "DDotrahd;": "\u2911",
+    "DJcy;": "\u0402",
+    "DScy;": "\u0405",
+    "DZcy;": "\u040f",
+    "Dagger;": "\u2021",
+    "Darr;": "\u21a1",
+    "Dashv;": "\u2ae4",
+    "Dcaron;": "\u010e",
+    "Dcy;": "\u0414",
+    "Del;": "\u2207",
+    "Delta;": "\u0394",
+    "Dfr;": "\U0001d507",
+    "DiacriticalAcute;": "\xb4",
+    "DiacriticalDot;": "\u02d9",
+    "DiacriticalDoubleAcute;": "\u02dd",
+    "DiacriticalGrave;": "`",
+    "DiacriticalTilde;": "\u02dc",
+    "Diamond;": "\u22c4",
+    "DifferentialD;": "\u2146",
+    "Dopf;": "\U0001d53b",
+    "Dot;": "\xa8",
+    "DotDot;": "\u20dc",
+    "DotEqual;": "\u2250",
+    "DoubleContourIntegral;": "\u222f",
+    "DoubleDot;": "\xa8",
+    "DoubleDownArrow;": "\u21d3",
+    "DoubleLeftArrow;": "\u21d0",
+    "DoubleLeftRightArrow;": "\u21d4",
+    "DoubleLeftTee;": "\u2ae4",
+    "DoubleLongLeftArrow;": "\u27f8",
+    "DoubleLongLeftRightArrow;": "\u27fa",
+    "DoubleLongRightArrow;": "\u27f9",
+    "DoubleRightArrow;": "\u21d2",
+    "DoubleRightTee;": "\u22a8",
+    "DoubleUpArrow;": "\u21d1",
+    "DoubleUpDownArrow;": "\u21d5",
+    "DoubleVerticalBar;": "\u2225",
+    "DownArrow;": "\u2193",
+    "DownArrowBar;": "\u2913",
+    "DownArrowUpArrow;": "\u21f5",
+    "DownBreve;": "\u0311",
+    "DownLeftRightVector;": "\u2950",
+    "DownLeftTeeVector;": "\u295e",
+    "DownLeftVector;": "\u21bd",
+    "DownLeftVectorBar;": "\u2956",
+    "DownRightTeeVector;": "\u295f",
+    "DownRightVector;": "\u21c1",
+    "DownRightVectorBar;": "\u2957",
+    "DownTee;": "\u22a4",
+    "DownTeeArrow;": "\u21a7",
+    "Downarrow;": "\u21d3",
+    "Dscr;": "\U0001d49f",
+    "Dstrok;": "\u0110",
+    "ENG;": "\u014a",
+    "ETH": "\xd0",
+    "ETH;": "\xd0",
+    "Eacute": "\xc9",
+    "Eacute;": "\xc9",
+    "Ecaron;": "\u011a",
+    "Ecirc": "\xca",
+    "Ecirc;": "\xca",
+    "Ecy;": "\u042d",
+    "Edot;": "\u0116",
+    "Efr;": "\U0001d508",
+    "Egrave": "\xc8",
+    "Egrave;": "\xc8",
+    "Element;": "\u2208",
+    "Emacr;": "\u0112",
+    "EmptySmallSquare;": "\u25fb",
+    "EmptyVerySmallSquare;": "\u25ab",
+    "Eogon;": "\u0118",
+    "Eopf;": "\U0001d53c",
+    "Epsilon;": "\u0395",
+    "Equal;": "\u2a75",
+    "EqualTilde;": "\u2242",
+    "Equilibrium;": "\u21cc",
+    "Escr;": "\u2130",
+    "Esim;": "\u2a73",
+    "Eta;": "\u0397",
+    "Euml": "\xcb",
+    "Euml;": "\xcb",
+    "Exists;": "\u2203",
+    "ExponentialE;": "\u2147",
+    "Fcy;": "\u0424",
+    "Ffr;": "\U0001d509",
+    "FilledSmallSquare;": "\u25fc",
+    "FilledVerySmallSquare;": "\u25aa",
+    "Fopf;": "\U0001d53d",
+    "ForAll;": "\u2200",
+    "Fouriertrf;": "\u2131",
+    "Fscr;": "\u2131",
+    "GJcy;": "\u0403",
+    "GT": ">",
+    "GT;": ">",
+    "Gamma;": "\u0393",
+    "Gammad;": "\u03dc",
+    "Gbreve;": "\u011e",
+    "Gcedil;": "\u0122",
+    "Gcirc;": "\u011c",
+    "Gcy;": "\u0413",
+    "Gdot;": "\u0120",
+    "Gfr;": "\U0001d50a",
+    "Gg;": "\u22d9",
+    "Gopf;": "\U0001d53e",
+    "GreaterEqual;": "\u2265",
+    "GreaterEqualLess;": "\u22db",
+    "GreaterFullEqual;": "\u2267",
+    "GreaterGreater;": "\u2aa2",
+    "GreaterLess;": "\u2277",
+    "GreaterSlantEqual;": "\u2a7e",
+    "GreaterTilde;": "\u2273",
+    "Gscr;": "\U0001d4a2",
+    "Gt;": "\u226b",
+    "HARDcy;": "\u042a",
+    "Hacek;": "\u02c7",
+    "Hat;": "^",
+    "Hcirc;": "\u0124",
+    "Hfr;": "\u210c",
+    "HilbertSpace;": "\u210b",
+    "Hopf;": "\u210d",
+    "HorizontalLine;": "\u2500",
+    "Hscr;": "\u210b",
+    "Hstrok;": "\u0126",
+    "HumpDownHump;": "\u224e",
+    "HumpEqual;": "\u224f",
+    "IEcy;": "\u0415",
+    "IJlig;": "\u0132",
+    "IOcy;": "\u0401",
+    "Iacute": "\xcd",
+    "Iacute;": "\xcd",
+    "Icirc": "\xce",
+    "Icirc;": "\xce",
+    "Icy;": "\u0418",
+    "Idot;": "\u0130",
+    "Ifr;": "\u2111",
+    "Igrave": "\xcc",
+    "Igrave;": "\xcc",
+    "Im;": "\u2111",
+    "Imacr;": "\u012a",
+    "ImaginaryI;": "\u2148",
+    "Implies;": "\u21d2",
+    "Int;": "\u222c",
+    "Integral;": "\u222b",
+    "Intersection;": "\u22c2",
+    "InvisibleComma;": "\u2063",
+    "InvisibleTimes;": "\u2062",
+    "Iogon;": "\u012e",
+    "Iopf;": "\U0001d540",
+    "Iota;": "\u0399",
+    "Iscr;": "\u2110",
+    "Itilde;": "\u0128",
+    "Iukcy;": "\u0406",
+    "Iuml": "\xcf",
+    "Iuml;": "\xcf",
+    "Jcirc;": "\u0134",
+    "Jcy;": "\u0419",
+    "Jfr;": "\U0001d50d",
+    "Jopf;": "\U0001d541",
+    "Jscr;": "\U0001d4a5",
+    "Jsercy;": "\u0408",
+    "Jukcy;": "\u0404",
+    "KHcy;": "\u0425",
+    "KJcy;": "\u040c",
+    "Kappa;": "\u039a",
+    "Kcedil;": "\u0136",
+    "Kcy;": "\u041a",
+    "Kfr;": "\U0001d50e",
+    "Kopf;": "\U0001d542",
+    "Kscr;": "\U0001d4a6",
+    "LJcy;": "\u0409",
+    "LT": "<",
+    "LT;": "<",
+    "Lacute;": "\u0139",
+    "Lambda;": "\u039b",
+    "Lang;": "\u27ea",
+    "Laplacetrf;": "\u2112",
+    "Larr;": "\u219e",
+    "Lcaron;": "\u013d",
+    "Lcedil;": "\u013b",
+    "Lcy;": "\u041b",
+    "LeftAngleBracket;": "\u27e8",
+    "LeftArrow;": "\u2190",
+    "LeftArrowBar;": "\u21e4",
+    "LeftArrowRightArrow;": "\u21c6",
+    "LeftCeiling;": "\u2308",
+    "LeftDoubleBracket;": "\u27e6",
+    "LeftDownTeeVector;": "\u2961",
+    "LeftDownVector;": "\u21c3",
+    "LeftDownVectorBar;": "\u2959",
+    "LeftFloor;": "\u230a",
+    "LeftRightArrow;": "\u2194",
+    "LeftRightVector;": "\u294e",
+    "LeftTee;": "\u22a3",
+    "LeftTeeArrow;": "\u21a4",
+    "LeftTeeVector;": "\u295a",
+    "LeftTriangle;": "\u22b2",
+    "LeftTriangleBar;": "\u29cf",
+    "LeftTriangleEqual;": "\u22b4",
+    "LeftUpDownVector;": "\u2951",
+    "LeftUpTeeVector;": "\u2960",
+    "LeftUpVector;": "\u21bf",
+    "LeftUpVectorBar;": "\u2958",
+    "LeftVector;": "\u21bc",
+    "LeftVectorBar;": "\u2952",
+    "Leftarrow;": "\u21d0",
+    "Leftrightarrow;": "\u21d4",
+    "LessEqualGreater;": "\u22da",
+    "LessFullEqual;": "\u2266",
+    "LessGreater;": "\u2276",
+    "LessLess;": "\u2aa1",
+    "LessSlantEqual;": "\u2a7d",
+    "LessTilde;": "\u2272",
+    "Lfr;": "\U0001d50f",
+    "Ll;": "\u22d8",
+    "Lleftarrow;": "\u21da",
+    "Lmidot;": "\u013f",
+    "LongLeftArrow;": "\u27f5",
+    "LongLeftRightArrow;": "\u27f7",
+    "LongRightArrow;": "\u27f6",
+    "Longleftarrow;": "\u27f8",
+    "Longleftrightarrow;": "\u27fa",
+    "Longrightarrow;": "\u27f9",
+    "Lopf;": "\U0001d543",
+    "LowerLeftArrow;": "\u2199",
+    "LowerRightArrow;": "\u2198",
+    "Lscr;": "\u2112",
+    "Lsh;": "\u21b0",
+    "Lstrok;": "\u0141",
+    "Lt;": "\u226a",
+    "Map;": "\u2905",
+    "Mcy;": "\u041c",
+    "MediumSpace;": "\u205f",
+    "Mellintrf;": "\u2133",
+    "Mfr;": "\U0001d510",
+    "MinusPlus;": "\u2213",
+    "Mopf;": "\U0001d544",
+    "Mscr;": "\u2133",
+    "Mu;": "\u039c",
+    "NJcy;": "\u040a",
+    "Nacute;": "\u0143",
+    "Ncaron;": "\u0147",
+    "Ncedil;": "\u0145",
+    "Ncy;": "\u041d",
+    "NegativeMediumSpace;": "\u200b",
+    "NegativeThickSpace;": "\u200b",
+    "NegativeThinSpace;": "\u200b",
+    "NegativeVeryThinSpace;": "\u200b",
+    "NestedGreaterGreater;": "\u226b",
+    "NestedLessLess;": "\u226a",
+    "NewLine;": "\n",
+    "Nfr;": "\U0001d511",
+    "NoBreak;": "\u2060",
+    "NonBreakingSpace;": "\xa0",
+    "Nopf;": "\u2115",
+    "Not;": "\u2aec",
+    "NotCongruent;": "\u2262",
+    "NotCupCap;": "\u226d",
+    "NotDoubleVerticalBar;": "\u2226",
+    "NotElement;": "\u2209",
+    "NotEqual;": "\u2260",
+    "NotEqualTilde;": "\u2242\u0338",
+    "NotExists;": "\u2204",
+    "NotGreater;": "\u226f",
+    "NotGreaterEqual;": "\u2271",
+    "NotGreaterFullEqual;": "\u2267\u0338",
+    "NotGreaterGreater;": "\u226b\u0338",
+    "NotGreaterLess;": "\u2279",
+    "NotGreaterSlantEqual;": "\u2a7e\u0338",
+    "NotGreaterTilde;": "\u2275",
+    "NotHumpDownHump;": "\u224e\u0338",
+    "NotHumpEqual;": "\u224f\u0338",
+    "NotLeftTriangle;": "\u22ea",
+    "NotLeftTriangleBar;": "\u29cf\u0338",
+    "NotLeftTriangleEqual;": "\u22ec",
+    "NotLess;": "\u226e",
+    "NotLessEqual;": "\u2270",
+    "NotLessGreater;": "\u2278",
+    "NotLessLess;": "\u226a\u0338",
+    "NotLessSlantEqual;": "\u2a7d\u0338",
+    "NotLessTilde;": "\u2274",
+    "NotNestedGreaterGreater;": "\u2aa2\u0338",
+    "NotNestedLessLess;": "\u2aa1\u0338",
+    "NotPrecedes;": "\u2280",
+    "NotPrecedesEqual;": "\u2aaf\u0338",
+    "NotPrecedesSlantEqual;": "\u22e0",
+    "NotReverseElement;": "\u220c",
+    "NotRightTriangle;": "\u22eb",
+    "NotRightTriangleBar;": "\u29d0\u0338",
+    "NotRightTriangleEqual;": "\u22ed",
+    "NotSquareSubset;": "\u228f\u0338",
+    "NotSquareSubsetEqual;": "\u22e2",
+    "NotSquareSuperset;": "\u2290\u0338",
+    "NotSquareSupersetEqual;": "\u22e3",
+    "NotSubset;": "\u2282\u20d2",
+    "NotSubsetEqual;": "\u2288",
+    "NotSucceeds;": "\u2281",
+    "NotSucceedsEqual;": "\u2ab0\u0338",
+    "NotSucceedsSlantEqual;": "\u22e1",
+    "NotSucceedsTilde;": "\u227f\u0338",
+    "NotSuperset;": "\u2283\u20d2",
+    "NotSupersetEqual;": "\u2289",
+    "NotTilde;": "\u2241",
+    "NotTildeEqual;": "\u2244",
+    "NotTildeFullEqual;": "\u2247",
+    "NotTildeTilde;": "\u2249",
+    "NotVerticalBar;": "\u2224",
+    "Nscr;": "\U0001d4a9",
+    "Ntilde": "\xd1",
+    "Ntilde;": "\xd1",
+    "Nu;": "\u039d",
+    "OElig;": "\u0152",
+    "Oacute": "\xd3",
+    "Oacute;": "\xd3",
+    "Ocirc": "\xd4",
+    "Ocirc;": "\xd4",
+    "Ocy;": "\u041e",
+    "Odblac;": "\u0150",
+    "Ofr;": "\U0001d512",
+    "Ograve": "\xd2",
+    "Ograve;": "\xd2",
+    "Omacr;": "\u014c",
+    "Omega;": "\u03a9",
+    "Omicron;": "\u039f",
+    "Oopf;": "\U0001d546",
+    "OpenCurlyDoubleQuote;": "\u201c",
+    "OpenCurlyQuote;": "\u2018",
+    "Or;": "\u2a54",
+    "Oscr;": "\U0001d4aa",
+    "Oslash": "\xd8",
+    "Oslash;": "\xd8",
+    "Otilde": "\xd5",
+    "Otilde;": "\xd5",
+    "Otimes;": "\u2a37",
+    "Ouml": "\xd6",
+    "Ouml;": "\xd6",
+    "OverBar;": "\u203e",
+    "OverBrace;": "\u23de",
+    "OverBracket;": "\u23b4",
+    "OverParenthesis;": "\u23dc",
+    "PartialD;": "\u2202",
+    "Pcy;": "\u041f",
+    "Pfr;": "\U0001d513",
+    "Phi;": "\u03a6",
+    "Pi;": "\u03a0",
+    "PlusMinus;": "\xb1",
+    "Poincareplane;": "\u210c",
+    "Popf;": "\u2119",
+    "Pr;": "\u2abb",
+    "Precedes;": "\u227a",
+    "PrecedesEqual;": "\u2aaf",
+    "PrecedesSlantEqual;": "\u227c",
+    "PrecedesTilde;": "\u227e",
+    "Prime;": "\u2033",
+    "Product;": "\u220f",
+    "Proportion;": "\u2237",
+    "Proportional;": "\u221d",
+    "Pscr;": "\U0001d4ab",
+    "Psi;": "\u03a8",
+    "QUOT": "\"",
+    "QUOT;": "\"",
+    "Qfr;": "\U0001d514",
+    "Qopf;": "\u211a",
+    "Qscr;": "\U0001d4ac",
+    "RBarr;": "\u2910",
+    "REG": "\xae",
+    "REG;": "\xae",
+    "Racute;": "\u0154",
+    "Rang;": "\u27eb",
+    "Rarr;": "\u21a0",
+    "Rarrtl;": "\u2916",
+    "Rcaron;": "\u0158",
+    "Rcedil;": "\u0156",
+    "Rcy;": "\u0420",
+    "Re;": "\u211c",
+    "ReverseElement;": "\u220b",
+    "ReverseEquilibrium;": "\u21cb",
+    "ReverseUpEquilibrium;": "\u296f",
+    "Rfr;": "\u211c",
+    "Rho;": "\u03a1",
+    "RightAngleBracket;": "\u27e9",
+    "RightArrow;": "\u2192",
+    "RightArrowBar;": "\u21e5",
+    "RightArrowLeftArrow;": "\u21c4",
+    "RightCeiling;": "\u2309",
+    "RightDoubleBracket;": "\u27e7",
+    "RightDownTeeVector;": "\u295d",
+    "RightDownVector;": "\u21c2",
+    "RightDownVectorBar;": "\u2955",
+    "RightFloor;": "\u230b",
+    "RightTee;": "\u22a2",
+    "RightTeeArrow;": "\u21a6",
+    "RightTeeVector;": "\u295b",
+    "RightTriangle;": "\u22b3",
+    "RightTriangleBar;": "\u29d0",
+    "RightTriangleEqual;": "\u22b5",
+    "RightUpDownVector;": "\u294f",
+    "RightUpTeeVector;": "\u295c",
+    "RightUpVector;": "\u21be",
+    "RightUpVectorBar;": "\u2954",
+    "RightVector;": "\u21c0",
+    "RightVectorBar;": "\u2953",
+    "Rightarrow;": "\u21d2",
+    "Ropf;": "\u211d",
+    "RoundImplies;": "\u2970",
+    "Rrightarrow;": "\u21db",
+    "Rscr;": "\u211b",
+    "Rsh;": "\u21b1",
+    "RuleDelayed;": "\u29f4",
+    "SHCHcy;": "\u0429",
+    "SHcy;": "\u0428",
+    "SOFTcy;": "\u042c",
+    "Sacute;": "\u015a",
+    "Sc;": "\u2abc",
+    "Scaron;": "\u0160",
+    "Scedil;": "\u015e",
+    "Scirc;": "\u015c",
+    "Scy;": "\u0421",
+    "Sfr;": "\U0001d516",
+    "ShortDownArrow;": "\u2193",
+    "ShortLeftArrow;": "\u2190",
+    "ShortRightArrow;": "\u2192",
+    "ShortUpArrow;": "\u2191",
+    "Sigma;": "\u03a3",
+    "SmallCircle;": "\u2218",
+    "Sopf;": "\U0001d54a",
+    "Sqrt;": "\u221a",
+    "Square;": "\u25a1",
+    "SquareIntersection;": "\u2293",
+    "SquareSubset;": "\u228f",
+    "SquareSubsetEqual;": "\u2291",
+    "SquareSuperset;": "\u2290",
+    "SquareSupersetEqual;": "\u2292",
+    "SquareUnion;": "\u2294",
+    "Sscr;": "\U0001d4ae",
+    "Star;": "\u22c6",
+    "Sub;": "\u22d0",
+    "Subset;": "\u22d0",
+    "SubsetEqual;": "\u2286",
+    "Succeeds;": "\u227b",
+    "SucceedsEqual;": "\u2ab0",
+    "SucceedsSlantEqual;": "\u227d",
+    "SucceedsTilde;": "\u227f",
+    "SuchThat;": "\u220b",
+    "Sum;": "\u2211",
+    "Sup;": "\u22d1",
+    "Superset;": "\u2283",
+    "SupersetEqual;": "\u2287",
+    "Supset;": "\u22d1",
+    "THORN": "\xde",
+    "THORN;": "\xde",
+    "TRADE;": "\u2122",
+    "TSHcy;": "\u040b",
+    "TScy;": "\u0426",
+    "Tab;": "\t",
+    "Tau;": "\u03a4",
+    "Tcaron;": "\u0164",
+    "Tcedil;": "\u0162",
+    "Tcy;": "\u0422",
+    "Tfr;": "\U0001d517",
+    "Therefore;": "\u2234",
+    "Theta;": "\u0398",
+    "ThickSpace;": "\u205f\u200a",
+    "ThinSpace;": "\u2009",
+    "Tilde;": "\u223c",
+    "TildeEqual;": "\u2243",
+    "TildeFullEqual;": "\u2245",
+    "TildeTilde;": "\u2248",
+    "Topf;": "\U0001d54b",
+    "TripleDot;": "\u20db",
+    "Tscr;": "\U0001d4af",
+    "Tstrok;": "\u0166",
+    "Uacute": "\xda",
+    "Uacute;": "\xda",
+    "Uarr;": "\u219f",
+    "Uarrocir;": "\u2949",
+    "Ubrcy;": "\u040e",
+    "Ubreve;": "\u016c",
+    "Ucirc": "\xdb",
+    "Ucirc;": "\xdb",
+    "Ucy;": "\u0423",
+    "Udblac;": "\u0170",
+    "Ufr;": "\U0001d518",
+    "Ugrave": "\xd9",
+    "Ugrave;": "\xd9",
+    "Umacr;": "\u016a",
+    "UnderBar;": "_",
+    "UnderBrace;": "\u23df",
+    "UnderBracket;": "\u23b5",
+    "UnderParenthesis;": "\u23dd",
+    "Union;": "\u22c3",
+    "UnionPlus;": "\u228e",
+    "Uogon;": "\u0172",
+    "Uopf;": "\U0001d54c",
+    "UpArrow;": "\u2191",
+    "UpArrowBar;": "\u2912",
+    "UpArrowDownArrow;": "\u21c5",
+    "UpDownArrow;": "\u2195",
+    "UpEquilibrium;": "\u296e",
+    "UpTee;": "\u22a5",
+    "UpTeeArrow;": "\u21a5",
+    "Uparrow;": "\u21d1",
+    "Updownarrow;": "\u21d5",
+    "UpperLeftArrow;": "\u2196",
+    "UpperRightArrow;": "\u2197",
+    "Upsi;": "\u03d2",
+    "Upsilon;": "\u03a5",
+    "Uring;": "\u016e",
+    "Uscr;": "\U0001d4b0",
+    "Utilde;": "\u0168",
+    "Uuml": "\xdc",
+    "Uuml;": "\xdc",
+    "VDash;": "\u22ab",
+    "Vbar;": "\u2aeb",
+    "Vcy;": "\u0412",
+    "Vdash;": "\u22a9",
+    "Vdashl;": "\u2ae6",
+    "Vee;": "\u22c1",
+    "Verbar;": "\u2016",
+    "Vert;": "\u2016",
+    "VerticalBar;": "\u2223",
+    "VerticalLine;": "|",
+    "VerticalSeparator;": "\u2758",
+    "VerticalTilde;": "\u2240",
+    "VeryThinSpace;": "\u200a",
+    "Vfr;": "\U0001d519",
+    "Vopf;": "\U0001d54d",
+    "Vscr;": "\U0001d4b1",
+    "Vvdash;": "\u22aa",
+    "Wcirc;": "\u0174",
+    "Wedge;": "\u22c0",
+    "Wfr;": "\U0001d51a",
+    "Wopf;": "\U0001d54e",
+    "Wscr;": "\U0001d4b2",
+    "Xfr;": "\U0001d51b",
+    "Xi;": "\u039e",
+    "Xopf;": "\U0001d54f",
+    "Xscr;": "\U0001d4b3",
+    "YAcy;": "\u042f",
+    "YIcy;": "\u0407",
+    "YUcy;": "\u042e",
+    "Yacute": "\xdd",
+    "Yacute;": "\xdd",
+    "Ycirc;": "\u0176",
+    "Ycy;": "\u042b",
+    "Yfr;": "\U0001d51c",
+    "Yopf;": "\U0001d550",
+    "Yscr;": "\U0001d4b4",
+    "Yuml;": "\u0178",
+    "ZHcy;": "\u0416",
+    "Zacute;": "\u0179",
+    "Zcaron;": "\u017d",
+    "Zcy;": "\u0417",
+    "Zdot;": "\u017b",
+    "ZeroWidthSpace;": "\u200b",
+    "Zeta;": "\u0396",
+    "Zfr;": "\u2128",
+    "Zopf;": "\u2124",
+    "Zscr;": "\U0001d4b5",
+    "aacute": "\xe1",
+    "aacute;": "\xe1",
+    "abreve;": "\u0103",
+    "ac;": "\u223e",
+    "acE;": "\u223e\u0333",
+    "acd;": "\u223f",
+    "acirc": "\xe2",
+    "acirc;": "\xe2",
+    "acute": "\xb4",
+    "acute;": "\xb4",
+    "acy;": "\u0430",
+    "aelig": "\xe6",
+    "aelig;": "\xe6",
+    "af;": "\u2061",
+    "afr;": "\U0001d51e",
+    "agrave": "\xe0",
+    "agrave;": "\xe0",
+    "alefsym;": "\u2135",
+    "aleph;": "\u2135",
+    "alpha;": "\u03b1",
+    "amacr;": "\u0101",
+    "amalg;": "\u2a3f",
+    "amp": "&",
+    "amp;": "&",
+    "and;": "\u2227",
+    "andand;": "\u2a55",
+    "andd;": "\u2a5c",
+    "andslope;": "\u2a58",
+    "andv;": "\u2a5a",
+    "ang;": "\u2220",
+    "ange;": "\u29a4",
+    "angle;": "\u2220",
+    "angmsd;": "\u2221",
+    "angmsdaa;": "\u29a8",
+    "angmsdab;": "\u29a9",
+    "angmsdac;": "\u29aa",
+    "angmsdad;": "\u29ab",
+    "angmsdae;": "\u29ac",
+    "angmsdaf;": "\u29ad",
+    "angmsdag;": "\u29ae",
+    "angmsdah;": "\u29af",
+    "angrt;": "\u221f",
+    "angrtvb;": "\u22be",
+    "angrtvbd;": "\u299d",
+    "angsph;": "\u2222",
+    "angst;": "\xc5",
+    "angzarr;": "\u237c",
+    "aogon;": "\u0105",
+    "aopf;": "\U0001d552",
+    "ap;": "\u2248",
+    "apE;": "\u2a70",
+    "apacir;": "\u2a6f",
+    "ape;": "\u224a",
+    "apid;": "\u224b",
+    "apos;": "'",
+    "approx;": "\u2248",
+    "approxeq;": "\u224a",
+    "aring": "\xe5",
+    "aring;": "\xe5",
+    "ascr;": "\U0001d4b6",
+    "ast;": "*",
+    "asymp;": "\u2248",
+    "asympeq;": "\u224d",
+    "atilde": "\xe3",
+    "atilde;": "\xe3",
+    "auml": "\xe4",
+    "auml;": "\xe4",
+    "awconint;": "\u2233",
+    "awint;": "\u2a11",
+    "bNot;": "\u2aed",
+    "backcong;": "\u224c",
+    "backepsilon;": "\u03f6",
+    "backprime;": "\u2035",
+    "backsim;": "\u223d",
+    "backsimeq;": "\u22cd",
+    "barvee;": "\u22bd",
+    "barwed;": "\u2305",
+    "barwedge;": "\u2305",
+    "bbrk;": "\u23b5",
+    "bbrktbrk;": "\u23b6",
+    "bcong;": "\u224c",
+    "bcy;": "\u0431",
+    "bdquo;": "\u201e",
+    "becaus;": "\u2235",
+    "because;": "\u2235",
+    "bemptyv;": "\u29b0",
+    "bepsi;": "\u03f6",
+    "bernou;": "\u212c",
+    "beta;": "\u03b2",
+    "beth;": "\u2136",
+    "between;": "\u226c",
+    "bfr;": "\U0001d51f",
+    "bigcap;": "\u22c2",
+    "bigcirc;": "\u25ef",
+    "bigcup;": "\u22c3",
+    "bigodot;": "\u2a00",
+    "bigoplus;": "\u2a01",
+    "bigotimes;": "\u2a02",
+    "bigsqcup;": "\u2a06",
+    "bigstar;": "\u2605",
+    "bigtriangledown;": "\u25bd",
+    "bigtriangleup;": "\u25b3",
+    "biguplus;": "\u2a04",
+    "bigvee;": "\u22c1",
+    "bigwedge;": "\u22c0",
+    "bkarow;": "\u290d",
+    "blacklozenge;": "\u29eb",
+    "blacksquare;": "\u25aa",
+    "blacktriangle;": "\u25b4",
+    "blacktriangledown;": "\u25be",
+    "blacktriangleleft;": "\u25c2",
+    "blacktriangleright;": "\u25b8",
+    "blank;": "\u2423",
+    "blk12;": "\u2592",
+    "blk14;": "\u2591",
+    "blk34;": "\u2593",
+    "block;": "\u2588",
+    "bne;": "=\u20e5",
+    "bnequiv;": "\u2261\u20e5",
+    "bnot;": "\u2310",
+    "bopf;": "\U0001d553",
+    "bot;": "\u22a5",
+    "bottom;": "\u22a5",
+    "bowtie;": "\u22c8",
+    "boxDL;": "\u2557",
+    "boxDR;": "\u2554",
+    "boxDl;": "\u2556",
+    "boxDr;": "\u2553",
+    "boxH;": "\u2550",
+    "boxHD;": "\u2566",
+    "boxHU;": "\u2569",
+    "boxHd;": "\u2564",
+    "boxHu;": "\u2567",
+    "boxUL;": "\u255d",
+    "boxUR;": "\u255a",
+    "boxUl;": "\u255c",
+    "boxUr;": "\u2559",
+    "boxV;": "\u2551",
+    "boxVH;": "\u256c",
+    "boxVL;": "\u2563",
+    "boxVR;": "\u2560",
+    "boxVh;": "\u256b",
+    "boxVl;": "\u2562",
+    "boxVr;": "\u255f",
+    "boxbox;": "\u29c9",
+    "boxdL;": "\u2555",
+    "boxdR;": "\u2552",
+    "boxdl;": "\u2510",
+    "boxdr;": "\u250c",
+    "boxh;": "\u2500",
+    "boxhD;": "\u2565",
+    "boxhU;": "\u2568",
+    "boxhd;": "\u252c",
+    "boxhu;": "\u2534",
+    "boxminus;": "\u229f",
+    "boxplus;": "\u229e",
+    "boxtimes;": "\u22a0",
+    "boxuL;": "\u255b",
+    "boxuR;": "\u2558",
+    "boxul;": "\u2518",
+    "boxur;": "\u2514",
+    "boxv;": "\u2502",
+    "boxvH;": "\u256a",
+    "boxvL;": "\u2561",
+    "boxvR;": "\u255e",
+    "boxvh;": "\u253c",
+    "boxvl;": "\u2524",
+    "boxvr;": "\u251c",
+    "bprime;": "\u2035",
+    "breve;": "\u02d8",
+    "brvbar": "\xa6",
+    "brvbar;": "\xa6",
+    "bscr;": "\U0001d4b7",
+    "bsemi;": "\u204f",
+    "bsim;": "\u223d",
+    "bsime;": "\u22cd",
+    "bsol;": "\\",
+    "bsolb;": "\u29c5",
+    "bsolhsub;": "\u27c8",
+    "bull;": "\u2022",
+    "bullet;": "\u2022",
+    "bump;": "\u224e",
+    "bumpE;": "\u2aae",
+    "bumpe;": "\u224f",
+    "bumpeq;": "\u224f",
+    "cacute;": "\u0107",
+    "cap;": "\u2229",
+    "capand;": "\u2a44",
+    "capbrcup;": "\u2a49",
+    "capcap;": "\u2a4b",
+    "capcup;": "\u2a47",
+    "capdot;": "\u2a40",
+    "caps;": "\u2229\ufe00",
+    "caret;": "\u2041",
+    "caron;": "\u02c7",
+    "ccaps;": "\u2a4d",
+    "ccaron;": "\u010d",
+    "ccedil": "\xe7",
+    "ccedil;": "\xe7",
+    "ccirc;": "\u0109",
+    "ccups;": "\u2a4c",
+    "ccupssm;": "\u2a50",
+    "cdot;": "\u010b",
+    "cedil": "\xb8",
+    "cedil;": "\xb8",
+    "cemptyv;": "\u29b2",
+    "cent": "\xa2",
+    "cent;": "\xa2",
+    "centerdot;": "\xb7",
+    "cfr;": "\U0001d520",
+    "chcy;": "\u0447",
+    "check;": "\u2713",
+    "checkmark;": "\u2713",
+    "chi;": "\u03c7",
+    "cir;": "\u25cb",
+    "cirE;": "\u29c3",
+    "circ;": "\u02c6",
+    "circeq;": "\u2257",
+    "circlearrowleft;": "\u21ba",
+    "circlearrowright;": "\u21bb",
+    "circledR;": "\xae",
+    "circledS;": "\u24c8",
+    "circledast;": "\u229b",
+    "circledcirc;": "\u229a",
+    "circleddash;": "\u229d",
+    "cire;": "\u2257",
+    "cirfnint;": "\u2a10",
+    "cirmid;": "\u2aef",
+    "cirscir;": "\u29c2",
+    "clubs;": "\u2663",
+    "clubsuit;": "\u2663",
+    "colon;": ":",
+    "colone;": "\u2254",
+    "coloneq;": "\u2254",
+    "comma;": ",",
+    "commat;": "@",
+    "comp;": "\u2201",
+    "compfn;": "\u2218",
+    "complement;": "\u2201",
+    "complexes;": "\u2102",
+    "cong;": "\u2245",
+    "congdot;": "\u2a6d",
+    "conint;": "\u222e",
+    "copf;": "\U0001d554",
+    "coprod;": "\u2210",
+    "copy": "\xa9",
+    "copy;": "\xa9",
+    "copysr;": "\u2117",
+    "crarr;": "\u21b5",
+    "cross;": "\u2717",
+    "cscr;": "\U0001d4b8",
+    "csub;": "\u2acf",
+    "csube;": "\u2ad1",
+    "csup;": "\u2ad0",
+    "csupe;": "\u2ad2",
+    "ctdot;": "\u22ef",
+    "cudarrl;": "\u2938",
+    "cudarrr;": "\u2935",
+    "cuepr;": "\u22de",
+    "cuesc;": "\u22df",
+    "cularr;": "\u21b6",
+    "cularrp;": "\u293d",
+    "cup;": "\u222a",
+    "cupbrcap;": "\u2a48",
+    "cupcap;": "\u2a46",
+    "cupcup;": "\u2a4a",
+    "cupdot;": "\u228d",
+    "cupor;": "\u2a45",
+    "cups;": "\u222a\ufe00",
+    "curarr;": "\u21b7",
+    "curarrm;": "\u293c",
+    "curlyeqprec;": "\u22de",
+    "curlyeqsucc;": "\u22df",
+    "curlyvee;": "\u22ce",
+    "curlywedge;": "\u22cf",
+    "curren": "\xa4",
+    "curren;": "\xa4",
+    "curvearrowleft;": "\u21b6",
+    "curvearrowright;": "\u21b7",
+    "cuvee;": "\u22ce",
+    "cuwed;": "\u22cf",
+    "cwconint;": "\u2232",
+    "cwint;": "\u2231",
+    "cylcty;": "\u232d",
+    "dArr;": "\u21d3",
+    "dHar;": "\u2965",
+    "dagger;": "\u2020",
+    "daleth;": "\u2138",
+    "darr;": "\u2193",
+    "dash;": "\u2010",
+    "dashv;": "\u22a3",
+    "dbkarow;": "\u290f",
+    "dblac;": "\u02dd",
+    "dcaron;": "\u010f",
+    "dcy;": "\u0434",
+    "dd;": "\u2146",
+    "ddagger;": "\u2021",
+    "ddarr;": "\u21ca",
+    "ddotseq;": "\u2a77",
+    "deg": "\xb0",
+    "deg;": "\xb0",
+    "delta;": "\u03b4",
+    "demptyv;": "\u29b1",
+    "dfisht;": "\u297f",
+    "dfr;": "\U0001d521",
+    "dharl;": "\u21c3",
+    "dharr;": "\u21c2",
+    "diam;": "\u22c4",
+    "diamond;": "\u22c4",
+    "diamondsuit;": "\u2666",
+    "diams;": "\u2666",
+    "die;": "\xa8",
+    "digamma;": "\u03dd",
+    "disin;": "\u22f2",
+    "div;": "\xf7",
+    "divide": "\xf7",
+    "divide;": "\xf7",
+    "divideontimes;": "\u22c7",
+    "divonx;": "\u22c7",
+    "djcy;": "\u0452",
+    "dlcorn;": "\u231e",
+    "dlcrop;": "\u230d",
+    "dollar;": "$",
+    "dopf;": "\U0001d555",
+    "dot;": "\u02d9",
+    "doteq;": "\u2250",
+    "doteqdot;": "\u2251",
+    "dotminus;": "\u2238",
+    "dotplus;": "\u2214",
+    "dotsquare;": "\u22a1",
+    "doublebarwedge;": "\u2306",
+    "downarrow;": "\u2193",
+    "downdownarrows;": "\u21ca",
+    "downharpoonleft;": "\u21c3",
+    "downharpoonright;": "\u21c2",
+    "drbkarow;": "\u2910",
+    "drcorn;": "\u231f",
+    "drcrop;": "\u230c",
+    "dscr;": "\U0001d4b9",
+    "dscy;": "\u0455",
+    "dsol;": "\u29f6",
+    "dstrok;": "\u0111",
+    "dtdot;": "\u22f1",
+    "dtri;": "\u25bf",
+    "dtrif;": "\u25be",
+    "duarr;": "\u21f5",
+    "duhar;": "\u296f",
+    "dwangle;": "\u29a6",
+    "dzcy;": "\u045f",
+    "dzigrarr;": "\u27ff",
+    "eDDot;": "\u2a77",
+    "eDot;": "\u2251",
+    "eacute": "\xe9",
+    "eacute;": "\xe9",
+    "easter;": "\u2a6e",
+    "ecaron;": "\u011b",
+    "ecir;": "\u2256",
+    "ecirc": "\xea",
+    "ecirc;": "\xea",
+    "ecolon;": "\u2255",
+    "ecy;": "\u044d",
+    "edot;": "\u0117",
+    "ee;": "\u2147",
+    "efDot;": "\u2252",
+    "efr;": "\U0001d522",
+    "eg;": "\u2a9a",
+    "egrave": "\xe8",
+    "egrave;": "\xe8",
+    "egs;": "\u2a96",
+    "egsdot;": "\u2a98",
+    "el;": "\u2a99",
+    "elinters;": "\u23e7",
+    "ell;": "\u2113",
+    "els;": "\u2a95",
+    "elsdot;": "\u2a97",
+    "emacr;": "\u0113",
+    "empty;": "\u2205",
+    "emptyset;": "\u2205",
+    "emptyv;": "\u2205",
+    "emsp13;": "\u2004",
+    "emsp14;": "\u2005",
+    "emsp;": "\u2003",
+    "eng;": "\u014b",
+    "ensp;": "\u2002",
+    "eogon;": "\u0119",
+    "eopf;": "\U0001d556",
+    "epar;": "\u22d5",
+    "eparsl;": "\u29e3",
+    "eplus;": "\u2a71",
+    "epsi;": "\u03b5",
+    "epsilon;": "\u03b5",
+    "epsiv;": "\u03f5",
+    "eqcirc;": "\u2256",
+    "eqcolon;": "\u2255",
+    "eqsim;": "\u2242",
+    "eqslantgtr;": "\u2a96",
+    "eqslantless;": "\u2a95",
+    "equals;": "=",
+    "equest;": "\u225f",
+    "equiv;": "\u2261",
+    "equivDD;": "\u2a78",
+    "eqvparsl;": "\u29e5",
+    "erDot;": "\u2253",
+    "erarr;": "\u2971",
+    "escr;": "\u212f",
+    "esdot;": "\u2250",
+    "esim;": "\u2242",
+    "eta;": "\u03b7",
+    "eth": "\xf0",
+    "eth;": "\xf0",
+    "euml": "\xeb",
+    "euml;": "\xeb",
+    "euro;": "\u20ac",
+    "excl;": "!",
+    "exist;": "\u2203",
+    "expectation;": "\u2130",
+    "exponentiale;": "\u2147",
+    "fallingdotseq;": "\u2252",
+    "fcy;": "\u0444",
+    "female;": "\u2640",
+    "ffilig;": "\ufb03",
+    "fflig;": "\ufb00",
+    "ffllig;": "\ufb04",
+    "ffr;": "\U0001d523",
+    "filig;": "\ufb01",
+    "fjlig;": "fj",
+    "flat;": "\u266d",
+    "fllig;": "\ufb02",
+    "fltns;": "\u25b1",
+    "fnof;": "\u0192",
+    "fopf;": "\U0001d557",
+    "forall;": "\u2200",
+    "fork;": "\u22d4",
+    "forkv;": "\u2ad9",
+    "fpartint;": "\u2a0d",
+    "frac12": "\xbd",
+    "frac12;": "\xbd",
+    "frac13;": "\u2153",
+    "frac14": "\xbc",
+    "frac14;": "\xbc",
+    "frac15;": "\u2155",
+    "frac16;": "\u2159",
+    "frac18;": "\u215b",
+    "frac23;": "\u2154",
+    "frac25;": "\u2156",
+    "frac34": "\xbe",
+    "frac34;": "\xbe",
+    "frac35;": "\u2157",
+    "frac38;": "\u215c",
+    "frac45;": "\u2158",
+    "frac56;": "\u215a",
+    "frac58;": "\u215d",
+    "frac78;": "\u215e",
+    "frasl;": "\u2044",
+    "frown;": "\u2322",
+    "fscr;": "\U0001d4bb",
+    "gE;": "\u2267",
+    "gEl;": "\u2a8c",
+    "gacute;": "\u01f5",
+    "gamma;": "\u03b3",
+    "gammad;": "\u03dd",
+    "gap;": "\u2a86",
+    "gbreve;": "\u011f",
+    "gcirc;": "\u011d",
+    "gcy;": "\u0433",
+    "gdot;": "\u0121",
+    "ge;": "\u2265",
+    "gel;": "\u22db",
+    "geq;": "\u2265",
+    "geqq;": "\u2267",
+    "geqslant;": "\u2a7e",
+    "ges;": "\u2a7e",
+    "gescc;": "\u2aa9",
+    "gesdot;": "\u2a80",
+    "gesdoto;": "\u2a82",
+    "gesdotol;": "\u2a84",
+    "gesl;": "\u22db\ufe00",
+    "gesles;": "\u2a94",
+    "gfr;": "\U0001d524",
+    "gg;": "\u226b",
+    "ggg;": "\u22d9",
+    "gimel;": "\u2137",
+    "gjcy;": "\u0453",
+    "gl;": "\u2277",
+    "glE;": "\u2a92",
+    "gla;": "\u2aa5",
+    "glj;": "\u2aa4",
+    "gnE;": "\u2269",
+    "gnap;": "\u2a8a",
+    "gnapprox;": "\u2a8a",
+    "gne;": "\u2a88",
+    "gneq;": "\u2a88",
+    "gneqq;": "\u2269",
+    "gnsim;": "\u22e7",
+    "gopf;": "\U0001d558",
+    "grave;": "`",
+    "gscr;": "\u210a",
+    "gsim;": "\u2273",
+    "gsime;": "\u2a8e",
+    "gsiml;": "\u2a90",
+    "gt": ">",
+    "gt;": ">",
+    "gtcc;": "\u2aa7",
+    "gtcir;": "\u2a7a",
+    "gtdot;": "\u22d7",
+    "gtlPar;": "\u2995",
+    "gtquest;": "\u2a7c",
+    "gtrapprox;": "\u2a86",
+    "gtrarr;": "\u2978",
+    "gtrdot;": "\u22d7",
+    "gtreqless;": "\u22db",
+    "gtreqqless;": "\u2a8c",
+    "gtrless;": "\u2277",
+    "gtrsim;": "\u2273",
+    "gvertneqq;": "\u2269\ufe00",
+    "gvnE;": "\u2269\ufe00",
+    "hArr;": "\u21d4",
+    "hairsp;": "\u200a",
+    "half;": "\xbd",
+    "hamilt;": "\u210b",
+    "hardcy;": "\u044a",
+    "harr;": "\u2194",
+    "harrcir;": "\u2948",
+    "harrw;": "\u21ad",
+    "hbar;": "\u210f",
+    "hcirc;": "\u0125",
+    "hearts;": "\u2665",
+    "heartsuit;": "\u2665",
+    "hellip;": "\u2026",
+    "hercon;": "\u22b9",
+    "hfr;": "\U0001d525",
+    "hksearow;": "\u2925",
+    "hkswarow;": "\u2926",
+    "hoarr;": "\u21ff",
+    "homtht;": "\u223b",
+    "hookleftarrow;": "\u21a9",
+    "hookrightarrow;": "\u21aa",
+    "hopf;": "\U0001d559",
+    "horbar;": "\u2015",
+    "hscr;": "\U0001d4bd",
+    "hslash;": "\u210f",
+    "hstrok;": "\u0127",
+    "hybull;": "\u2043",
+    "hyphen;": "\u2010",
+    "iacute": "\xed",
+    "iacute;": "\xed",
+    "ic;": "\u2063",
+    "icirc": "\xee",
+    "icirc;": "\xee",
+    "icy;": "\u0438",
+    "iecy;": "\u0435",
+    "iexcl": "\xa1",
+    "iexcl;": "\xa1",
+    "iff;": "\u21d4",
+    "ifr;": "\U0001d526",
+    "igrave": "\xec",
+    "igrave;": "\xec",
+    "ii;": "\u2148",
+    "iiiint;": "\u2a0c",
+    "iiint;": "\u222d",
+    "iinfin;": "\u29dc",
+    "iiota;": "\u2129",
+    "ijlig;": "\u0133",
+    "imacr;": "\u012b",
+    "image;": "\u2111",
+    "imagline;": "\u2110",
+    "imagpart;": "\u2111",
+    "imath;": "\u0131",
+    "imof;": "\u22b7",
+    "imped;": "\u01b5",
+    "in;": "\u2208",
+    "incare;": "\u2105",
+    "infin;": "\u221e",
+    "infintie;": "\u29dd",
+    "inodot;": "\u0131",
+    "int;": "\u222b",
+    "intcal;": "\u22ba",
+    "integers;": "\u2124",
+    "intercal;": "\u22ba",
+    "intlarhk;": "\u2a17",
+    "intprod;": "\u2a3c",
+    "iocy;": "\u0451",
+    "iogon;": "\u012f",
+    "iopf;": "\U0001d55a",
+    "iota;": "\u03b9",
+    "iprod;": "\u2a3c",
+    "iquest": "\xbf",
+    "iquest;": "\xbf",
+    "iscr;": "\U0001d4be",
+    "isin;": "\u2208",
+    "isinE;": "\u22f9",
+    "isindot;": "\u22f5",
+    "isins;": "\u22f4",
+    "isinsv;": "\u22f3",
+    "isinv;": "\u2208",
+    "it;": "\u2062",
+    "itilde;": "\u0129",
+    "iukcy;": "\u0456",
+    "iuml": "\xef",
+    "iuml;": "\xef",
+    "jcirc;": "\u0135",
+    "jcy;": "\u0439",
+    "jfr;": "\U0001d527",
+    "jmath;": "\u0237",
+    "jopf;": "\U0001d55b",
+    "jscr;": "\U0001d4bf",
+    "jsercy;": "\u0458",
+    "jukcy;": "\u0454",
+    "kappa;": "\u03ba",
+    "kappav;": "\u03f0",
+    "kcedil;": "\u0137",
+    "kcy;": "\u043a",
+    "kfr;": "\U0001d528",
+    "kgreen;": "\u0138",
+    "khcy;": "\u0445",
+    "kjcy;": "\u045c",
+    "kopf;": "\U0001d55c",
+    "kscr;": "\U0001d4c0",
+    "lAarr;": "\u21da",
+    "lArr;": "\u21d0",
+    "lAtail;": "\u291b",
+    "lBarr;": "\u290e",
+    "lE;": "\u2266",
+    "lEg;": "\u2a8b",
+    "lHar;": "\u2962",
+    "lacute;": "\u013a",
+    "laemptyv;": "\u29b4",
+    "lagran;": "\u2112",
+    "lambda;": "\u03bb",
+    "lang;": "\u27e8",
+    "langd;": "\u2991",
+    "langle;": "\u27e8",
+    "lap;": "\u2a85",
+    "laquo": "\xab",
+    "laquo;": "\xab",
+    "larr;": "\u2190",
+    "larrb;": "\u21e4",
+    "larrbfs;": "\u291f",
+    "larrfs;": "\u291d",
+    "larrhk;": "\u21a9",
+    "larrlp;": "\u21ab",
+    "larrpl;": "\u2939",
+    "larrsim;": "\u2973",
+    "larrtl;": "\u21a2",
+    "lat;": "\u2aab",
+    "latail;": "\u2919",
+    "late;": "\u2aad",
+    "lates;": "\u2aad\ufe00",
+    "lbarr;": "\u290c",
+    "lbbrk;": "\u2772",
+    "lbrace;": "{",
+    "lbrack;": "[",
+    "lbrke;": "\u298b",
+    "lbrksld;": "\u298f",
+    "lbrkslu;": "\u298d",
+    "lcaron;": "\u013e",
+    "lcedil;": "\u013c",
+    "lceil;": "\u2308",
+    "lcub;": "{",
+    "lcy;": "\u043b",
+    "ldca;": "\u2936",
+    "ldquo;": "\u201c",
+    "ldquor;": "\u201e",
+    "ldrdhar;": "\u2967",
+    "ldrushar;": "\u294b",
+    "ldsh;": "\u21b2",
+    "le;": "\u2264",
+    "leftarrow;": "\u2190",
+    "leftarrowtail;": "\u21a2",
+    "leftharpoondown;": "\u21bd",
+    "leftharpoonup;": "\u21bc",
+    "leftleftarrows;": "\u21c7",
+    "leftrightarrow;": "\u2194",
+    "leftrightarrows;": "\u21c6",
+    "leftrightharpoons;": "\u21cb",
+    "leftrightsquigarrow;": "\u21ad",
+    "leftthreetimes;": "\u22cb",
+    "leg;": "\u22da",
+    "leq;": "\u2264",
+    "leqq;": "\u2266",
+    "leqslant;": "\u2a7d",
+    "les;": "\u2a7d",
+    "lescc;": "\u2aa8",
+    "lesdot;": "\u2a7f",
+    "lesdoto;": "\u2a81",
+    "lesdotor;": "\u2a83",
+    "lesg;": "\u22da\ufe00",
+    "lesges;": "\u2a93",
+    "lessapprox;": "\u2a85",
+    "lessdot;": "\u22d6",
+    "lesseqgtr;": "\u22da",
+    "lesseqqgtr;": "\u2a8b",
+    "lessgtr;": "\u2276",
+    "lesssim;": "\u2272",
+    "lfisht;": "\u297c",
+    "lfloor;": "\u230a",
+    "lfr;": "\U0001d529",
+    "lg;": "\u2276",
+    "lgE;": "\u2a91",
+    "lhard;": "\u21bd",
+    "lharu;": "\u21bc",
+    "lharul;": "\u296a",
+    "lhblk;": "\u2584",
+    "ljcy;": "\u0459",
+    "ll;": "\u226a",
+    "llarr;": "\u21c7",
+    "llcorner;": "\u231e",
+    "llhard;": "\u296b",
+    "lltri;": "\u25fa",
+    "lmidot;": "\u0140",
+    "lmoust;": "\u23b0",
+    "lmoustache;": "\u23b0",
+    "lnE;": "\u2268",
+    "lnap;": "\u2a89",
+    "lnapprox;": "\u2a89",
+    "lne;": "\u2a87",
+    "lneq;": "\u2a87",
+    "lneqq;": "\u2268",
+    "lnsim;": "\u22e6",
+    "loang;": "\u27ec",
+    "loarr;": "\u21fd",
+    "lobrk;": "\u27e6",
+    "longleftarrow;": "\u27f5",
+    "longleftrightarrow;": "\u27f7",
+    "longmapsto;": "\u27fc",
+    "longrightarrow;": "\u27f6",
+    "looparrowleft;": "\u21ab",
+    "looparrowright;": "\u21ac",
+    "lopar;": "\u2985",
+    "lopf;": "\U0001d55d",
+    "loplus;": "\u2a2d",
+    "lotimes;": "\u2a34",
+    "lowast;": "\u2217",
+    "lowbar;": "_",
+    "loz;": "\u25ca",
+    "lozenge;": "\u25ca",
+    "lozf;": "\u29eb",
+    "lpar;": "(",
+    "lparlt;": "\u2993",
+    "lrarr;": "\u21c6",
+    "lrcorner;": "\u231f",
+    "lrhar;": "\u21cb",
+    "lrhard;": "\u296d",
+    "lrm;": "\u200e",
+    "lrtri;": "\u22bf",
+    "lsaquo;": "\u2039",
+    "lscr;": "\U0001d4c1",
+    "lsh;": "\u21b0",
+    "lsim;": "\u2272",
+    "lsime;": "\u2a8d",
+    "lsimg;": "\u2a8f",
+    "lsqb;": "[",
+    "lsquo;": "\u2018",
+    "lsquor;": "\u201a",
+    "lstrok;": "\u0142",
+    "lt": "<",
+    "lt;": "<",
+    "ltcc;": "\u2aa6",
+    "ltcir;": "\u2a79",
+    "ltdot;": "\u22d6",
+    "lthree;": "\u22cb",
+    "ltimes;": "\u22c9",
+    "ltlarr;": "\u2976",
+    "ltquest;": "\u2a7b",
+    "ltrPar;": "\u2996",
+    "ltri;": "\u25c3",
+    "ltrie;": "\u22b4",
+    "ltrif;": "\u25c2",
+    "lurdshar;": "\u294a",
+    "luruhar;": "\u2966",
+    "lvertneqq;": "\u2268\ufe00",
+    "lvnE;": "\u2268\ufe00",
+    "mDDot;": "\u223a",
+    "macr": "\xaf",
+    "macr;": "\xaf",
+    "male;": "\u2642",
+    "malt;": "\u2720",
+    "maltese;": "\u2720",
+    "map;": "\u21a6",
+    "mapsto;": "\u21a6",
+    "mapstodown;": "\u21a7",
+    "mapstoleft;": "\u21a4",
+    "mapstoup;": "\u21a5",
+    "marker;": "\u25ae",
+    "mcomma;": "\u2a29",
+    "mcy;": "\u043c",
+    "mdash;": "\u2014",
+    "measuredangle;": "\u2221",
+    "mfr;": "\U0001d52a",
+    "mho;": "\u2127",
+    "micro": "\xb5",
+    "micro;": "\xb5",
+    "mid;": "\u2223",
+    "midast;": "*",
+    "midcir;": "\u2af0",
+    "middot": "\xb7",
+    "middot;": "\xb7",
+    "minus;": "\u2212",
+    "minusb;": "\u229f",
+    "minusd;": "\u2238",
+    "minusdu;": "\u2a2a",
+    "mlcp;": "\u2adb",
+    "mldr;": "\u2026",
+    "mnplus;": "\u2213",
+    "models;": "\u22a7",
+    "mopf;": "\U0001d55e",
+    "mp;": "\u2213",
+    "mscr;": "\U0001d4c2",
+    "mstpos;": "\u223e",
+    "mu;": "\u03bc",
+    "multimap;": "\u22b8",
+    "mumap;": "\u22b8",
+    "nGg;": "\u22d9\u0338",
+    "nGt;": "\u226b\u20d2",
+    "nGtv;": "\u226b\u0338",
+    "nLeftarrow;": "\u21cd",
+    "nLeftrightarrow;": "\u21ce",
+    "nLl;": "\u22d8\u0338",
+    "nLt;": "\u226a\u20d2",
+    "nLtv;": "\u226a\u0338",
+    "nRightarrow;": "\u21cf",
+    "nVDash;": "\u22af",
+    "nVdash;": "\u22ae",
+    "nabla;": "\u2207",
+    "nacute;": "\u0144",
+    "nang;": "\u2220\u20d2",
+    "nap;": "\u2249",
+    "napE;": "\u2a70\u0338",
+    "napid;": "\u224b\u0338",
+    "napos;": "\u0149",
+    "napprox;": "\u2249",
+    "natur;": "\u266e",
+    "natural;": "\u266e",
+    "naturals;": "\u2115",
+    "nbsp": "\xa0",
+    "nbsp;": "\xa0",
+    "nbump;": "\u224e\u0338",
+    "nbumpe;": "\u224f\u0338",
+    "ncap;": "\u2a43",
+    "ncaron;": "\u0148",
+    "ncedil;": "\u0146",
+    "ncong;": "\u2247",
+    "ncongdot;": "\u2a6d\u0338",
+    "ncup;": "\u2a42",
+    "ncy;": "\u043d",
+    "ndash;": "\u2013",
+    "ne;": "\u2260",
+    "neArr;": "\u21d7",
+    "nearhk;": "\u2924",
+    "nearr;": "\u2197",
+    "nearrow;": "\u2197",
+    "nedot;": "\u2250\u0338",
+    "nequiv;": "\u2262",
+    "nesear;": "\u2928",
+    "nesim;": "\u2242\u0338",
+    "nexist;": "\u2204",
+    "nexists;": "\u2204",
+    "nfr;": "\U0001d52b",
+    "ngE;": "\u2267\u0338",
+    "nge;": "\u2271",
+    "ngeq;": "\u2271",
+    "ngeqq;": "\u2267\u0338",
+    "ngeqslant;": "\u2a7e\u0338",
+    "nges;": "\u2a7e\u0338",
+    "ngsim;": "\u2275",
+    "ngt;": "\u226f",
+    "ngtr;": "\u226f",
+    "nhArr;": "\u21ce",
+    "nharr;": "\u21ae",
+    "nhpar;": "\u2af2",
+    "ni;": "\u220b",
+    "nis;": "\u22fc",
+    "nisd;": "\u22fa",
+    "niv;": "\u220b",
+    "njcy;": "\u045a",
+    "nlArr;": "\u21cd",
+    "nlE;": "\u2266\u0338",
+    "nlarr;": "\u219a",
+    "nldr;": "\u2025",
+    "nle;": "\u2270",
+    "nleftarrow;": "\u219a",
+    "nleftrightarrow;": "\u21ae",
+    "nleq;": "\u2270",
+    "nleqq;": "\u2266\u0338",
+    "nleqslant;": "\u2a7d\u0338",
+    "nles;": "\u2a7d\u0338",
+    "nless;": "\u226e",
+    "nlsim;": "\u2274",
+    "nlt;": "\u226e",
+    "nltri;": "\u22ea",
+    "nltrie;": "\u22ec",
+    "nmid;": "\u2224",
+    "nopf;": "\U0001d55f",
+    "not": "\xac",
+    "not;": "\xac",
+    "notin;": "\u2209",
+    "notinE;": "\u22f9\u0338",
+    "notindot;": "\u22f5\u0338",
+    "notinva;": "\u2209",
+    "notinvb;": "\u22f7",
+    "notinvc;": "\u22f6",
+    "notni;": "\u220c",
+    "notniva;": "\u220c",
+    "notnivb;": "\u22fe",
+    "notnivc;": "\u22fd",
+    "npar;": "\u2226",
+    "nparallel;": "\u2226",
+    "nparsl;": "\u2afd\u20e5",
+    "npart;": "\u2202\u0338",
+    "npolint;": "\u2a14",
+    "npr;": "\u2280",
+    "nprcue;": "\u22e0",
+    "npre;": "\u2aaf\u0338",
+    "nprec;": "\u2280",
+    "npreceq;": "\u2aaf\u0338",
+    "nrArr;": "\u21cf",
+    "nrarr;": "\u219b",
+    "nrarrc;": "\u2933\u0338",
+    "nrarrw;": "\u219d\u0338",
+    "nrightarrow;": "\u219b",
+    "nrtri;": "\u22eb",
+    "nrtrie;": "\u22ed",
+    "nsc;": "\u2281",
+    "nsccue;": "\u22e1",
+    "nsce;": "\u2ab0\u0338",
+    "nscr;": "\U0001d4c3",
+    "nshortmid;": "\u2224",
+    "nshortparallel;": "\u2226",
+    "nsim;": "\u2241",
+    "nsime;": "\u2244",
+    "nsimeq;": "\u2244",
+    "nsmid;": "\u2224",
+    "nspar;": "\u2226",
+    "nsqsube;": "\u22e2",
+    "nsqsupe;": "\u22e3",
+    "nsub;": "\u2284",
+    "nsubE;": "\u2ac5\u0338",
+    "nsube;": "\u2288",
+    "nsubset;": "\u2282\u20d2",
+    "nsubseteq;": "\u2288",
+    "nsubseteqq;": "\u2ac5\u0338",
+    "nsucc;": "\u2281",
+    "nsucceq;": "\u2ab0\u0338",
+    "nsup;": "\u2285",
+    "nsupE;": "\u2ac6\u0338",
+    "nsupe;": "\u2289",
+    "nsupset;": "\u2283\u20d2",
+    "nsupseteq;": "\u2289",
+    "nsupseteqq;": "\u2ac6\u0338",
+    "ntgl;": "\u2279",
+    "ntilde": "\xf1",
+    "ntilde;": "\xf1",
+    "ntlg;": "\u2278",
+    "ntriangleleft;": "\u22ea",
+    "ntrianglelefteq;": "\u22ec",
+    "ntriangleright;": "\u22eb",
+    "ntrianglerighteq;": "\u22ed",
+    "nu;": "\u03bd",
+    "num;": "#",
+    "numero;": "\u2116",
+    "numsp;": "\u2007",
+    "nvDash;": "\u22ad",
+    "nvHarr;": "\u2904",
+    "nvap;": "\u224d\u20d2",
+    "nvdash;": "\u22ac",
+    "nvge;": "\u2265\u20d2",
+    "nvgt;": ">\u20d2",
+    "nvinfin;": "\u29de",
+    "nvlArr;": "\u2902",
+    "nvle;": "\u2264\u20d2",
+    "nvlt;": "<\u20d2",
+    "nvltrie;": "\u22b4\u20d2",
+    "nvrArr;": "\u2903",
+    "nvrtrie;": "\u22b5\u20d2",
+    "nvsim;": "\u223c\u20d2",
+    "nwArr;": "\u21d6",
+    "nwarhk;": "\u2923",
+    "nwarr;": "\u2196",
+    "nwarrow;": "\u2196",
+    "nwnear;": "\u2927",
+    "oS;": "\u24c8",
+    "oacute": "\xf3",
+    "oacute;": "\xf3",
+    "oast;": "\u229b",
+    "ocir;": "\u229a",
+    "ocirc": "\xf4",
+    "ocirc;": "\xf4",
+    "ocy;": "\u043e",
+    "odash;": "\u229d",
+    "odblac;": "\u0151",
+    "odiv;": "\u2a38",
+    "odot;": "\u2299",
+    "odsold;": "\u29bc",
+    "oelig;": "\u0153",
+    "ofcir;": "\u29bf",
+    "ofr;": "\U0001d52c",
+    "ogon;": "\u02db",
+    "ograve": "\xf2",
+    "ograve;": "\xf2",
+    "ogt;": "\u29c1",
+    "ohbar;": "\u29b5",
+    "ohm;": "\u03a9",
+    "oint;": "\u222e",
+    "olarr;": "\u21ba",
+    "olcir;": "\u29be",
+    "olcross;": "\u29bb",
+    "oline;": "\u203e",
+    "olt;": "\u29c0",
+    "omacr;": "\u014d",
+    "omega;": "\u03c9",
+    "omicron;": "\u03bf",
+    "omid;": "\u29b6",
+    "ominus;": "\u2296",
+    "oopf;": "\U0001d560",
+    "opar;": "\u29b7",
+    "operp;": "\u29b9",
+    "oplus;": "\u2295",
+    "or;": "\u2228",
+    "orarr;": "\u21bb",
+    "ord;": "\u2a5d",
+    "order;": "\u2134",
+    "orderof;": "\u2134",
+    "ordf": "\xaa",
+    "ordf;": "\xaa",
+    "ordm": "\xba",
+    "ordm;": "\xba",
+    "origof;": "\u22b6",
+    "oror;": "\u2a56",
+    "orslope;": "\u2a57",
+    "orv;": "\u2a5b",
+    "oscr;": "\u2134",
+    "oslash": "\xf8",
+    "oslash;": "\xf8",
+    "osol;": "\u2298",
+    "otilde": "\xf5",
+    "otilde;": "\xf5",
+    "otimes;": "\u2297",
+    "otimesas;": "\u2a36",
+    "ouml": "\xf6",
+    "ouml;": "\xf6",
+    "ovbar;": "\u233d",
+    "par;": "\u2225",
+    "para": "\xb6",
+    "para;": "\xb6",
+    "parallel;": "\u2225",
+    "parsim;": "\u2af3",
+    "parsl;": "\u2afd",
+    "part;": "\u2202",
+    "pcy;": "\u043f",
+    "percnt;": "%",
+    "period;": ".",
+    "permil;": "\u2030",
+    "perp;": "\u22a5",
+    "pertenk;": "\u2031",
+    "pfr;": "\U0001d52d",
+    "phi;": "\u03c6",
+    "phiv;": "\u03d5",
+    "phmmat;": "\u2133",
+    "phone;": "\u260e",
+    "pi;": "\u03c0",
+    "pitchfork;": "\u22d4",
+    "piv;": "\u03d6",
+    "planck;": "\u210f",
+    "planckh;": "\u210e",
+    "plankv;": "\u210f",
+    "plus;": "+",
+    "plusacir;": "\u2a23",
+    "plusb;": "\u229e",
+    "pluscir;": "\u2a22",
+    "plusdo;": "\u2214",
+    "plusdu;": "\u2a25",
+    "pluse;": "\u2a72",
+    "plusmn": "\xb1",
+    "plusmn;": "\xb1",
+    "plussim;": "\u2a26",
+    "plustwo;": "\u2a27",
+    "pm;": "\xb1",
+    "pointint;": "\u2a15",
+    "popf;": "\U0001d561",
+    "pound": "\xa3",
+    "pound;": "\xa3",
+    "pr;": "\u227a",
+    "prE;": "\u2ab3",
+    "prap;": "\u2ab7",
+    "prcue;": "\u227c",
+    "pre;": "\u2aaf",
+    "prec;": "\u227a",
+    "precapprox;": "\u2ab7",
+    "preccurlyeq;": "\u227c",
+    "preceq;": "\u2aaf",
+    "precnapprox;": "\u2ab9",
+    "precneqq;": "\u2ab5",
+    "precnsim;": "\u22e8",
+    "precsim;": "\u227e",
+    "prime;": "\u2032",
+    "primes;": "\u2119",
+    "prnE;": "\u2ab5",
+    "prnap;": "\u2ab9",
+    "prnsim;": "\u22e8",
+    "prod;": "\u220f",
+    "profalar;": "\u232e",
+    "profline;": "\u2312",
+    "profsurf;": "\u2313",
+    "prop;": "\u221d",
+    "propto;": "\u221d",
+    "prsim;": "\u227e",
+    "prurel;": "\u22b0",
+    "pscr;": "\U0001d4c5",
+    "psi;": "\u03c8",
+    "puncsp;": "\u2008",
+    "qfr;": "\U0001d52e",
+    "qint;": "\u2a0c",
+    "qopf;": "\U0001d562",
+    "qprime;": "\u2057",
+    "qscr;": "\U0001d4c6",
+    "quaternions;": "\u210d",
+    "quatint;": "\u2a16",
+    "quest;": "?",
+    "questeq;": "\u225f",
+    "quot": "\"",
+    "quot;": "\"",
+    "rAarr;": "\u21db",
+    "rArr;": "\u21d2",
+    "rAtail;": "\u291c",
+    "rBarr;": "\u290f",
+    "rHar;": "\u2964",
+    "race;": "\u223d\u0331",
+    "racute;": "\u0155",
+    "radic;": "\u221a",
+    "raemptyv;": "\u29b3",
+    "rang;": "\u27e9",
+    "rangd;": "\u2992",
+    "range;": "\u29a5",
+    "rangle;": "\u27e9",
+    "raquo": "\xbb",
+    "raquo;": "\xbb",
+    "rarr;": "\u2192",
+    "rarrap;": "\u2975",
+    "rarrb;": "\u21e5",
+    "rarrbfs;": "\u2920",
+    "rarrc;": "\u2933",
+    "rarrfs;": "\u291e",
+    "rarrhk;": "\u21aa",
+    "rarrlp;": "\u21ac",
+    "rarrpl;": "\u2945",
+    "rarrsim;": "\u2974",
+    "rarrtl;": "\u21a3",
+    "rarrw;": "\u219d",
+    "ratail;": "\u291a",
+    "ratio;": "\u2236",
+    "rationals;": "\u211a",
+    "rbarr;": "\u290d",
+    "rbbrk;": "\u2773",
+    "rbrace;": "}",
+    "rbrack;": "]",
+    "rbrke;": "\u298c",
+    "rbrksld;": "\u298e",
+    "rbrkslu;": "\u2990",
+    "rcaron;": "\u0159",
+    "rcedil;": "\u0157",
+    "rceil;": "\u2309",
+    "rcub;": "}",
+    "rcy;": "\u0440",
+    "rdca;": "\u2937",
+    "rdldhar;": "\u2969",
+    "rdquo;": "\u201d",
+    "rdquor;": "\u201d",
+    "rdsh;": "\u21b3",
+    "real;": "\u211c",
+    "realine;": "\u211b",
+    "realpart;": "\u211c",
+    "reals;": "\u211d",
+    "rect;": "\u25ad",
+    "reg": "\xae",
+    "reg;": "\xae",
+    "rfisht;": "\u297d",
+    "rfloor;": "\u230b",
+    "rfr;": "\U0001d52f",
+    "rhard;": "\u21c1",
+    "rharu;": "\u21c0",
+    "rharul;": "\u296c",
+    "rho;": "\u03c1",
+    "rhov;": "\u03f1",
+    "rightarrow;": "\u2192",
+    "rightarrowtail;": "\u21a3",
+    "rightharpoondown;": "\u21c1",
+    "rightharpoonup;": "\u21c0",
+    "rightleftarrows;": "\u21c4",
+    "rightleftharpoons;": "\u21cc",
+    "rightrightarrows;": "\u21c9",
+    "rightsquigarrow;": "\u219d",
+    "rightthreetimes;": "\u22cc",
+    "ring;": "\u02da",
+    "risingdotseq;": "\u2253",
+    "rlarr;": "\u21c4",
+    "rlhar;": "\u21cc",
+    "rlm;": "\u200f",
+    "rmoust;": "\u23b1",
+    "rmoustache;": "\u23b1",
+    "rnmid;": "\u2aee",
+    "roang;": "\u27ed",
+    "roarr;": "\u21fe",
+    "robrk;": "\u27e7",
+    "ropar;": "\u2986",
+    "ropf;": "\U0001d563",
+    "roplus;": "\u2a2e",
+    "rotimes;": "\u2a35",
+    "rpar;": ")",
+    "rpargt;": "\u2994",
+    "rppolint;": "\u2a12",
+    "rrarr;": "\u21c9",
+    "rsaquo;": "\u203a",
+    "rscr;": "\U0001d4c7",
+    "rsh;": "\u21b1",
+    "rsqb;": "]",
+    "rsquo;": "\u2019",
+    "rsquor;": "\u2019",
+    "rthree;": "\u22cc",
+    "rtimes;": "\u22ca",
+    "rtri;": "\u25b9",
+    "rtrie;": "\u22b5",
+    "rtrif;": "\u25b8",
+    "rtriltri;": "\u29ce",
+    "ruluhar;": "\u2968",
+    "rx;": "\u211e",
+    "sacute;": "\u015b",
+    "sbquo;": "\u201a",
+    "sc;": "\u227b",
+    "scE;": "\u2ab4",
+    "scap;": "\u2ab8",
+    "scaron;": "\u0161",
+    "sccue;": "\u227d",
+    "sce;": "\u2ab0",
+    "scedil;": "\u015f",
+    "scirc;": "\u015d",
+    "scnE;": "\u2ab6",
+    "scnap;": "\u2aba",
+    "scnsim;": "\u22e9",
+    "scpolint;": "\u2a13",
+    "scsim;": "\u227f",
+    "scy;": "\u0441",
+    "sdot;": "\u22c5",
+    "sdotb;": "\u22a1",
+    "sdote;": "\u2a66",
+    "seArr;": "\u21d8",
+    "searhk;": "\u2925",
+    "searr;": "\u2198",
+    "searrow;": "\u2198",
+    "sect": "\xa7",
+    "sect;": "\xa7",
+    "semi;": ";",
+    "seswar;": "\u2929",
+    "setminus;": "\u2216",
+    "setmn;": "\u2216",
+    "sext;": "\u2736",
+    "sfr;": "\U0001d530",
+    "sfrown;": "\u2322",
+    "sharp;": "\u266f",
+    "shchcy;": "\u0449",
+    "shcy;": "\u0448",
+    "shortmid;": "\u2223",
+    "shortparallel;": "\u2225",
+    "shy": "\xad",
+    "shy;": "\xad",
+    "sigma;": "\u03c3",
+    "sigmaf;": "\u03c2",
+    "sigmav;": "\u03c2",
+    "sim;": "\u223c",
+    "simdot;": "\u2a6a",
+    "sime;": "\u2243",
+    "simeq;": "\u2243",
+    "simg;": "\u2a9e",
+    "simgE;": "\u2aa0",
+    "siml;": "\u2a9d",
+    "simlE;": "\u2a9f",
+    "simne;": "\u2246",
+    "simplus;": "\u2a24",
+    "simrarr;": "\u2972",
+    "slarr;": "\u2190",
+    "smallsetminus;": "\u2216",
+    "smashp;": "\u2a33",
+    "smeparsl;": "\u29e4",
+    "smid;": "\u2223",
+    "smile;": "\u2323",
+    "smt;": "\u2aaa",
+    "smte;": "\u2aac",
+    "smtes;": "\u2aac\ufe00",
+    "softcy;": "\u044c",
+    "sol;": "/",
+    "solb;": "\u29c4",
+    "solbar;": "\u233f",
+    "sopf;": "\U0001d564",
+    "spades;": "\u2660",
+    "spadesuit;": "\u2660",
+    "spar;": "\u2225",
+    "sqcap;": "\u2293",
+    "sqcaps;": "\u2293\ufe00",
+    "sqcup;": "\u2294",
+    "sqcups;": "\u2294\ufe00",
+    "sqsub;": "\u228f",
+    "sqsube;": "\u2291",
+    "sqsubset;": "\u228f",
+    "sqsubseteq;": "\u2291",
+    "sqsup;": "\u2290",
+    "sqsupe;": "\u2292",
+    "sqsupset;": "\u2290",
+    "sqsupseteq;": "\u2292",
+    "squ;": "\u25a1",
+    "square;": "\u25a1",
+    "squarf;": "\u25aa",
+    "squf;": "\u25aa",
+    "srarr;": "\u2192",
+    "sscr;": "\U0001d4c8",
+    "ssetmn;": "\u2216",
+    "ssmile;": "\u2323",
+    "sstarf;": "\u22c6",
+    "star;": "\u2606",
+    "starf;": "\u2605",
+    "straightepsilon;": "\u03f5",
+    "straightphi;": "\u03d5",
+    "strns;": "\xaf",
+    "sub;": "\u2282",
+    "subE;": "\u2ac5",
+    "subdot;": "\u2abd",
+    "sube;": "\u2286",
+    "subedot;": "\u2ac3",
+    "submult;": "\u2ac1",
+    "subnE;": "\u2acb",
+    "subne;": "\u228a",
+    "subplus;": "\u2abf",
+    "subrarr;": "\u2979",
+    "subset;": "\u2282",
+    "subseteq;": "\u2286",
+    "subseteqq;": "\u2ac5",
+    "subsetneq;": "\u228a",
+    "subsetneqq;": "\u2acb",
+    "subsim;": "\u2ac7",
+    "subsub;": "\u2ad5",
+    "subsup;": "\u2ad3",
+    "succ;": "\u227b",
+    "succapprox;": "\u2ab8",
+    "succcurlyeq;": "\u227d",
+    "succeq;": "\u2ab0",
+    "succnapprox;": "\u2aba",
+    "succneqq;": "\u2ab6",
+    "succnsim;": "\u22e9",
+    "succsim;": "\u227f",
+    "sum;": "\u2211",
+    "sung;": "\u266a",
+    "sup1": "\xb9",
+    "sup1;": "\xb9",
+    "sup2": "\xb2",
+    "sup2;": "\xb2",
+    "sup3": "\xb3",
+    "sup3;": "\xb3",
+    "sup;": "\u2283",
+    "supE;": "\u2ac6",
+    "supdot;": "\u2abe",
+    "supdsub;": "\u2ad8",
+    "supe;": "\u2287",
+    "supedot;": "\u2ac4",
+    "suphsol;": "\u27c9",
+    "suphsub;": "\u2ad7",
+    "suplarr;": "\u297b",
+    "supmult;": "\u2ac2",
+    "supnE;": "\u2acc",
+    "supne;": "\u228b",
+    "supplus;": "\u2ac0",
+    "supset;": "\u2283",
+    "supseteq;": "\u2287",
+    "supseteqq;": "\u2ac6",
+    "supsetneq;": "\u228b",
+    "supsetneqq;": "\u2acc",
+    "supsim;": "\u2ac8",
+    "supsub;": "\u2ad4",
+    "supsup;": "\u2ad6",
+    "swArr;": "\u21d9",
+    "swarhk;": "\u2926",
+    "swarr;": "\u2199",
+    "swarrow;": "\u2199",
+    "swnwar;": "\u292a",
+    "szlig": "\xdf",
+    "szlig;": "\xdf",
+    "target;": "\u2316",
+    "tau;": "\u03c4",
+    "tbrk;": "\u23b4",
+    "tcaron;": "\u0165",
+    "tcedil;": "\u0163",
+    "tcy;": "\u0442",
+    "tdot;": "\u20db",
+    "telrec;": "\u2315",
+    "tfr;": "\U0001d531",
+    "there4;": "\u2234",
+    "therefore;": "\u2234",
+    "theta;": "\u03b8",
+    "thetasym;": "\u03d1",
+    "thetav;": "\u03d1",
+    "thickapprox;": "\u2248",
+    "thicksim;": "\u223c",
+    "thinsp;": "\u2009",
+    "thkap;": "\u2248",
+    "thksim;": "\u223c",
+    "thorn": "\xfe",
+    "thorn;": "\xfe",
+    "tilde;": "\u02dc",
+    "times": "\xd7",
+    "times;": "\xd7",
+    "timesb;": "\u22a0",
+    "timesbar;": "\u2a31",
+    "timesd;": "\u2a30",
+    "tint;": "\u222d",
+    "toea;": "\u2928",
+    "top;": "\u22a4",
+    "topbot;": "\u2336",
+    "topcir;": "\u2af1",
+    "topf;": "\U0001d565",
+    "topfork;": "\u2ada",
+    "tosa;": "\u2929",
+    "tprime;": "\u2034",
+    "trade;": "\u2122",
+    "triangle;": "\u25b5",
+    "triangledown;": "\u25bf",
+    "triangleleft;": "\u25c3",
+    "trianglelefteq;": "\u22b4",
+    "triangleq;": "\u225c",
+    "triangleright;": "\u25b9",
+    "trianglerighteq;": "\u22b5",
+    "tridot;": "\u25ec",
+    "trie;": "\u225c",
+    "triminus;": "\u2a3a",
+    "triplus;": "\u2a39",
+    "trisb;": "\u29cd",
+    "tritime;": "\u2a3b",
+    "trpezium;": "\u23e2",
+    "tscr;": "\U0001d4c9",
+    "tscy;": "\u0446",
+    "tshcy;": "\u045b",
+    "tstrok;": "\u0167",
+    "twixt;": "\u226c",
+    "twoheadleftarrow;": "\u219e",
+    "twoheadrightarrow;": "\u21a0",
+    "uArr;": "\u21d1",
+    "uHar;": "\u2963",
+    "uacute": "\xfa",
+    "uacute;": "\xfa",
+    "uarr;": "\u2191",
+    "ubrcy;": "\u045e",
+    "ubreve;": "\u016d",
+    "ucirc": "\xfb",
+    "ucirc;": "\xfb",
+    "ucy;": "\u0443",
+    "udarr;": "\u21c5",
+    "udblac;": "\u0171",
+    "udhar;": "\u296e",
+    "ufisht;": "\u297e",
+    "ufr;": "\U0001d532",
+    "ugrave": "\xf9",
+    "ugrave;": "\xf9",
+    "uharl;": "\u21bf",
+    "uharr;": "\u21be",
+    "uhblk;": "\u2580",
+    "ulcorn;": "\u231c",
+    "ulcorner;": "\u231c",
+    "ulcrop;": "\u230f",
+    "ultri;": "\u25f8",
+    "umacr;": "\u016b",
+    "uml": "\xa8",
+    "uml;": "\xa8",
+    "uogon;": "\u0173",
+    "uopf;": "\U0001d566",
+    "uparrow;": "\u2191",
+    "updownarrow;": "\u2195",
+    "upharpoonleft;": "\u21bf",
+    "upharpoonright;": "\u21be",
+    "uplus;": "\u228e",
+    "upsi;": "\u03c5",
+    "upsih;": "\u03d2",
+    "upsilon;": "\u03c5",
+    "upuparrows;": "\u21c8",
+    "urcorn;": "\u231d",
+    "urcorner;": "\u231d",
+    "urcrop;": "\u230e",
+    "uring;": "\u016f",
+    "urtri;": "\u25f9",
+    "uscr;": "\U0001d4ca",
+    "utdot;": "\u22f0",
+    "utilde;": "\u0169",
+    "utri;": "\u25b5",
+    "utrif;": "\u25b4",
+    "uuarr;": "\u21c8",
+    "uuml": "\xfc",
+    "uuml;": "\xfc",
+    "uwangle;": "\u29a7",
+    "vArr;": "\u21d5",
+    "vBar;": "\u2ae8",
+    "vBarv;": "\u2ae9",
+    "vDash;": "\u22a8",
+    "vangrt;": "\u299c",
+    "varepsilon;": "\u03f5",
+    "varkappa;": "\u03f0",
+    "varnothing;": "\u2205",
+    "varphi;": "\u03d5",
+    "varpi;": "\u03d6",
+    "varpropto;": "\u221d",
+    "varr;": "\u2195",
+    "varrho;": "\u03f1",
+    "varsigma;": "\u03c2",
+    "varsubsetneq;": "\u228a\ufe00",
+    "varsubsetneqq;": "\u2acb\ufe00",
+    "varsupsetneq;": "\u228b\ufe00",
+    "varsupsetneqq;": "\u2acc\ufe00",
+    "vartheta;": "\u03d1",
+    "vartriangleleft;": "\u22b2",
+    "vartriangleright;": "\u22b3",
+    "vcy;": "\u0432",
+    "vdash;": "\u22a2",
+    "vee;": "\u2228",
+    "veebar;": "\u22bb",
+    "veeeq;": "\u225a",
+    "vellip;": "\u22ee",
+    "verbar;": "|",
+    "vert;": "|",
+    "vfr;": "\U0001d533",
+    "vltri;": "\u22b2",
+    "vnsub;": "\u2282\u20d2",
+    "vnsup;": "\u2283\u20d2",
+    "vopf;": "\U0001d567",
+    "vprop;": "\u221d",
+    "vrtri;": "\u22b3",
+    "vscr;": "\U0001d4cb",
+    "vsubnE;": "\u2acb\ufe00",
+    "vsubne;": "\u228a\ufe00",
+    "vsupnE;": "\u2acc\ufe00",
+    "vsupne;": "\u228b\ufe00",
+    "vzigzag;": "\u299a",
+    "wcirc;": "\u0175",
+    "wedbar;": "\u2a5f",
+    "wedge;": "\u2227",
+    "wedgeq;": "\u2259",
+    "weierp;": "\u2118",
+    "wfr;": "\U0001d534",
+    "wopf;": "\U0001d568",
+    "wp;": "\u2118",
+    "wr;": "\u2240",
+    "wreath;": "\u2240",
+    "wscr;": "\U0001d4cc",
+    "xcap;": "\u22c2",
+    "xcirc;": "\u25ef",
+    "xcup;": "\u22c3",
+    "xdtri;": "\u25bd",
+    "xfr;": "\U0001d535",
+    "xhArr;": "\u27fa",
+    "xharr;": "\u27f7",
+    "xi;": "\u03be",
+    "xlArr;": "\u27f8",
+    "xlarr;": "\u27f5",
+    "xmap;": "\u27fc",
+    "xnis;": "\u22fb",
+    "xodot;": "\u2a00",
+    "xopf;": "\U0001d569",
+    "xoplus;": "\u2a01",
+    "xotime;": "\u2a02",
+    "xrArr;": "\u27f9",
+    "xrarr;": "\u27f6",
+    "xscr;": "\U0001d4cd",
+    "xsqcup;": "\u2a06",
+    "xuplus;": "\u2a04",
+    "xutri;": "\u25b3",
+    "xvee;": "\u22c1",
+    "xwedge;": "\u22c0",
+    "yacute": "\xfd",
+    "yacute;": "\xfd",
+    "yacy;": "\u044f",
+    "ycirc;": "\u0177",
+    "ycy;": "\u044b",
+    "yen": "\xa5",
+    "yen;": "\xa5",
+    "yfr;": "\U0001d536",
+    "yicy;": "\u0457",
+    "yopf;": "\U0001d56a",
+    "yscr;": "\U0001d4ce",
+    "yucy;": "\u044e",
+    "yuml": "\xff",
+    "yuml;": "\xff",
+    "zacute;": "\u017a",
+    "zcaron;": "\u017e",
+    "zcy;": "\u0437",
+    "zdot;": "\u017c",
+    "zeetrf;": "\u2128",
+    "zeta;": "\u03b6",
+    "zfr;": "\U0001d537",
+    "zhcy;": "\u0436",
+    "zigrarr;": "\u21dd",
+    "zopf;": "\U0001d56b",
+    "zscr;": "\U0001d4cf",
+    "zwj;": "\u200d",
+    "zwnj;": "\u200c",
+}
+
+replacementCharacters = {
+    0x0: "\uFFFD",
+    0x0d: "\u000D",
+    0x80: "\u20AC",
+    0x81: "\u0081",
+    0x82: "\u201A",
+    0x83: "\u0192",
+    0x84: "\u201E",
+    0x85: "\u2026",
+    0x86: "\u2020",
+    0x87: "\u2021",
+    0x88: "\u02C6",
+    0x89: "\u2030",
+    0x8A: "\u0160",
+    0x8B: "\u2039",
+    0x8C: "\u0152",
+    0x8D: "\u008D",
+    0x8E: "\u017D",
+    0x8F: "\u008F",
+    0x90: "\u0090",
+    0x91: "\u2018",
+    0x92: "\u2019",
+    0x93: "\u201C",
+    0x94: "\u201D",
+    0x95: "\u2022",
+    0x96: "\u2013",
+    0x97: "\u2014",
+    0x98: "\u02DC",
+    0x99: "\u2122",
+    0x9A: "\u0161",
+    0x9B: "\u203A",
+    0x9C: "\u0153",
+    0x9D: "\u009D",
+    0x9E: "\u017E",
+    0x9F: "\u0178",
+}
+
+tokenTypes = {
+    "Doctype": 0,
+    "Characters": 1,
+    "SpaceCharacters": 2,
+    "StartTag": 3,
+    "EndTag": 4,
+    "EmptyTag": 5,
+    "Comment": 6,
+    "ParseError": 7
+}
+
+tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"],
+                           tokenTypes["EmptyTag"]])
+
+
+prefixes = dict([(v, k) for k, v in namespaces.items()])
+prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
+
+
+class DataLossWarning(UserWarning):
+    pass
+
+
+class ReparseException(Exception):
+    pass
diff --git a/html_cleaner/html5lib/filters/__init__.py b/html_cleaner/html5lib/filters/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/html_cleaner/html5lib/filters/alphabeticalattributes.py b/html_cleaner/html5lib/filters/alphabeticalattributes.py
new file mode 100644
index 0000000..4795bae
--- /dev/null
+++ b/html_cleaner/html5lib/filters/alphabeticalattributes.py
@@ -0,0 +1,20 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
+
+class Filter(base.Filter):
+    def __iter__(self):
+        for token in base.Filter.__iter__(self):
+            if token["type"] in ("StartTag", "EmptyTag"):
+                attrs = OrderedDict()
+                for name, value in sorted(token["data"].items(),
+                                          key=lambda x: x[0]):
+                    attrs[name] = value
+                token["data"] = attrs
+            yield token
diff --git a/html_cleaner/html5lib/filters/base.py b/html_cleaner/html5lib/filters/base.py
new file mode 100644
index 0000000..c7dbaed
--- /dev/null
+++ b/html_cleaner/html5lib/filters/base.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import, division, unicode_literals
+
+
+class Filter(object):
+    def __init__(self, source):
+        self.source = source
+
+    def __iter__(self):
+        return iter(self.source)
+
+    def __getattr__(self, name):
+        return getattr(self.source, name)
diff --git a/html_cleaner/html5lib/filters/inject_meta_charset.py b/html_cleaner/html5lib/filters/inject_meta_charset.py
new file mode 100644
index 0000000..2059ec8
--- /dev/null
+++ b/html_cleaner/html5lib/filters/inject_meta_charset.py
@@ -0,0 +1,65 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+
+class Filter(base.Filter):
+    def __init__(self, source, encoding):
+        base.Filter.__init__(self, source)
+        self.encoding = encoding
+
+    def __iter__(self):
+        state = "pre_head"
+        meta_found = (self.encoding is None)
+        pending = []
+
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type == "StartTag":
+                if token["name"].lower() == "head":
+                    state = "in_head"
+
+            elif type == "EmptyTag":
+                if token["name"].lower() == "meta":
+                    # replace charset with actual encoding
+                    has_http_equiv_content_type = False
+                    for (namespace, name), value in token["data"].items():
+                        if namespace is not None:
+                            continue
+                        elif name.lower() == 'charset':
+                            token["data"][(namespace, name)] = self.encoding
+                            meta_found = True
+                            break
+                        elif name == 'http-equiv' and value.lower() == 'content-type':
+                            has_http_equiv_content_type = True
+                    else:
+                        if has_http_equiv_content_type and (None, "content") in token["data"]:
+                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
+                            meta_found = True
+
+                elif token["name"].lower() == "head" and not meta_found:
+                    # insert meta into empty head
+                    yield {"type": "StartTag", "name": "head",
+                           "data": token["data"]}
+                    yield {"type": "EmptyTag", "name": "meta",
+                           "data": {(None, "charset"): self.encoding}}
+                    yield {"type": "EndTag", "name": "head"}
+                    meta_found = True
+                    continue
+
+            elif type == "EndTag":
+                if token["name"].lower() == "head" and pending:
+                    # insert meta into head (if necessary) and flush pending queue
+                    yield pending.pop(0)
+                    if not meta_found:
+                        yield {"type": "EmptyTag", "name": "meta",
+                               "data": {(None, "charset"): self.encoding}}
+                    while pending:
+                        yield pending.pop(0)
+                    meta_found = True
+                    state = "post_head"
+
+            if state == "in_head":
+                pending.append(token)
+            else:
+                yield token
diff --git a/html_cleaner/html5lib/filters/lint.py b/html_cleaner/html5lib/filters/lint.py
new file mode 100644
index 0000000..a9c0831
--- /dev/null
+++ b/html_cleaner/html5lib/filters/lint.py
@@ -0,0 +1,81 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from six import text_type
+
+from . import base
+from ..constants import namespaces, voidElements
+
+from ..constants import spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+
+class Filter(base.Filter):
+    def __init__(self, source, require_matching_tags=True):
+        super(Filter, self).__init__(source)
+        self.require_matching_tags = require_matching_tags
+
+    def __iter__(self):
+        open_elements = []
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type in ("StartTag", "EmptyTag"):
+                namespace = token["namespace"]
+                name = token["name"]
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                assert isinstance(token["data"], dict)
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert type == "EmptyTag"
+                else:
+                    assert type == "StartTag"
+                if type == "StartTag" and self.require_matching_tags:
+                    open_elements.append((namespace, name))
+                for (namespace, name), value in token["data"].items():
+                    assert namespace is None or isinstance(namespace, text_type)
+                    assert namespace != ""
+                    assert isinstance(name, text_type)
+                    assert name != ""
+                    assert isinstance(value, text_type)
+
+            elif type == "EndTag":
+                namespace = token["namespace"]
+                name = token["name"]
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
+                elif self.require_matching_tags:
+                    start = open_elements.pop()
+                    assert start == (namespace, name)
+
+            elif type == "Comment":
+                data = token["data"]
+                assert isinstance(data, text_type)
+
+            elif type in ("Characters", "SpaceCharacters"):
+                data = token["data"]
+                assert isinstance(data, text_type)
+                assert data != ""
+                if type == "SpaceCharacters":
+                    assert data.strip(spaceCharacters) == ""
+
+            elif type == "Doctype":
+                name = token["name"]
+                assert name is None or isinstance(name, text_type)
+                assert token["publicId"] is None or isinstance(name, text_type)
+                assert token["systemId"] is None or isinstance(name, text_type)
+
+            elif type == "Entity":
+                assert isinstance(token["name"], text_type)
+
+            elif type == "SerializerError":
+                assert isinstance(token["data"], text_type)
+
+            else:
+                assert False, "Unknown token type: %(type)s" % {"type": type}
+
+            yield token
diff --git a/html_cleaner/html5lib/filters/optionaltags.py b/html_cleaner/html5lib/filters/optionaltags.py
new file mode 100644
index 0000000..f6edb73
--- /dev/null
+++ b/html_cleaner/html5lib/filters/optionaltags.py
@@ -0,0 +1,206 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import base
+
+
+class Filter(base.Filter):
+    def slider(self):
+        previous1 = previous2 = None
+        for token in self.source:
+            if previous1 is not None:
+                yield previous2, previous1, token
+            previous2 = previous1
+            previous1 = token
+        if previous1 is not None:
+            yield previous2, previous1, None
+
+    def __iter__(self):
+        for previous, token, next in self.slider():
+            type = token["type"]
+            if type == "StartTag":
+                if (token["data"] or
+                        not self.is_optional_start(token["name"], previous, next)):
+                    yield token
+            elif type == "EndTag":
+                if not self.is_optional_end(token["name"], next):
+                    yield token
+            else:
+                yield token
+
+    def is_optional_start(self, tagname, previous, next):
+        type = next and next["type"] or None
+        if tagname in 'html':
+            # An html element's start tag may be omitted if the first thing
+            # inside the html element is not a space character or a comment.
+            return type not in ("Comment", "SpaceCharacters")
+        elif tagname == 'head':
+            # A head element's start tag may be omitted if the first thing
+            # inside the head element is an element.
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
+        elif tagname == 'body':
+            # A body element's start tag may be omitted if the first thing
+            # inside the body element is not a space character or a comment,
+            # except if the first thing inside the body element is a script
+            # or style element and the node immediately preceding the body
+            # element is a head element whose end tag has been omitted.
+            if type in ("Comment", "SpaceCharacters"):
+                return False
+            elif type == "StartTag":
+                # XXX: we do not look at the preceding event, so we never omit
+                # the body element's start tag if it's followed by a script or
+                # a style element.
+                return next["name"] not in ('script', 'style')
+            else:
+                return True
+        elif tagname == 'colgroup':
+            # A colgroup element's start tag may be omitted if the first thing
+            # inside the colgroup element is a col element, and if the element
+            # is not immediately preceded by another colgroup element whose
+            # end tag has been omitted.
+            if type in ("StartTag", "EmptyTag"):
+                # XXX: we do not look at the preceding event, so instead we never
+                # omit the colgroup element's end tag when it is immediately
+                # followed by another colgroup element. See is_optional_end.
+                return next["name"] == "col"
+            else:
+                return False
+        elif tagname == 'tbody':
+            # A tbody element's start tag may be omitted if the first thing
+            # inside the tbody element is a tr element, and if the element is
+            # not immediately preceded by a tbody, thead, or tfoot element
+            # whose end tag has been omitted.
+            if type == "StartTag":
+                # omit the thead and tfoot elements' end tag when they are
+                # immediately followed by a tbody element. See is_optional_end.
+                if previous and previous['type'] == 'EndTag' and \
+                        previous['name'] in ('tbody', 'thead', 'tfoot'):
+                    return False
+                return next["name"] == 'tr'
+            else:
+                return False
+        return False
+
+    def is_optional_end(self, tagname, next):
+        type = next and next["type"] or None
+        if tagname in ('html', 'head', 'body'):
+            # An html element's end tag may be omitted if the html element
+            # is not immediately followed by a space character or a comment.
+            return type not in ("Comment", "SpaceCharacters")
+        elif tagname in ('li', 'optgroup', 'tr'):
+            # A li element's end tag may be omitted if the li element is
+            # immediately followed by another li element or if there is
+            # no more content in the parent element.
+            # An optgroup element's end tag may be omitted if the optgroup
+            # element is immediately followed by another optgroup element,
+            # or if there is no more content in the parent element.
+            # A tr element's end tag may be omitted if the tr element is
+            # immediately followed by another tr element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] == tagname
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('dt', 'dd'):
+            # A dt element's end tag may be omitted if the dt element is
+            # immediately followed by another dt element or a dd element.
+            # A dd element's end tag may be omitted if the dd element is
+            # immediately followed by another dd element or a dt element,
+            # or if there is no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('dt', 'dd')
+            elif tagname == 'dd':
+                return type == "EndTag" or type is None
+            else:
+                return False
+        elif tagname == 'p':
+            # A p element's end tag may be omitted if the p element is
+            # immediately followed by an address, article, aside,
+            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
+            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
+            # nav, ol, p, pre, section, table, or ul, element, or if
+            # there is no more content in the parent element.
+            if type in ("StartTag", "EmptyTag"):
+                return next["name"] in ('address', 'article', 'aside',
+                                        'blockquote', 'datagrid', 'dialog',
+                                        'dir', 'div', 'dl', 'fieldset', 'footer',
+                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                        'header', 'hr', 'menu', 'nav', 'ol',
+                                        'p', 'pre', 'section', 'table', 'ul')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname == 'option':
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if it is immediately followed by an <code>optgroup</code>
+            # element, or if there is no more content in the parent
+            # element.
+            if type == "StartTag":
+                return next["name"] in ('option', 'optgroup')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('rt', 'rp'):
+            # An rt element's end tag may be omitted if the rt element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            # An rp element's end tag may be omitted if the rp element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('rt', 'rp')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname == 'colgroup':
+            # A colgroup element's end tag may be omitted if the colgroup
+            # element is not immediately followed by a space character or
+            # a comment.
+            if type in ("Comment", "SpaceCharacters"):
+                return False
+            elif type == "StartTag":
+                # XXX: we also look for an immediately following colgroup
+                # element. See is_optional_start.
+                return next["name"] != 'colgroup'
+            else:
+                return True
+        elif tagname in ('thead', 'tbody'):
+            # A thead element's end tag may be omitted if the thead element
+            # is immediately followed by a tbody or tfoot element.
+            # A tbody element's end tag may be omitted if the tbody element
+            # is immediately followed by a tbody or tfoot element, or if
+            # there is no more content in the parent element.
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # XXX: we never omit the end tag when the following element is
+            # a tbody. See is_optional_start.
+            if type == "StartTag":
+                return next["name"] in ['tbody', 'tfoot']
+            elif tagname == 'tbody':
+                return type == "EndTag" or type is None
+            else:
+                return False
+        elif tagname == 'tfoot':
+            # A tfoot element's end tag may be omitted if the tfoot element
+            # is immediately followed by a tbody element, or if there is no
+            # more content in the parent element.
+            # XXX: we never omit the end tag when the following element is
+            # a tbody. See is_optional_start.
+            if type == "StartTag":
+                return next["name"] == 'tbody'
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('td', 'th'):
+            # A td element's end tag may be omitted if the td element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            # A th element's end tag may be omitted if the th element is
+            # immediately followed by a td or th element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('td', 'th')
+            else:
+                return type == "EndTag" or type is None
+        return False
diff --git a/html_cleaner/html5lib/filters/sanitizer.py b/html_cleaner/html5lib/filters/sanitizer.py
new file mode 100644
index 0000000..b5ddcb9
--- /dev/null
+++ b/html_cleaner/html5lib/filters/sanitizer.py
@@ -0,0 +1,865 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+from xml.sax.saxutils import escape, unescape
+
+from six.moves import urllib_parse as urlparse
+
+from . import base
+from ..constants import namespaces, prefixes
+
+__all__ = ["Filter"]
+
+
+allowed_elements = frozenset((
+    (namespaces['html'], 'a'),
+    (namespaces['html'], 'abbr'),
+    (namespaces['html'], 'acronym'),
+    (namespaces['html'], 'address'),
+    (namespaces['html'], 'area'),
+    (namespaces['html'], 'article'),
+    (namespaces['html'], 'aside'),
+    (namespaces['html'], 'audio'),
+    (namespaces['html'], 'b'),
+    (namespaces['html'], 'big'),
+    (namespaces['html'], 'blockquote'),
+    (namespaces['html'], 'br'),
+    (namespaces['html'], 'button'),
+    (namespaces['html'], 'canvas'),
+    (namespaces['html'], 'caption'),
+    (namespaces['html'], 'center'),
+    (namespaces['html'], 'cite'),
+    (namespaces['html'], 'code'),
+    (namespaces['html'], 'col'),
+    (namespaces['html'], 'colgroup'),
+    (namespaces['html'], 'command'),
+    (namespaces['html'], 'datagrid'),
+    (namespaces['html'], 'datalist'),
+    (namespaces['html'], 'dd'),
+    (namespaces['html'], 'del'),
+    (namespaces['html'], 'details'),
+    (namespaces['html'], 'dfn'),
+    (namespaces['html'], 'dialog'),
+    (namespaces['html'], 'dir'),
+    (namespaces['html'], 'div'),
+    (namespaces['html'], 'dl'),
+    (namespaces['html'], 'dt'),
+    (namespaces['html'], 'em'),
+    (namespaces['html'], 'event-source'),
+    (namespaces['html'], 'fieldset'),
+    (namespaces['html'], 'figcaption'),
+    (namespaces['html'], 'figure'),
+    (namespaces['html'], 'footer'),
+    (namespaces['html'], 'font'),
+    (namespaces['html'], 'form'),
+    (namespaces['html'], 'header'),
+    (namespaces['html'], 'h1'),
+    (namespaces['html'], 'h2'),
+    (namespaces['html'], 'h3'),
+    (namespaces['html'], 'h4'),
+    (namespaces['html'], 'h5'),
+    (namespaces['html'], 'h6'),
+    (namespaces['html'], 'hr'),
+    (namespaces['html'], 'i'),
+    (namespaces['html'], 'img'),
+    (namespaces['html'], 'input'),
+    (namespaces['html'], 'ins'),
+    (namespaces['html'], 'keygen'),
+    (namespaces['html'], 'kbd'),
+    (namespaces['html'], 'label'),
+    (namespaces['html'], 'legend'),
+    (namespaces['html'], 'li'),
+    (namespaces['html'], 'm'),
+    (namespaces['html'], 'map'),
+    (namespaces['html'], 'menu'),
+    (namespaces['html'], 'meter'),
+    (namespaces['html'], 'multicol'),
+    (namespaces['html'], 'nav'),
+    (namespaces['html'], 'nextid'),
+    (namespaces['html'], 'ol'),
+    (namespaces['html'], 'output'),
+    (namespaces['html'], 'optgroup'),
+    (namespaces['html'], 'option'),
+    (namespaces['html'], 'p'),
+    (namespaces['html'], 'pre'),
+    (namespaces['html'], 'progress'),
+    (namespaces['html'], 'q'),
+    (namespaces['html'], 's'),
+    (namespaces['html'], 'samp'),
+    (namespaces['html'], 'section'),
+    (namespaces['html'], 'select'),
+    (namespaces['html'], 'small'),
+    (namespaces['html'], 'sound'),
+    (namespaces['html'], 'source'),
+    (namespaces['html'], 'spacer'),
+    (namespaces['html'], 'span'),
+    (namespaces['html'], 'strike'),
+    (namespaces['html'], 'strong'),
+    (namespaces['html'], 'sub'),
+    (namespaces['html'], 'sup'),
+    (namespaces['html'], 'table'),
+    (namespaces['html'], 'tbody'),
+    (namespaces['html'], 'td'),
+    (namespaces['html'], 'textarea'),
+    (namespaces['html'], 'time'),
+    (namespaces['html'], 'tfoot'),
+    (namespaces['html'], 'th'),
+    (namespaces['html'], 'thead'),
+    (namespaces['html'], 'tr'),
+    (namespaces['html'], 'tt'),
+    (namespaces['html'], 'u'),
+    (namespaces['html'], 'ul'),
+    (namespaces['html'], 'var'),
+    (namespaces['html'], 'video'),
+    (namespaces['mathml'], 'maction'),
+    (namespaces['mathml'], 'math'),
+    (namespaces['mathml'], 'merror'),
+    (namespaces['mathml'], 'mfrac'),
+    (namespaces['mathml'], 'mi'),
+    (namespaces['mathml'], 'mmultiscripts'),
+    (namespaces['mathml'], 'mn'),
+    (namespaces['mathml'], 'mo'),
+    (namespaces['mathml'], 'mover'),
+    (namespaces['mathml'], 'mpadded'),
+    (namespaces['mathml'], 'mphantom'),
+    (namespaces['mathml'], 'mprescripts'),
+    (namespaces['mathml'], 'mroot'),
+    (namespaces['mathml'], 'mrow'),
+    (namespaces['mathml'], 'mspace'),
+    (namespaces['mathml'], 'msqrt'),
+    (namespaces['mathml'], 'mstyle'),
+    (namespaces['mathml'], 'msub'),
+    (namespaces['mathml'], 'msubsup'),
+    (namespaces['mathml'], 'msup'),
+    (namespaces['mathml'], 'mtable'),
+    (namespaces['mathml'], 'mtd'),
+    (namespaces['mathml'], 'mtext'),
+    (namespaces['mathml'], 'mtr'),
+    (namespaces['mathml'], 'munder'),
+    (namespaces['mathml'], 'munderover'),
+    (namespaces['mathml'], 'none'),
+    (namespaces['svg'], 'a'),
+    (namespaces['svg'], 'animate'),
+    (namespaces['svg'], 'animateColor'),
+    (namespaces['svg'], 'animateMotion'),
+    (namespaces['svg'], 'animateTransform'),
+    (namespaces['svg'], 'clipPath'),
+    (namespaces['svg'], 'circle'),
+    (namespaces['svg'], 'defs'),
+    (namespaces['svg'], 'desc'),
+    (namespaces['svg'], 'ellipse'),
+    (namespaces['svg'], 'font-face'),
+    (namespaces['svg'], 'font-face-name'),
+    (namespaces['svg'], 'font-face-src'),
+    (namespaces['svg'], 'g'),
+    (namespaces['svg'], 'glyph'),
+    (namespaces['svg'], 'hkern'),
+    (namespaces['svg'], 'linearGradient'),
+    (namespaces['svg'], 'line'),
+    (namespaces['svg'], 'marker'),
+    (namespaces['svg'], 'metadata'),
+    (namespaces['svg'], 'missing-glyph'),
+    (namespaces['svg'], 'mpath'),
+    (namespaces['svg'], 'path'),
+    (namespaces['svg'], 'polygon'),
+    (namespaces['svg'], 'polyline'),
+    (namespaces['svg'], 'radialGradient'),
+    (namespaces['svg'], 'rect'),
+    (namespaces['svg'], 'set'),
+    (namespaces['svg'], 'stop'),
+    (namespaces['svg'], 'svg'),
+    (namespaces['svg'], 'switch'),
+    (namespaces['svg'], 'text'),
+    (namespaces['svg'], 'title'),
+    (namespaces['svg'], 'tspan'),
+    (namespaces['svg'], 'use'),
+))
+
+allowed_attributes = frozenset((
+    # HTML attributes
+    (None, 'abbr'),
+    (None, 'accept'),
+    (None, 'accept-charset'),
+    (None, 'accesskey'),
+    (None, 'action'),
+    (None, 'align'),
+    (None, 'alt'),
+    (None, 'autocomplete'),
+    (None, 'autofocus'),
+    (None, 'axis'),
+    (None, 'background'),
+    (None, 'balance'),
+    (None, 'bgcolor'),
+    (None, 'bgproperties'),
+    (None, 'border'),
+    (None, 'bordercolor'),
+    (None, 'bordercolordark'),
+    (None, 'bordercolorlight'),
+    (None, 'bottompadding'),
+    (None, 'cellpadding'),
+    (None, 'cellspacing'),
+    (None, 'ch'),
+    (None, 'challenge'),
+    (None, 'char'),
+    (None, 'charoff'),
+    (None, 'choff'),
+    (None, 'charset'),
+    (None, 'checked'),
+    (None, 'cite'),
+    (None, 'class'),
+    (None, 'clear'),
+    (None, 'color'),
+    (None, 'cols'),
+    (None, 'colspan'),
+    (None, 'compact'),
+    (None, 'contenteditable'),
+    (None, 'controls'),
+    (None, 'coords'),
+    (None, 'data'),
+    (None, 'datafld'),
+    (None, 'datapagesize'),
+    (None, 'datasrc'),
+    (None, 'datetime'),
+    (None, 'default'),
+    (None, 'delay'),
+    (None, 'dir'),
+    (None, 'disabled'),
+    (None, 'draggable'),
+    (None, 'dynsrc'),
+    (None, 'enctype'),
+    (None, 'end'),
+    (None, 'face'),
+    (None, 'for'),
+    (None, 'form'),
+    (None, 'frame'),
+    (None, 'galleryimg'),
+    (None, 'gutter'),
+    (None, 'headers'),
+    (None, 'height'),
+    (None, 'hidefocus'),
+    (None, 'hidden'),
+    (None, 'high'),
+    (None, 'href'),
+    (None, 'hreflang'),
+    (None, 'hspace'),
+    (None, 'icon'),
+    (None, 'id'),
+    (None, 'inputmode'),
+    (None, 'ismap'),
+    (None, 'keytype'),
+    (None, 'label'),
+    (None, 'leftspacing'),
+    (None, 'lang'),
+    (None, 'list'),
+    (None, 'longdesc'),
+    (None, 'loop'),
+    (None, 'loopcount'),
+    (None, 'loopend'),
+    (None, 'loopstart'),
+    (None, 'low'),
+    (None, 'lowsrc'),
+    (None, 'max'),
+    (None, 'maxlength'),
+    (None, 'media'),
+    (None, 'method'),
+    (None, 'min'),
+    (None, 'multiple'),
+    (None, 'name'),
+    (None, 'nohref'),
+    (None, 'noshade'),
+    (None, 'nowrap'),
+    (None, 'open'),
+    (None, 'optimum'),
+    (None, 'pattern'),
+    (None, 'ping'),
+    (None, 'point-size'),
+    (None, 'poster'),
+    (None, 'pqg'),
+    (None, 'preload'),
+    (None, 'prompt'),
+    (None, 'radiogroup'),
+    (None, 'readonly'),
+    (None, 'rel'),
+    (None, 'repeat-max'),
+    (None, 'repeat-min'),
+    (None, 'replace'),
+    (None, 'required'),
+    (None, 'rev'),
+    (None, 'rightspacing'),
+    (None, 'rows'),
+    (None, 'rowspan'),
+    (None, 'rules'),
+    (None, 'scope'),
+    (None, 'selected'),
+    (None, 'shape'),
+    (None, 'size'),
+    (None, 'span'),
+    (None, 'src'),
+    (None, 'start'),
+    (None, 'step'),
+    (None, 'style'),
+    (None, 'summary'),
+    (None, 'suppress'),
+    (None, 'tabindex'),
+    (None, 'target'),
+    (None, 'template'),
+    (None, 'title'),
+    (None, 'toppadding'),
+    (None, 'type'),
+    (None, 'unselectable'),
+    (None, 'usemap'),
+    (None, 'urn'),
+    (None, 'valign'),
+    (None, 'value'),
+    (None, 'variable'),
+    (None, 'volume'),
+    (None, 'vspace'),
+    (None, 'vrml'),
+    (None, 'width'),
+    (None, 'wrap'),
+    (namespaces['xml'], 'lang'),
+    # MathML attributes
+    (None, 'actiontype'),
+    (None, 'align'),
+    (None, 'columnalign'),
+    (None, 'columnalign'),
+    (None, 'columnalign'),
+    (None, 'columnlines'),
+    (None, 'columnspacing'),
+    (None, 'columnspan'),
+    (None, 'depth'),
+    (None, 'display'),
+    (None, 'displaystyle'),
+    (None, 'equalcolumns'),
+    (None, 'equalrows'),
+    (None, 'fence'),
+    (None, 'fontstyle'),
+    (None, 'fontweight'),
+    (None, 'frame'),
+    (None, 'height'),
+    (None, 'linethickness'),
+    (None, 'lspace'),
+    (None, 'mathbackground'),
+    (None, 'mathcolor'),
+    (None, 'mathvariant'),
+    (None, 'mathvariant'),
+    (None, 'maxsize'),
+    (None, 'minsize'),
+    (None, 'other'),
+    (None, 'rowalign'),
+    (None, 'rowalign'),
+    (None, 'rowalign'),
+    (None, 'rowlines'),
+    (None, 'rowspacing'),
+    (None, 'rowspan'),
+    (None, 'rspace'),
+    (None, 'scriptlevel'),
+    (None, 'selection'),
+    (None, 'separator'),
+    (None, 'stretchy'),
+    (None, 'width'),
+    (None, 'width'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xlink'], 'show'),
+    (namespaces['xlink'], 'type'),
+    # SVG attributes
+    (None, 'accent-height'),
+    (None, 'accumulate'),
+    (None, 'additive'),
+    (None, 'alphabetic'),
+    (None, 'arabic-form'),
+    (None, 'ascent'),
+    (None, 'attributeName'),
+    (None, 'attributeType'),
+    (None, 'baseProfile'),
+    (None, 'bbox'),
+    (None, 'begin'),
+    (None, 'by'),
+    (None, 'calcMode'),
+    (None, 'cap-height'),
+    (None, 'class'),
+    (None, 'clip-path'),
+    (None, 'color'),
+    (None, 'color-rendering'),
+    (None, 'content'),
+    (None, 'cx'),
+    (None, 'cy'),
+    (None, 'd'),
+    (None, 'dx'),
+    (None, 'dy'),
+    (None, 'descent'),
+    (None, 'display'),
+    (None, 'dur'),
+    (None, 'end'),
+    (None, 'fill'),
+    (None, 'fill-opacity'),
+    (None, 'fill-rule'),
+    (None, 'font-family'),
+    (None, 'font-size'),
+    (None, 'font-stretch'),
+    (None, 'font-style'),
+    (None, 'font-variant'),
+    (None, 'font-weight'),
+    (None, 'from'),
+    (None, 'fx'),
+    (None, 'fy'),
+    (None, 'g1'),
+    (None, 'g2'),
+    (None, 'glyph-name'),
+    (None, 'gradientUnits'),
+    (None, 'hanging'),
+    (None, 'height'),
+    (None, 'horiz-adv-x'),
+    (None, 'horiz-origin-x'),
+    (None, 'id'),
+    (None, 'ideographic'),
+    (None, 'k'),
+    (None, 'keyPoints'),
+    (None, 'keySplines'),
+    (None, 'keyTimes'),
+    (None, 'lang'),
+    (None, 'marker-end'),
+    (None, 'marker-mid'),
+    (None, 'marker-start'),
+    (None, 'markerHeight'),
+    (None, 'markerUnits'),
+    (None, 'markerWidth'),
+    (None, 'mathematical'),
+    (None, 'max'),
+    (None, 'min'),
+    (None, 'name'),
+    (None, 'offset'),
+    (None, 'opacity'),
+    (None, 'orient'),
+    (None, 'origin'),
+    (None, 'overline-position'),
+    (None, 'overline-thickness'),
+    (None, 'panose-1'),
+    (None, 'path'),
+    (None, 'pathLength'),
+    (None, 'points'),
+    (None, 'preserveAspectRatio'),
+    (None, 'r'),
+    (None, 'refX'),
+    (None, 'refY'),
+    (None, 'repeatCount'),
+    (None, 'repeatDur'),
+    (None, 'requiredExtensions'),
+    (None, 'requiredFeatures'),
+    (None, 'restart'),
+    (None, 'rotate'),
+    (None, 'rx'),
+    (None, 'ry'),
+    (None, 'slope'),
+    (None, 'stemh'),
+    (None, 'stemv'),
+    (None, 'stop-color'),
+    (None, 'stop-opacity'),
+    (None, 'strikethrough-position'),
+    (None, 'strikethrough-thickness'),
+    (None, 'stroke'),
+    (None, 'stroke-dasharray'),
+    (None, 'stroke-dashoffset'),
+    (None, 'stroke-linecap'),
+    (None, 'stroke-linejoin'),
+    (None, 'stroke-miterlimit'),
+    (None, 'stroke-opacity'),
+    (None, 'stroke-width'),
+    (None, 'systemLanguage'),
+    (None, 'target'),
+    (None, 'text-anchor'),
+    (None, 'to'),
+    (None, 'transform'),
+    (None, 'type'),
+    (None, 'u1'),
+    (None, 'u2'),
+    (None, 'underline-position'),
+    (None, 'underline-thickness'),
+    (None, 'unicode'),
+    (None, 'unicode-range'),
+    (None, 'units-per-em'),
+    (None, 'values'),
+    (None, 'version'),
+    (None, 'viewBox'),
+    (None, 'visibility'),
+    (None, 'width'),
+    (None, 'widths'),
+    (None, 'x'),
+    (None, 'x-height'),
+    (None, 'x1'),
+    (None, 'x2'),
+    (namespaces['xlink'], 'actuate'),
+    (namespaces['xlink'], 'arcrole'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xlink'], 'role'),
+    (namespaces['xlink'], 'show'),
+    (namespaces['xlink'], 'title'),
+    (namespaces['xlink'], 'type'),
+    (namespaces['xml'], 'base'),
+    (namespaces['xml'], 'lang'),
+    (namespaces['xml'], 'space'),
+    (None, 'y'),
+    (None, 'y1'),
+    (None, 'y2'),
+    (None, 'zoomAndPan'),
+))
+
+attr_val_is_uri = frozenset((
+    (None, 'href'),
+    (None, 'src'),
+    (None, 'cite'),
+    (None, 'action'),
+    (None, 'longdesc'),
+    (None, 'poster'),
+    (None, 'background'),
+    (None, 'datasrc'),
+    (None, 'dynsrc'),
+    (None, 'lowsrc'),
+    (None, 'ping'),
+    (namespaces['xlink'], 'href'),
+    (namespaces['xml'], 'base'),
+))
+
+svg_attr_val_allows_ref = frozenset((
+    (None, 'clip-path'),
+    (None, 'color-profile'),
+    (None, 'cursor'),
+    (None, 'fill'),
+    (None, 'filter'),
+    (None, 'marker'),
+    (None, 'marker-start'),
+    (None, 'marker-mid'),
+    (None, 'marker-end'),
+    (None, 'mask'),
+    (None, 'stroke'),
+))
+
+svg_allow_local_href = frozenset((
+    (None, 'altGlyph'),
+    (None, 'animate'),
+    (None, 'animateColor'),
+    (None, 'animateMotion'),
+    (None, 'animateTransform'),
+    (None, 'cursor'),
+    (None, 'feImage'),
+    (None, 'filter'),
+    (None, 'linearGradient'),
+    (None, 'pattern'),
+    (None, 'radialGradient'),
+    (None, 'textpath'),
+    (None, 'tref'),
+    (None, 'set'),
+    (None, 'use')
+))
+
+allowed_css_properties = frozenset((
+    'azimuth',
+    'background-color',
+    'border-bottom-color',
+    'border-collapse',
+    'border-color',
+    'border-left-color',
+    'border-right-color',
+    'border-top-color',
+    'clear',
+    'color',
+    'cursor',
+    'direction',
+    'display',
+    'elevation',
+    'float',
+    'font',
+    'font-family',
+    'font-size',
+    'font-style',
+    'font-variant',
+    'font-weight',
+    'height',
+    'letter-spacing',
+    'line-height',
+    'overflow',
+    'pause',
+    'pause-after',
+    'pause-before',
+    'pitch',
+    'pitch-range',
+    'richness',
+    'speak',
+    'speak-header',
+    'speak-numeral',
+    'speak-punctuation',
+    'speech-rate',
+    'stress',
+    'text-align',
+    'text-decoration',
+    'text-indent',
+    'unicode-bidi',
+    'vertical-align',
+    'voice-family',
+    'volume',
+    'white-space',
+    'width',
+))
+
+allowed_css_keywords = frozenset((
+    'auto',
+    'aqua',
+    'black',
+    'block',
+    'blue',
+    'bold',
+    'both',
+    'bottom',
+    'brown',
+    'center',
+    'collapse',
+    'dashed',
+    'dotted',
+    'fuchsia',
+    'gray',
+    'green',
+    '!important',
+    'italic',
+    'left',
+    'lime',
+    'maroon',
+    'medium',
+    'none',
+    'navy',
+    'normal',
+    'nowrap',
+    'olive',
+    'pointer',
+    'purple',
+    'red',
+    'right',
+    'solid',
+    'silver',
+    'teal',
+    'top',
+    'transparent',
+    'underline',
+    'white',
+    'yellow',
+))
+
+allowed_svg_properties = frozenset((
+    'fill',
+    'fill-opacity',
+    'fill-rule',
+    'stroke',
+    'stroke-width',
+    'stroke-linecap',
+    'stroke-linejoin',
+    'stroke-opacity',
+))
+
+allowed_protocols = frozenset((
+    'ed2k',
+    'ftp',
+    'http',
+    'https',
+    'irc',
+    'mailto',
+    'news',
+    'gopher',
+    'nntp',
+    'telnet',
+    'webcal',
+    'xmpp',
+    'callto',
+    'feed',
+    'urn',
+    'aim',
+    'rsync',
+    'tag',
+    'ssh',
+    'sftp',
+    'rtsp',
+    'afs',
+    'data',
+))
+
+allowed_content_types = frozenset((
+    'image/png',
+    'image/jpeg',
+    'image/gif',
+    'image/webp',
+    'image/bmp',
+    'text/plain',
+))
+
+
+data_content_type = re.compile(r'''
+                                ^
+                                # Match a content type <application>/<type>
+                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                                # Match any character set and encoding
+                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                                # Assume the rest is data
+                                ,.*
+                                $
+                                ''',
+                               re.VERBOSE)
+
+
+class Filter(base.Filter):
+    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
+    def __init__(self,
+                 source,
+                 allowed_elements=allowed_elements,
+                 allowed_attributes=allowed_attributes,
+                 allowed_css_properties=allowed_css_properties,
+                 allowed_css_keywords=allowed_css_keywords,
+                 allowed_svg_properties=allowed_svg_properties,
+                 allowed_protocols=allowed_protocols,
+                 allowed_content_types=allowed_content_types,
+                 attr_val_is_uri=attr_val_is_uri,
+                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
+                 svg_allow_local_href=svg_allow_local_href):
+        super(Filter, self).__init__(source)
+        self.allowed_elements = allowed_elements
+        self.allowed_attributes = allowed_attributes
+        self.allowed_css_properties = allowed_css_properties
+        self.allowed_css_keywords = allowed_css_keywords
+        self.allowed_svg_properties = allowed_svg_properties
+        self.allowed_protocols = allowed_protocols
+        self.allowed_content_types = allowed_content_types
+        self.attr_val_is_uri = attr_val_is_uri
+        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+        self.svg_allow_local_href = svg_allow_local_href
+
+    def __iter__(self):
+        for token in base.Filter.__iter__(self):
+            token = self.sanitize_token(token)
+            if token:
+                yield token
+
+    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
+    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
+    # attributes are parsed, and a restricted set, # specified by
+    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
+    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
+    # in ALLOWED_PROTOCOLS are allowed.
+    #
+    #   sanitize_html('<script> do_nasty_stuff() </script>')
+    #    => &lt;script> do_nasty_stuff() &lt;/script>
+    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
+    #    => <a>Click here for $100</a>
+    def sanitize_token(self, token):
+
+        # accommodate filters which use token_type differently
+        token_type = token["type"]
+        if token_type in ("StartTag", "EndTag", "EmptyTag"):
+            name = token["name"]
+            namespace = token["namespace"]
+            if ((namespace, name) in self.allowed_elements or
+                (namespace is None and
+                 (namespaces["html"], name) in self.allowed_elements)):
+                return self.allowed_token(token)
+            else:
+                return self.disallowed_token(token)
+        elif token_type == "Comment":
+            pass
+        else:
+            return token
+
+    def allowed_token(self, token):
+        if "data" in token:
+            attrs = token["data"]
+            attr_names = set(attrs.keys())
+
+            # Remove forbidden attributes
+            for to_remove in (attr_names - self.allowed_attributes):
+                del token["data"][to_remove]
+                attr_names.remove(to_remove)
+
+            # Remove attributes with disallowed URL values
+            for attr in (attr_names & self.attr_val_is_uri):
+                assert attr in attrs
+                # I don't have a clue where this regexp comes from or why it matches those
+                # characters, nor why we call unescape. I just know it's always been here.
+                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
+                # this will do is remove *more* than it otherwise would.
+                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                # remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                try:
+                    uri = urlparse.urlparse(val_unescaped)
+                except ValueError:
+                    uri = None
+                    del attrs[attr]
+                if uri and uri.scheme:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    if uri.scheme == 'data':
+                        m = data_content_type.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        elif m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            if (token["name"] in self.svg_allow_local_href and
+                (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
+                                                                     attrs[(namespaces['xlink'], 'href')])):
+                del attrs[(namespaces['xlink'], 'href')]
+            if (None, 'style') in attrs:
+                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
+            token["data"] = attrs
+        return token
+
+    def disallowed_token(self, token):
+        token_type = token["type"]
+        if token_type == "EndTag":
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            assert token_type in ("StartTag", "EmptyTag")
+            attrs = []
+            for (ns, name), v in token["data"].items():
+                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
+            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            token["data"] = token["data"][:-1] + "/>"
+
+        token["type"] = "Characters"
+
+        del token["name"]
+        return token
+
+    def sanitize_css(self, style):
+        # disallow urls
+        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+        # gauntlet
+        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+            return ''
+        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+            return ''
+
+        clean = []
+        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
+            if not value:
+                continue
+            if prop.lower() in self.allowed_css_properties:
+                clean.append(prop + ': ' + value + ';')
+            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
+                                                'padding']:
+                for keyword in value.split():
+                    if keyword not in self.allowed_css_keywords and \
+                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
+                        break
+                else:
+                    clean.append(prop + ': ' + value + ';')
+            elif prop.lower() in self.allowed_svg_properties:
+                clean.append(prop + ': ' + value + ';')
+
+        return ' '.join(clean)
diff --git a/html_cleaner/html5lib/filters/whitespace.py b/html_cleaner/html5lib/filters/whitespace.py
new file mode 100644
index 0000000..8921052
--- /dev/null
+++ b/html_cleaner/html5lib/filters/whitespace.py
@@ -0,0 +1,38 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+
+from . import base
+from ..constants import rcdataElements, spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
+
+
+class Filter(base.Filter):
+
+    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
+
+    def __iter__(self):
+        preserve = 0
+        for token in base.Filter.__iter__(self):
+            type = token["type"]
+            if type == "StartTag" \
+                    and (preserve or token["name"] in self.spacePreserveElements):
+                preserve += 1
+
+            elif type == "EndTag" and preserve:
+                preserve -= 1
+
+            elif not preserve and type == "SpaceCharacters" and token["data"]:
+                # Test on token["data"] above to not introduce spaces where there were not
+                token["data"] = " "
+
+            elif not preserve and type == "Characters":
+                token["data"] = collapse_spaces(token["data"])
+
+            yield token
+
+
+def collapse_spaces(text):
+    return SPACES_REGEX.sub(' ', text)
diff --git a/html_cleaner/html5lib/html5parser.py b/html_cleaner/html5lib/html5parser.py
new file mode 100644
index 0000000..2abd63e
--- /dev/null
+++ b/html_cleaner/html5lib/html5parser.py
@@ -0,0 +1,2733 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import with_metaclass, viewkeys, PY3
+
+import types
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
+from . import _inputstream
+from . import _tokenizer
+
+from . import treebuilders
+from .treebuilders.base import Marker
+
+from . import _utils
+from .constants import (
+    spaceCharacters, asciiUpper2Lower,
+    specialElements, headingElements, cdataElements, rcdataElements,
+    tokenTypes, tagTokenTypes,
+    namespaces,
+    htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
+    adjustForeignAttributes as adjustForeignAttributesMap,
+    adjustMathMLAttributes, adjustSVGAttributes,
+    E,
+    ReparseException
+)
+
+
+def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
+    """Parse a string or file-like object into a tree"""
+    tb = treebuilders.getTreeBuilder(treebuilder)
+    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+    return p.parse(doc, **kwargs)
+
+
+def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
+    tb = treebuilders.getTreeBuilder(treebuilder)
+    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+    return p.parseFragment(doc, container=container, **kwargs)
+
+
+def method_decorator_metaclass(function):
+    class Decorated(type):
+        def __new__(meta, classname, bases, classDict):
+            for attributeName, attribute in classDict.items():
+                if isinstance(attribute, types.FunctionType):
+                    attribute = function(attribute)
+
+                classDict[attributeName] = attribute
+            return type.__new__(meta, classname, bases, classDict)
+    return Decorated
+
+
+class HTMLParser(object):
+    """HTML parser. Generates a tree structure from a stream of (possibly
+        malformed) HTML"""
+
+    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
+        """
+        strict - raise an exception when a parse error is encountered
+
+        tree - a treebuilder class controlling the type of tree that will be
+        returned. Built in treebuilders can be accessed through
+        html5lib.treebuilders.getTreeBuilder(treeType)
+        """
+
+        # Raise an exception on the first error encountered
+        self.strict = strict
+
+        if tree is None:
+            tree = treebuilders.getTreeBuilder("etree")
+        self.tree = tree(namespaceHTMLElements)
+        self.errors = []
+
+        self.phases = dict([(name, cls(self, self.tree)) for name, cls in
+                            getPhases(debug).items()])
+
+    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
+
+        self.innerHTMLMode = innerHTML
+        self.container = container
+        self.scripting = scripting
+        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
+        self.reset()
+
+        try:
+            self.mainLoop()
+        except ReparseException:
+            self.reset()
+            self.mainLoop()
+
+    def reset(self):
+        self.tree.reset()
+        self.firstStartTag = False
+        self.errors = []
+        self.log = []  # only used with debug mode
+        # "quirks" / "limited quirks" / "no quirks"
+        self.compatMode = "no quirks"
+
+        if self.innerHTMLMode:
+            self.innerHTML = self.container.lower()
+
+            if self.innerHTML in cdataElements:
+                self.tokenizer.state = self.tokenizer.rcdataState
+            elif self.innerHTML in rcdataElements:
+                self.tokenizer.state = self.tokenizer.rawtextState
+            elif self.innerHTML == 'plaintext':
+                self.tokenizer.state = self.tokenizer.plaintextState
+            else:
+                # state already is data state
+                # self.tokenizer.state = self.tokenizer.dataState
+                pass
+            self.phase = self.phases["beforeHtml"]
+            self.phase.insertHtmlElement()
+            self.resetInsertionMode()
+        else:
+            self.innerHTML = False  # pylint:disable=redefined-variable-type
+            self.phase = self.phases["initial"]
+
+        self.lastPhase = None
+
+        self.beforeRCDataPhase = None
+
+        self.framesetOK = True
+
+    @property
+    def documentEncoding(self):
+        """The name of the character encoding
+        that was used to decode the input stream,
+        or :obj:`None` if that is not determined yet.
+
+        """
+        if not hasattr(self, 'tokenizer'):
+            return None
+        return self.tokenizer.stream.charEncoding[0].name
+
+    def isHTMLIntegrationPoint(self, element):
+        if (element.name == "annotation-xml" and
+                element.namespace == namespaces["mathml"]):
+            return ("encoding" in element.attributes and
+                    element.attributes["encoding"].translate(
+                        asciiUpper2Lower) in
+                    ("text/html", "application/xhtml+xml"))
+        else:
+            return (element.namespace, element.name) in htmlIntegrationPointElements
+
+    def isMathMLTextIntegrationPoint(self, element):
+        return (element.namespace, element.name) in mathmlTextIntegrationPointElements
+
+    def mainLoop(self):
+        CharactersToken = tokenTypes["Characters"]
+        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
+        StartTagToken = tokenTypes["StartTag"]
+        EndTagToken = tokenTypes["EndTag"]
+        CommentToken = tokenTypes["Comment"]
+        DoctypeToken = tokenTypes["Doctype"]
+        ParseErrorToken = tokenTypes["ParseError"]
+
+        for token in self.normalizedTokens():
+            prev_token = None
+            new_token = token
+            while new_token is not None:
+                prev_token = new_token
+                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
+                currentNodeNamespace = currentNode.namespace if currentNode else None
+                currentNodeName = currentNode.name if currentNode else None
+
+                type = new_token["type"]
+
+                if type == ParseErrorToken:
+                    self.parseError(new_token["data"], new_token.get("datavars", {}))
+                    new_token = None
+                else:
+                    if (len(self.tree.openElements) == 0 or
+                        currentNodeNamespace == self.tree.defaultNamespace or
+                        (self.isMathMLTextIntegrationPoint(currentNode) and
+                         ((type == StartTagToken and
+                           token["name"] not in frozenset(["mglyph", "malignmark"])) or
+                          type in (CharactersToken, SpaceCharactersToken))) or
+                        (currentNodeNamespace == namespaces["mathml"] and
+                         currentNodeName == "annotation-xml" and
+                         type == StartTagToken and
+                         token["name"] == "svg") or
+                        (self.isHTMLIntegrationPoint(currentNode) and
+                         type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
+                        phase = self.phase
+                    else:
+                        phase = self.phases["inForeignContent"]
+
+                    if type == CharactersToken:
+                        new_token = phase.processCharacters(new_token)
+                    elif type == SpaceCharactersToken:
+                        new_token = phase.processSpaceCharacters(new_token)
+                    elif type == StartTagToken:
+                        new_token = phase.processStartTag(new_token)
+                    elif type == EndTagToken:
+                        new_token = phase.processEndTag(new_token)
+                    elif type == CommentToken:
+                        new_token = phase.processComment(new_token)
+                    elif type == DoctypeToken:
+                        new_token = phase.processDoctype(new_token)
+
+            if (type == StartTagToken and prev_token["selfClosing"] and
+                    not prev_token["selfClosingAcknowledged"]):
+                self.parseError("non-void-element-with-trailing-solidus",
+                                {"name": prev_token["name"]})
+
+        # When the loop finishes it's EOF
+        reprocess = True
+        phases = []
+        while reprocess:
+            phases.append(self.phase)
+            reprocess = self.phase.processEOF()
+            if reprocess:
+                assert self.phase not in phases
+
+    def normalizedTokens(self):
+        for token in self.tokenizer:
+            yield self.normalizeToken(token)
+
+    def parse(self, stream, *args, **kwargs):
+        """Parse a HTML document into a well-formed tree
+
+        stream - a filelike object or string containing the HTML to be parsed
+
+        The optional encoding parameter must be a string that indicates
+        the encoding.  If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+
+        scripting - treat noscript elements as if javascript was turned on
+        """
+        self._parse(stream, False, None, *args, **kwargs)
+        return self.tree.getDocument()
+
+    def parseFragment(self, stream, *args, **kwargs):
+        """Parse a HTML fragment into a well-formed tree fragment
+
+        container - name of the element we're setting the innerHTML property
+        if set to None, default to 'div'
+
+        stream - a filelike object or string containing the HTML to be parsed
+
+        The optional encoding parameter must be a string that indicates
+        the encoding.  If specified, that encoding will be used,
+        regardless of any BOM or later declaration (such as in a meta
+        element)
+
+        scripting - treat noscript elements as if javascript was turned on
+        """
+        self._parse(stream, True, *args, **kwargs)
+        return self.tree.getFragment()
+
+    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
+        # XXX The idea is to make errorcode mandatory.
+        if datavars is None:
+            datavars = {}
+        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
+        if self.strict:
+            raise ParseError(E[errorcode] % datavars)
+
+    def normalizeToken(self, token):
+        """ HTML5 specific normalizations to the token stream """
+
+        if token["type"] == tokenTypes["StartTag"]:
+            raw = token["data"]
+            token["data"] = OrderedDict(raw)
+            if len(raw) > len(token["data"]):
+                # we had some duplicated attribute, fix so first wins
+                token["data"].update(raw[::-1])
+
+        return token
+
+    def adjustMathMLAttributes(self, token):
+        adjust_attributes(token, adjustMathMLAttributes)
+
+    def adjustSVGAttributes(self, token):
+        adjust_attributes(token, adjustSVGAttributes)
+
+    def adjustForeignAttributes(self, token):
+        adjust_attributes(token, adjustForeignAttributesMap)
+
+    def reparseTokenNormal(self, token):
+        # pylint:disable=unused-argument
+        self.parser.phase()
+
+    def resetInsertionMode(self):
+        # The name of this method is mostly historical. (It's also used in the
+        # specification.)
+        last = False
+        newModes = {
+            "select": "inSelect",
+            "td": "inCell",
+            "th": "inCell",
+            "tr": "inRow",
+            "tbody": "inTableBody",
+            "thead": "inTableBody",
+            "tfoot": "inTableBody",
+            "caption": "inCaption",
+            "colgroup": "inColumnGroup",
+            "table": "inTable",
+            "head": "inBody",
+            "body": "inBody",
+            "frameset": "inFrameset",
+            "html": "beforeHead"
+        }
+        for node in self.tree.openElements[::-1]:
+            nodeName = node.name
+            new_phase = None
+            if node == self.tree.openElements[0]:
+                assert self.innerHTML
+                last = True
+                nodeName = self.innerHTML
+            # Check for conditions that should only happen in the innerHTML
+            # case
+            if nodeName in ("select", "colgroup", "head", "html"):
+                assert self.innerHTML
+
+            if not last and node.namespace != self.tree.defaultNamespace:
+                continue
+
+            if nodeName in newModes:
+                new_phase = self.phases[newModes[nodeName]]
+                break
+            elif last:
+                new_phase = self.phases["inBody"]
+                break
+
+        self.phase = new_phase
+
+    def parseRCDataRawtext(self, token, contentType):
+        """Generic RCDATA/RAWTEXT Parsing algorithm
+        contentType - RCDATA or RAWTEXT
+        """
+        assert contentType in ("RAWTEXT", "RCDATA")
+
+        self.tree.insertElement(token)
+
+        if contentType == "RAWTEXT":
+            self.tokenizer.state = self.tokenizer.rawtextState
+        else:
+            self.tokenizer.state = self.tokenizer.rcdataState
+
+        self.originalPhase = self.phase
+
+        self.phase = self.phases["text"]
+
+
+@_utils.memoize
+def getPhases(debug):
+    def log(function):
+        """Logger that records which phase processes each token"""
+        type_names = dict((value, key) for key, value in
+                          tokenTypes.items())
+
+        def wrapped(self, *args, **kwargs):
+            if function.__name__.startswith("process") and len(args) > 0:
+                token = args[0]
+                try:
+                    info = {"type": type_names[token['type']]}
+                except:
+                    raise
+                if token['type'] in tagTokenTypes:
+                    info["name"] = token['name']
+
+                self.parser.log.append((self.parser.tokenizer.state.__name__,
+                                        self.parser.phase.__class__.__name__,
+                                        self.__class__.__name__,
+                                        function.__name__,
+                                        info))
+                return function(self, *args, **kwargs)
+            else:
+                return function(self, *args, **kwargs)
+        return wrapped
+
+    def getMetaclass(use_metaclass, metaclass_func):
+        if use_metaclass:
+            return method_decorator_metaclass(metaclass_func)
+        else:
+            return type
+
+    # pylint:disable=unused-argument
+    class Phase(with_metaclass(getMetaclass(debug, log))):
+        """Base class for helper object that implements each phase of processing
+        """
+
+        def __init__(self, parser, tree):
+            self.parser = parser
+            self.tree = tree
+
+        def processEOF(self):
+            raise NotImplementedError
+
+        def processComment(self, token):
+            # For most phases the following is correct. Where it's not it will be
+            # overridden.
+            self.tree.insertComment(token, self.tree.openElements[-1])
+
+        def processDoctype(self, token):
+            self.parser.parseError("unexpected-doctype")
+
+        def processCharacters(self, token):
+            self.tree.insertText(token["data"])
+
+        def processSpaceCharacters(self, token):
+            self.tree.insertText(token["data"])
+
+        def processStartTag(self, token):
+            return self.startTagHandler[token["name"]](token)
+
+        def startTagHtml(self, token):
+            if not self.parser.firstStartTag and token["name"] == "html":
+                self.parser.parseError("non-html-root")
+            # XXX Need a check here to see if the first start tag token emitted is
+            # this token... If it's not, invoke self.parser.parseError().
+            for attr, value in token["data"].items():
+                if attr not in self.tree.openElements[0].attributes:
+                    self.tree.openElements[0].attributes[attr] = value
+            self.parser.firstStartTag = False
+
+        def processEndTag(self, token):
+            return self.endTagHandler[token["name"]](token)
+
+    class InitialPhase(Phase):
+        def processSpaceCharacters(self, token):
+            pass
+
+        def processComment(self, token):
+            self.tree.insertComment(token, self.tree.document)
+
+        def processDoctype(self, token):
+            name = token["name"]
+            publicId = token["publicId"]
+            systemId = token["systemId"]
+            correct = token["correct"]
+
+            if (name != "html" or publicId is not None or
+                    systemId is not None and systemId != "about:legacy-compat"):
+                self.parser.parseError("unknown-doctype")
+
+            if publicId is None:
+                publicId = ""
+
+            self.tree.insertDoctype(token)
+
+            if publicId != "":
+                publicId = publicId.translate(asciiUpper2Lower)
+
+            if (not correct or token["name"] != "html" or
+                    publicId.startswith(
+                        ("+//silmaril//dtd html pro v0r11 19970101//",
+                         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+                         "-//as//dtd html 3.0 aswedit + extensions//",
+                         "-//ietf//dtd html 2.0 level 1//",
+                         "-//ietf//dtd html 2.0 level 2//",
+                         "-//ietf//dtd html 2.0 strict level 1//",
+                         "-//ietf//dtd html 2.0 strict level 2//",
+                         "-//ietf//dtd html 2.0 strict//",
+                         "-//ietf//dtd html 2.0//",
+                         "-//ietf//dtd html 2.1e//",
+                         "-//ietf//dtd html 3.0//",
+                         "-//ietf//dtd html 3.2 final//",
+                         "-//ietf//dtd html 3.2//",
+                         "-//ietf//dtd html 3//",
+                         "-//ietf//dtd html level 0//",
+                         "-//ietf//dtd html level 1//",
+                         "-//ietf//dtd html level 2//",
+                         "-//ietf//dtd html level 3//",
+                         "-//ietf//dtd html strict level 0//",
+                         "-//ietf//dtd html strict level 1//",
+                         "-//ietf//dtd html strict level 2//",
+                         "-//ietf//dtd html strict level 3//",
+                         "-//ietf//dtd html strict//",
+                         "-//ietf//dtd html//",
+                         "-//metrius//dtd metrius presentational//",
+                         "-//microsoft//dtd internet explorer 2.0 html strict//",
+                         "-//microsoft//dtd internet explorer 2.0 html//",
+                         "-//microsoft//dtd internet explorer 2.0 tables//",
+                         "-//microsoft//dtd internet explorer 3.0 html strict//",
+                         "-//microsoft//dtd internet explorer 3.0 html//",
+                         "-//microsoft//dtd internet explorer 3.0 tables//",
+                         "-//netscape comm. corp.//dtd html//",
+                         "-//netscape comm. corp.//dtd strict html//",
+                         "-//o'reilly and associates//dtd html 2.0//",
+                         "-//o'reilly and associates//dtd html extended 1.0//",
+                         "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+                         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+                         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+                         "-//spyglass//dtd html 2.0 extended//",
+                         "-//sq//dtd html 2.0 hotmetal + extensions//",
+                         "-//sun microsystems corp.//dtd hotjava html//",
+                         "-//sun microsystems corp.//dtd hotjava strict html//",
+                         "-//w3c//dtd html 3 1995-03-24//",
+                         "-//w3c//dtd html 3.2 draft//",
+                         "-//w3c//dtd html 3.2 final//",
+                         "-//w3c//dtd html 3.2//",
+                         "-//w3c//dtd html 3.2s draft//",
+                         "-//w3c//dtd html 4.0 frameset//",
+                         "-//w3c//dtd html 4.0 transitional//",
+                         "-//w3c//dtd html experimental 19960712//",
+                         "-//w3c//dtd html experimental 970421//",
+                         "-//w3c//dtd w3 html//",
+                         "-//w3o//dtd w3 html 3.0//",
+                         "-//webtechs//dtd mozilla html 2.0//",
+                         "-//webtechs//dtd mozilla html//")) or
+                    publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
+                                 "-/w3c/dtd html 4.0 transitional/en",
+                                 "html") or
+                    publicId.startswith(
+                        ("-//w3c//dtd html 4.01 frameset//",
+                         "-//w3c//dtd html 4.01 transitional//")) and
+                    systemId is None or
+                    systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+                self.parser.compatMode = "quirks"
+            elif (publicId.startswith(
+                    ("-//w3c//dtd xhtml 1.0 frameset//",
+                     "-//w3c//dtd xhtml 1.0 transitional//")) or
+                  publicId.startswith(
+                      ("-//w3c//dtd html 4.01 frameset//",
+                       "-//w3c//dtd html 4.01 transitional//")) and
+                  systemId is not None):
+                self.parser.compatMode = "limited quirks"
+
+            self.parser.phase = self.parser.phases["beforeHtml"]
+
+        def anythingElse(self):
+            self.parser.compatMode = "quirks"
+            self.parser.phase = self.parser.phases["beforeHtml"]
+
+        def processCharacters(self, token):
+            self.parser.parseError("expected-doctype-but-got-chars")
+            self.anythingElse()
+            return token
+
+        def processStartTag(self, token):
+            self.parser.parseError("expected-doctype-but-got-start-tag",
+                                   {"name": token["name"]})
+            self.anythingElse()
+            return token
+
+        def processEndTag(self, token):
+            self.parser.parseError("expected-doctype-but-got-end-tag",
+                                   {"name": token["name"]})
+            self.anythingElse()
+            return token
+
+        def processEOF(self):
+            self.parser.parseError("expected-doctype-but-got-eof")
+            self.anythingElse()
+            return True
+
+    class BeforeHtmlPhase(Phase):
+        # helper methods
+        def insertHtmlElement(self):
+            self.tree.insertRoot(impliedTagToken("html", "StartTag"))
+            self.parser.phase = self.parser.phases["beforeHead"]
+
+        # other
+        def processEOF(self):
+            self.insertHtmlElement()
+            return True
+
+        def processComment(self, token):
+            self.tree.insertComment(token, self.tree.document)
+
+        def processSpaceCharacters(self, token):
+            pass
+
+        def processCharacters(self, token):
+            self.insertHtmlElement()
+            return token
+
+        def processStartTag(self, token):
+            if token["name"] == "html":
+                self.parser.firstStartTag = True
+            self.insertHtmlElement()
+            return token
+
+        def processEndTag(self, token):
+            if token["name"] not in ("head", "body", "html", "br"):
+                self.parser.parseError("unexpected-end-tag-before-html",
+                                       {"name": token["name"]})
+            else:
+                self.insertHtmlElement()
+                return token
+
+    class BeforeHeadPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("head", self.startTagHead)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                (("head", "body", "html", "br"), self.endTagImplyHead)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            self.startTagHead(impliedTagToken("head", "StartTag"))
+            return True
+
+        def processSpaceCharacters(self, token):
+            pass
+
+        def processCharacters(self, token):
+            self.startTagHead(impliedTagToken("head", "StartTag"))
+            return token
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagHead(self, token):
+            self.tree.insertElement(token)
+            self.tree.headPointer = self.tree.openElements[-1]
+            self.parser.phase = self.parser.phases["inHead"]
+
+        def startTagOther(self, token):
+            self.startTagHead(impliedTagToken("head", "StartTag"))
+            return token
+
+        def endTagImplyHead(self, token):
+            self.startTagHead(impliedTagToken("head", "StartTag"))
+            return token
+
+        def endTagOther(self, token):
+            self.parser.parseError("end-tag-after-implied-root",
+                                   {"name": token["name"]})
+
+    class InHeadPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("title", self.startTagTitle),
+                (("noframes", "style"), self.startTagNoFramesStyle),
+                ("noscript", self.startTagNoscript),
+                ("script", self.startTagScript),
+                (("base", "basefont", "bgsound", "command", "link"),
+                 self.startTagBaseLinkCommand),
+                ("meta", self.startTagMeta),
+                ("head", self.startTagHead)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("head", self.endTagHead),
+                (("br", "html", "body"), self.endTagHtmlBodyBr)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        # the real thing
+        def processEOF(self):
+            self.anythingElse()
+            return True
+
+        def processCharacters(self, token):
+            self.anythingElse()
+            return token
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagHead(self, token):
+            self.parser.parseError("two-heads-are-not-better-than-one")
+
+        def startTagBaseLinkCommand(self, token):
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+
+        def startTagMeta(self, token):
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+
+            attributes = token["data"]
+            if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
+                if "charset" in attributes:
+                    self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
+                elif ("content" in attributes and
+                      "http-equiv" in attributes and
+                      attributes["http-equiv"].lower() == "content-type"):
+                    # Encoding it as UTF-8 here is a hack, as really we should pass
+                    # the abstract Unicode string, and just use the
+                    # ContentAttrParser on that, but using UTF-8 allows all chars
+                    # to be encoded and as a ASCII-superset works.
+                    data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
+                    parser = _inputstream.ContentAttrParser(data)
+                    codec = parser.parse()
+                    self.parser.tokenizer.stream.changeEncoding(codec)
+
+        def startTagTitle(self, token):
+            self.parser.parseRCDataRawtext(token, "RCDATA")
+
+        def startTagNoFramesStyle(self, token):
+            # Need to decide whether to implement the scripting-disabled case
+            self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+        def startTagNoscript(self, token):
+            if self.parser.scripting:
+                self.parser.parseRCDataRawtext(token, "RAWTEXT")
+            else:
+                self.tree.insertElement(token)
+                self.parser.phase = self.parser.phases["inHeadNoscript"]
+
+        def startTagScript(self, token):
+            self.tree.insertElement(token)
+            self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
+            self.parser.originalPhase = self.parser.phase
+            self.parser.phase = self.parser.phases["text"]
+
+        def startTagOther(self, token):
+            self.anythingElse()
+            return token
+
+        def endTagHead(self, token):
+            node = self.parser.tree.openElements.pop()
+            assert node.name == "head", "Expected head got %s" % node.name
+            self.parser.phase = self.parser.phases["afterHead"]
+
+        def endTagHtmlBodyBr(self, token):
+            self.anythingElse()
+            return token
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def anythingElse(self):
+            self.endTagHead(impliedTagToken("head"))
+
+    class InHeadNoscriptPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
+                (("head", "noscript"), self.startTagHeadNoscript),
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("noscript", self.endTagNoscript),
+                ("br", self.endTagBr),
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            self.parser.parseError("eof-in-head-noscript")
+            self.anythingElse()
+            return True
+
+        def processComment(self, token):
+            return self.parser.phases["inHead"].processComment(token)
+
+        def processCharacters(self, token):
+            self.parser.parseError("char-in-head-noscript")
+            self.anythingElse()
+            return token
+
+        def processSpaceCharacters(self, token):
+            return self.parser.phases["inHead"].processSpaceCharacters(token)
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagBaseLinkCommand(self, token):
+            return self.parser.phases["inHead"].processStartTag(token)
+
+        def startTagHeadNoscript(self, token):
+            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+
+        def startTagOther(self, token):
+            self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
+            self.anythingElse()
+            return token
+
+        def endTagNoscript(self, token):
+            node = self.parser.tree.openElements.pop()
+            assert node.name == "noscript", "Expected noscript got %s" % node.name
+            self.parser.phase = self.parser.phases["inHead"]
+
+        def endTagBr(self, token):
+            self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
+            self.anythingElse()
+            return token
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def anythingElse(self):
+            # Caller must raise parse error first!
+            self.endTagNoscript(impliedTagToken("noscript"))
+
+    class AfterHeadPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("body", self.startTagBody),
+                ("frameset", self.startTagFrameset),
+                (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
+                  "style", "title"),
+                 self.startTagFromHead),
+                ("head", self.startTagHead)
+            ])
+            self.startTagHandler.default = self.startTagOther
+            self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
+                                                           self.endTagHtmlBodyBr)])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            self.anythingElse()
+            return True
+
+        def processCharacters(self, token):
+            self.anythingElse()
+            return token
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagBody(self, token):
+            self.parser.framesetOK = False
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inBody"]
+
+        def startTagFrameset(self, token):
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inFrameset"]
+
+        def startTagFromHead(self, token):
+            self.parser.parseError("unexpected-start-tag-out-of-my-head",
+                                   {"name": token["name"]})
+            self.tree.openElements.append(self.tree.headPointer)
+            self.parser.phases["inHead"].processStartTag(token)
+            for node in self.tree.openElements[::-1]:
+                if node.name == "head":
+                    self.tree.openElements.remove(node)
+                    break
+
+        def startTagHead(self, token):
+            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+
+        def startTagOther(self, token):
+            self.anythingElse()
+            return token
+
+        def endTagHtmlBodyBr(self, token):
+            self.anythingElse()
+            return token
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def anythingElse(self):
+            self.tree.insertElement(impliedTagToken("body", "StartTag"))
+            self.parser.phase = self.parser.phases["inBody"]
+            self.parser.framesetOK = True
+
+    class InBodyPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
+        # the really-really-really-very crazy mode
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            # Set this to the default handler
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                (("base", "basefont", "bgsound", "command", "link", "meta",
+                  "script", "style", "title"),
+                 self.startTagProcessInHead),
+                ("body", self.startTagBody),
+                ("frameset", self.startTagFrameset),
+                (("address", "article", "aside", "blockquote", "center", "details",
+                  "dir", "div", "dl", "fieldset", "figcaption", "figure",
+                  "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
+                  "section", "summary", "ul"),
+                 self.startTagCloseP),
+                (headingElements, self.startTagHeading),
+                (("pre", "listing"), self.startTagPreListing),
+                ("form", self.startTagForm),
+                (("li", "dd", "dt"), self.startTagListItem),
+                ("plaintext", self.startTagPlaintext),
+                ("a", self.startTagA),
+                (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
+                  "strong", "tt", "u"), self.startTagFormatting),
+                ("nobr", self.startTagNobr),
+                ("button", self.startTagButton),
+                (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
+                ("xmp", self.startTagXmp),
+                ("table", self.startTagTable),
+                (("area", "br", "embed", "img", "keygen", "wbr"),
+                 self.startTagVoidFormatting),
+                (("param", "source", "track"), self.startTagParamSource),
+                ("input", self.startTagInput),
+                ("hr", self.startTagHr),
+                ("image", self.startTagImage),
+                ("isindex", self.startTagIsIndex),
+                ("textarea", self.startTagTextarea),
+                ("iframe", self.startTagIFrame),
+                ("noscript", self.startTagNoscript),
+                (("noembed", "noframes"), self.startTagRawtext),
+                ("select", self.startTagSelect),
+                (("rp", "rt"), self.startTagRpRt),
+                (("option", "optgroup"), self.startTagOpt),
+                (("math"), self.startTagMath),
+                (("svg"), self.startTagSvg),
+                (("caption", "col", "colgroup", "frame", "head",
+                  "tbody", "td", "tfoot", "th", "thead",
+                  "tr"), self.startTagMisplaced)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("body", self.endTagBody),
+                ("html", self.endTagHtml),
+                (("address", "article", "aside", "blockquote", "button", "center",
+                  "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
+                  "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
+                  "section", "summary", "ul"), self.endTagBlock),
+                ("form", self.endTagForm),
+                ("p", self.endTagP),
+                (("dd", "dt", "li"), self.endTagListItem),
+                (headingElements, self.endTagHeading),
+                (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
+                  "strike", "strong", "tt", "u"), self.endTagFormatting),
+                (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
+                ("br", self.endTagBr),
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def isMatchingFormattingElement(self, node1, node2):
+            return (node1.name == node2.name and
+                    node1.namespace == node2.namespace and
+                    node1.attributes == node2.attributes)
+
+        # helper
+        def addFormattingElement(self, token):
+            self.tree.insertElement(token)
+            element = self.tree.openElements[-1]
+
+            matchingElements = []
+            for node in self.tree.activeFormattingElements[::-1]:
+                if node is Marker:
+                    break
+                elif self.isMatchingFormattingElement(node, element):
+                    matchingElements.append(node)
+
+            assert len(matchingElements) <= 3
+            if len(matchingElements) == 3:
+                self.tree.activeFormattingElements.remove(matchingElements[-1])
+            self.tree.activeFormattingElements.append(element)
+
+        # the real deal
+        def processEOF(self):
+            allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
+                                          "tfoot", "th", "thead", "tr", "body",
+                                          "html"))
+            for node in self.tree.openElements[::-1]:
+                if node.name not in allowed_elements:
+                    self.parser.parseError("expected-closing-tag-but-got-eof")
+                    break
+            # Stop parsing
+
+        def processSpaceCharactersDropNewline(self, token):
+            # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
+            # want to drop leading newlines
+            data = token["data"]
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
+            if (data.startswith("\n") and
+                self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
+                    not self.tree.openElements[-1].hasContent()):
+                data = data[1:]
+            if data:
+                self.tree.reconstructActiveFormattingElements()
+                self.tree.insertText(data)
+
+        def processCharacters(self, token):
+            if token["data"] == "\u0000":
+                # The tokenizer should always emit null on its own
+                return
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertText(token["data"])
+            # This must be bad for performance
+            if (self.parser.framesetOK and
+                any([char not in spaceCharacters
+                     for char in token["data"]])):
+                self.parser.framesetOK = False
+
+        def processSpaceCharactersNonPre(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertText(token["data"])
+
+        def startTagProcessInHead(self, token):
+            return self.parser.phases["inHead"].processStartTag(token)
+
+        def startTagBody(self, token):
+            self.parser.parseError("unexpected-start-tag", {"name": "body"})
+            if (len(self.tree.openElements) == 1 or
+                    self.tree.openElements[1].name != "body"):
+                assert self.parser.innerHTML
+            else:
+                self.parser.framesetOK = False
+                for attr, value in token["data"].items():
+                    if attr not in self.tree.openElements[1].attributes:
+                        self.tree.openElements[1].attributes[attr] = value
+
+        def startTagFrameset(self, token):
+            self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
+            if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
+                assert self.parser.innerHTML
+            elif not self.parser.framesetOK:
+                pass
+            else:
+                if self.tree.openElements[1].parent:
+                    self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
+                while self.tree.openElements[-1].name != "html":
+                    self.tree.openElements.pop()
+                self.tree.insertElement(token)
+                self.parser.phase = self.parser.phases["inFrameset"]
+
+        def startTagCloseP(self, token):
+            if self.tree.elementInScope("p", variant="button"):
+                self.endTagP(impliedTagToken("p"))
+            self.tree.insertElement(token)
+
+        def startTagPreListing(self, token):
+            if self.tree.elementInScope("p", variant="button"):
+                self.endTagP(impliedTagToken("p"))
+            self.tree.insertElement(token)
+            self.parser.framesetOK = False
+            self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+
+        def startTagForm(self, token):
+            if self.tree.formPointer:
+                self.parser.parseError("unexpected-start-tag", {"name": "form"})
+            else:
+                if self.tree.elementInScope("p", variant="button"):
+                    self.endTagP(impliedTagToken("p"))
+                self.tree.insertElement(token)
+                self.tree.formPointer = self.tree.openElements[-1]
+
+        def startTagListItem(self, token):
+            self.parser.framesetOK = False
+
+            stopNamesMap = {"li": ["li"],
+                            "dt": ["dt", "dd"],
+                            "dd": ["dt", "dd"]}
+            stopNames = stopNamesMap[token["name"]]
+            for node in reversed(self.tree.openElements):
+                if node.name in stopNames:
+                    self.parser.phase.processEndTag(
+                        impliedTagToken(node.name, "EndTag"))
+                    break
+                if (node.nameTuple in specialElements and
+                        node.name not in ("address", "div", "p")):
+                    break
+
+            if self.tree.elementInScope("p", variant="button"):
+                self.parser.phase.processEndTag(
+                    impliedTagToken("p", "EndTag"))
+
+            self.tree.insertElement(token)
+
+        def startTagPlaintext(self, token):
+            if self.tree.elementInScope("p", variant="button"):
+                self.endTagP(impliedTagToken("p"))
+            self.tree.insertElement(token)
+            self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
+
+        def startTagHeading(self, token):
+            if self.tree.elementInScope("p", variant="button"):
+                self.endTagP(impliedTagToken("p"))
+            if self.tree.openElements[-1].name in headingElements:
+                self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+                self.tree.openElements.pop()
+            self.tree.insertElement(token)
+
+        def startTagA(self, token):
+            afeAElement = self.tree.elementInActiveFormattingElements("a")
+            if afeAElement:
+                self.parser.parseError("unexpected-start-tag-implies-end-tag",
+                                       {"startName": "a", "endName": "a"})
+                self.endTagFormatting(impliedTagToken("a"))
+                if afeAElement in self.tree.openElements:
+                    self.tree.openElements.remove(afeAElement)
+                if afeAElement in self.tree.activeFormattingElements:
+                    self.tree.activeFormattingElements.remove(afeAElement)
+            self.tree.reconstructActiveFormattingElements()
+            self.addFormattingElement(token)
+
+        def startTagFormatting(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.addFormattingElement(token)
+
+        def startTagNobr(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            if self.tree.elementInScope("nobr"):
+                self.parser.parseError("unexpected-start-tag-implies-end-tag",
+                                       {"startName": "nobr", "endName": "nobr"})
+                self.processEndTag(impliedTagToken("nobr"))
+                # XXX Need tests that trigger the following
+                self.tree.reconstructActiveFormattingElements()
+            self.addFormattingElement(token)
+
+        def startTagButton(self, token):
+            if self.tree.elementInScope("button"):
+                self.parser.parseError("unexpected-start-tag-implies-end-tag",
+                                       {"startName": "button", "endName": "button"})
+                self.processEndTag(impliedTagToken("button"))
+                return token
+            else:
+                self.tree.reconstructActiveFormattingElements()
+                self.tree.insertElement(token)
+                self.parser.framesetOK = False
+
+        def startTagAppletMarqueeObject(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertElement(token)
+            self.tree.activeFormattingElements.append(Marker)
+            self.parser.framesetOK = False
+
+        def startTagXmp(self, token):
+            if self.tree.elementInScope("p", variant="button"):
+                self.endTagP(impliedTagToken("p"))
+            self.tree.reconstructActiveFormattingElements()
+            self.parser.framesetOK = False
+            self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+        def startTagTable(self, token):
+            if self.parser.compatMode != "quirks":
+                if self.tree.elementInScope("p", variant="button"):
+                    self.processEndTag(impliedTagToken("p"))
+            self.tree.insertElement(token)
+            self.parser.framesetOK = False
+            self.parser.phase = self.parser.phases["inTable"]
+
+        def startTagVoidFormatting(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+            self.parser.framesetOK = False
+
+        def startTagInput(self, token):
+            framesetOK = self.parser.framesetOK
+            self.startTagVoidFormatting(token)
+            if ("type" in token["data"] and
+                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
+                # input type=hidden doesn't change framesetOK
+                self.parser.framesetOK = framesetOK
+
+        def startTagParamSource(self, token):
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+
+        def startTagHr(self, token):
+            if self.tree.elementInScope("p", variant="button"):
+                self.endTagP(impliedTagToken("p"))
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+            self.parser.framesetOK = False
+
+        def startTagImage(self, token):
+            # No really...
+            self.parser.parseError("unexpected-start-tag-treated-as",
+                                   {"originalName": "image", "newName": "img"})
+            self.processStartTag(impliedTagToken("img", "StartTag",
+                                                 attributes=token["data"],
+                                                 selfClosing=token["selfClosing"]))
+
+        def startTagIsIndex(self, token):
+            self.parser.parseError("deprecated-tag", {"name": "isindex"})
+            if self.tree.formPointer:
+                return
+            form_attrs = {}
+            if "action" in token["data"]:
+                form_attrs["action"] = token["data"]["action"]
+            self.processStartTag(impliedTagToken("form", "StartTag",
+                                                 attributes=form_attrs))
+            self.processStartTag(impliedTagToken("hr", "StartTag"))
+            self.processStartTag(impliedTagToken("label", "StartTag"))
+            # XXX Localization ...
+            if "prompt" in token["data"]:
+                prompt = token["data"]["prompt"]
+            else:
+                prompt = "This is a searchable index. Enter search keywords: "
+            self.processCharacters(
+                {"type": tokenTypes["Characters"], "data": prompt})
+            attributes = token["data"].copy()
+            if "action" in attributes:
+                del attributes["action"]
+            if "prompt" in attributes:
+                del attributes["prompt"]
+            attributes["name"] = "isindex"
+            self.processStartTag(impliedTagToken("input", "StartTag",
+                                                 attributes=attributes,
+                                                 selfClosing=token["selfClosing"]))
+            self.processEndTag(impliedTagToken("label"))
+            self.processStartTag(impliedTagToken("hr", "StartTag"))
+            self.processEndTag(impliedTagToken("form"))
+
+        def startTagTextarea(self, token):
+            self.tree.insertElement(token)
+            self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
+            self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+            self.parser.framesetOK = False
+
+        def startTagIFrame(self, token):
+            self.parser.framesetOK = False
+            self.startTagRawtext(token)
+
+        def startTagNoscript(self, token):
+            if self.parser.scripting:
+                self.startTagRawtext(token)
+            else:
+                self.startTagOther(token)
+
+        def startTagRawtext(self, token):
+            """iframe, noembed noframes, noscript(if scripting enabled)"""
+            self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+        def startTagOpt(self, token):
+            if self.tree.openElements[-1].name == "option":
+                self.parser.phase.processEndTag(impliedTagToken("option"))
+            self.tree.reconstructActiveFormattingElements()
+            self.parser.tree.insertElement(token)
+
+        def startTagSelect(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertElement(token)
+            self.parser.framesetOK = False
+            if self.parser.phase in (self.parser.phases["inTable"],
+                                     self.parser.phases["inCaption"],
+                                     self.parser.phases["inColumnGroup"],
+                                     self.parser.phases["inTableBody"],
+                                     self.parser.phases["inRow"],
+                                     self.parser.phases["inCell"]):
+                self.parser.phase = self.parser.phases["inSelectInTable"]
+            else:
+                self.parser.phase = self.parser.phases["inSelect"]
+
+        def startTagRpRt(self, token):
+            if self.tree.elementInScope("ruby"):
+                self.tree.generateImpliedEndTags()
+                if self.tree.openElements[-1].name != "ruby":
+                    self.parser.parseError()
+            self.tree.insertElement(token)
+
+        def startTagMath(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.parser.adjustMathMLAttributes(token)
+            self.parser.adjustForeignAttributes(token)
+            token["namespace"] = namespaces["mathml"]
+            self.tree.insertElement(token)
+            # Need to get the parse error right for the case where the token
+            # has a namespace not equal to the xmlns attribute
+            if token["selfClosing"]:
+                self.tree.openElements.pop()
+                token["selfClosingAcknowledged"] = True
+
+        def startTagSvg(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.parser.adjustSVGAttributes(token)
+            self.parser.adjustForeignAttributes(token)
+            token["namespace"] = namespaces["svg"]
+            self.tree.insertElement(token)
+            # Need to get the parse error right for the case where the token
+            # has a namespace not equal to the xmlns attribute
+            if token["selfClosing"]:
+                self.tree.openElements.pop()
+                token["selfClosingAcknowledged"] = True
+
+        def startTagMisplaced(self, token):
+            """ Elements that should be children of other elements that have a
+            different insertion mode; here they are ignored
+            "caption", "col", "colgroup", "frame", "frameset", "head",
+            "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
+            "tr", "noscript"
+            """
+            self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
+
+        def startTagOther(self, token):
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertElement(token)
+
+        def endTagP(self, token):
+            if not self.tree.elementInScope("p", variant="button"):
+                self.startTagCloseP(impliedTagToken("p", "StartTag"))
+                self.parser.parseError("unexpected-end-tag", {"name": "p"})
+                self.endTagP(impliedTagToken("p", "EndTag"))
+            else:
+                self.tree.generateImpliedEndTags("p")
+                if self.tree.openElements[-1].name != "p":
+                    self.parser.parseError("unexpected-end-tag", {"name": "p"})
+                node = self.tree.openElements.pop()
+                while node.name != "p":
+                    node = self.tree.openElements.pop()
+
+        def endTagBody(self, token):
+            if not self.tree.elementInScope("body"):
+                self.parser.parseError()
+                return
+            elif self.tree.openElements[-1].name != "body":
+                for node in self.tree.openElements[2:]:
+                    if node.name not in frozenset(("dd", "dt", "li", "optgroup",
+                                                   "option", "p", "rp", "rt",
+                                                   "tbody", "td", "tfoot",
+                                                   "th", "thead", "tr", "body",
+                                                   "html")):
+                        # Not sure this is the correct name for the parse error
+                        self.parser.parseError(
+                            "expected-one-end-tag-but-got-another",
+                            {"gotName": "body", "expectedName": node.name})
+                        break
+            self.parser.phase = self.parser.phases["afterBody"]
+
+        def endTagHtml(self, token):
+            # We repeat the test for the body end tag token being ignored here
+            if self.tree.elementInScope("body"):
+                self.endTagBody(impliedTagToken("body"))
+                return token
+
+        def endTagBlock(self, token):
+            # Put us back in the right whitespace handling mode
+            if token["name"] == "pre":
+                self.processSpaceCharacters = self.processSpaceCharactersNonPre
+            inScope = self.tree.elementInScope(token["name"])
+            if inScope:
+                self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != token["name"]:
+                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
+            if inScope:
+                node = self.tree.openElements.pop()
+                while node.name != token["name"]:
+                    node = self.tree.openElements.pop()
+
+        def endTagForm(self, token):
+            node = self.tree.formPointer
+            self.tree.formPointer = None
+            if node is None or not self.tree.elementInScope(node):
+                self.parser.parseError("unexpected-end-tag",
+                                       {"name": "form"})
+            else:
+                self.tree.generateImpliedEndTags()
+                if self.tree.openElements[-1] != node:
+                    self.parser.parseError("end-tag-too-early-ignored",
+                                           {"name": "form"})
+                self.tree.openElements.remove(node)
+
+        def endTagListItem(self, token):
+            if token["name"] == "li":
+                variant = "list"
+            else:
+                variant = None
+            if not self.tree.elementInScope(token["name"], variant=variant):
+                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+            else:
+                self.tree.generateImpliedEndTags(exclude=token["name"])
+                if self.tree.openElements[-1].name != token["name"]:
+                    self.parser.parseError(
+                        "end-tag-too-early",
+                        {"name": token["name"]})
+                node = self.tree.openElements.pop()
+                while node.name != token["name"]:
+                    node = self.tree.openElements.pop()
+
+        def endTagHeading(self, token):
+            for item in headingElements:
+                if self.tree.elementInScope(item):
+                    self.tree.generateImpliedEndTags()
+                    break
+            if self.tree.openElements[-1].name != token["name"]:
+                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
+
+            for item in headingElements:
+                if self.tree.elementInScope(item):
+                    item = self.tree.openElements.pop()
+                    while item.name not in headingElements:
+                        item = self.tree.openElements.pop()
+                    break
+
+        def endTagFormatting(self, token):
+            """The much-feared adoption agency algorithm"""
+            # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
+            # XXX Better parseError messages appreciated.
+
+            # Step 1
+            outerLoopCounter = 0
+
+            # Step 2
+            while outerLoopCounter < 8:
+
+                # Step 3
+                outerLoopCounter += 1
+
+                # Step 4:
+
+                # Let the formatting element be the last element in
+                # the list of active formatting elements that:
+                # - is between the end of the list and the last scope
+                # marker in the list, if any, or the start of the list
+                # otherwise, and
+                # - has the same tag name as the token.
+                formattingElement = self.tree.elementInActiveFormattingElements(
+                    token["name"])
+                if (not formattingElement or
+                    (formattingElement in self.tree.openElements and
+                     not self.tree.elementInScope(formattingElement.name))):
+                    # If there is no such node, then abort these steps
+                    # and instead act as described in the "any other
+                    # end tag" entry below.
+                    self.endTagOther(token)
+                    return
+
+                # Otherwise, if there is such a node, but that node is
+                # not in the stack of open elements, then this is a
+                # parse error; remove the element from the list, and
+                # abort these steps.
+                elif formattingElement not in self.tree.openElements:
+                    self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
+                    self.tree.activeFormattingElements.remove(formattingElement)
+                    return
+
+                # Otherwise, if there is such a node, and that node is
+                # also in the stack of open elements, but the element
+                # is not in scope, then this is a parse error; ignore
+                # the token, and abort these steps.
+                elif not self.tree.elementInScope(formattingElement.name):
+                    self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
+                    return
+
+                # Otherwise, there is a formatting element and that
+                # element is in the stack and is in scope. If the
+                # element is not the current node, this is a parse
+                # error. In any case, proceed with the algorithm as
+                # written in the following steps.
+                else:
+                    if formattingElement != self.tree.openElements[-1]:
+                        self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
+
+                # Step 5:
+
+                # Let the furthest block be the topmost node in the
+                # stack of open elements that is lower in the stack
+                # than the formatting element, and is an element in
+                # the special category. There might not be one.
+                afeIndex = self.tree.openElements.index(formattingElement)
+                furthestBlock = None
+                for element in self.tree.openElements[afeIndex:]:
+                    if element.nameTuple in specialElements:
+                        furthestBlock = element
+                        break
+
+                # Step 6:
+
+                # If there is no furthest block, then the UA must
+                # first pop all the nodes from the bottom of the stack
+                # of open elements, from the current node up to and
+                # including the formatting element, then remove the
+                # formatting element from the list of active
+                # formatting elements, and finally abort these steps.
+                if furthestBlock is None:
+                    element = self.tree.openElements.pop()
+                    while element != formattingElement:
+                        element = self.tree.openElements.pop()
+                    self.tree.activeFormattingElements.remove(element)
+                    return
+
+                # Step 7
+                commonAncestor = self.tree.openElements[afeIndex - 1]
+
+                # Step 8:
+                # The bookmark is supposed to help us identify where to reinsert
+                # nodes in step 15. We have to ensure that we reinsert nodes after
+                # the node before the active formatting element. Note the bookmark
+                # can move in step 9.7
+                bookmark = self.tree.activeFormattingElements.index(formattingElement)
+
+                # Step 9
+                lastNode = node = furthestBlock
+                innerLoopCounter = 0
+
+                index = self.tree.openElements.index(node)
+                while innerLoopCounter < 3:
+                    innerLoopCounter += 1
+                    # Node is element before node in open elements
+                    index -= 1
+                    node = self.tree.openElements[index]
+                    if node not in self.tree.activeFormattingElements:
+                        self.tree.openElements.remove(node)
+                        continue
+                    # Step 9.6
+                    if node == formattingElement:
+                        break
+                    # Step 9.7
+                    if lastNode == furthestBlock:
+                        bookmark = self.tree.activeFormattingElements.index(node) + 1
+                    # Step 9.8
+                    clone = node.cloneNode()
+                    # Replace node with clone
+                    self.tree.activeFormattingElements[
+                        self.tree.activeFormattingElements.index(node)] = clone
+                    self.tree.openElements[
+                        self.tree.openElements.index(node)] = clone
+                    node = clone
+                    # Step 9.9
+                    # Remove lastNode from its parents, if any
+                    if lastNode.parent:
+                        lastNode.parent.removeChild(lastNode)
+                    node.appendChild(lastNode)
+                    # Step 9.10
+                    lastNode = node
+
+                # Step 10
+                # Foster parent lastNode if commonAncestor is a
+                # table, tbody, tfoot, thead, or tr we need to foster
+                # parent the lastNode
+                if lastNode.parent:
+                    lastNode.parent.removeChild(lastNode)
+
+                if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
+                    parent, insertBefore = self.tree.getTableMisnestedNodePosition()
+                    parent.insertBefore(lastNode, insertBefore)
+                else:
+                    commonAncestor.appendChild(lastNode)
+
+                # Step 11
+                clone = formattingElement.cloneNode()
+
+                # Step 12
+                furthestBlock.reparentChildren(clone)
+
+                # Step 13
+                furthestBlock.appendChild(clone)
+
+                # Step 14
+                self.tree.activeFormattingElements.remove(formattingElement)
+                self.tree.activeFormattingElements.insert(bookmark, clone)
+
+                # Step 15
+                self.tree.openElements.remove(formattingElement)
+                self.tree.openElements.insert(
+                    self.tree.openElements.index(furthestBlock) + 1, clone)
+
+        def endTagAppletMarqueeObject(self, token):
+            if self.tree.elementInScope(token["name"]):
+                self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != token["name"]:
+                self.parser.parseError("end-tag-too-early", {"name": token["name"]})
+
+            if self.tree.elementInScope(token["name"]):
+                element = self.tree.openElements.pop()
+                while element.name != token["name"]:
+                    element = self.tree.openElements.pop()
+                self.tree.clearActiveFormattingElements()
+
+        def endTagBr(self, token):
+            self.parser.parseError("unexpected-end-tag-treated-as",
+                                   {"originalName": "br", "newName": "br element"})
+            self.tree.reconstructActiveFormattingElements()
+            self.tree.insertElement(impliedTagToken("br", "StartTag"))
+            self.tree.openElements.pop()
+
+        def endTagOther(self, token):
+            for node in self.tree.openElements[::-1]:
+                if node.name == token["name"]:
+                    self.tree.generateImpliedEndTags(exclude=token["name"])
+                    if self.tree.openElements[-1].name != token["name"]:
+                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+                    while self.tree.openElements.pop() != node:
+                        pass
+                    break
+                else:
+                    if node.nameTuple in specialElements:
+                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+                        break
+
+    class TextPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+            self.startTagHandler = _utils.MethodDispatcher([])
+            self.startTagHandler.default = self.startTagOther
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("script", self.endTagScript)])
+            self.endTagHandler.default = self.endTagOther
+
+        def processCharacters(self, token):
+            self.tree.insertText(token["data"])
+
+        def processEOF(self):
+            self.parser.parseError("expected-named-closing-tag-but-got-eof",
+                                   {"name": self.tree.openElements[-1].name})
+            self.tree.openElements.pop()
+            self.parser.phase = self.parser.originalPhase
+            return True
+
+        def startTagOther(self, token):
+            assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
+
+        def endTagScript(self, token):
+            node = self.tree.openElements.pop()
+            assert node.name == "script"
+            self.parser.phase = self.parser.originalPhase
+            # The rest of this method is all stuff that only happens if
+            # document.write works
+
+        def endTagOther(self, token):
+            self.tree.openElements.pop()
+            self.parser.phase = self.parser.originalPhase
+
+    class InTablePhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-table
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("caption", self.startTagCaption),
+                ("colgroup", self.startTagColgroup),
+                ("col", self.startTagCol),
+                (("tbody", "tfoot", "thead"), self.startTagRowGroup),
+                (("td", "th", "tr"), self.startTagImplyTbody),
+                ("table", self.startTagTable),
+                (("style", "script"), self.startTagStyleScript),
+                ("input", self.startTagInput),
+                ("form", self.startTagForm)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("table", self.endTagTable),
+                (("body", "caption", "col", "colgroup", "html", "tbody", "td",
+                  "tfoot", "th", "thead", "tr"), self.endTagIgnore)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        # helper methods
+        def clearStackToTableContext(self):
+            # "clear the stack back to a table context"
+            while self.tree.openElements[-1].name not in ("table", "html"):
+                # self.parser.parseError("unexpected-implied-end-tag-in-table",
+                #  {"name":  self.tree.openElements[-1].name})
+                self.tree.openElements.pop()
+            # When the current node is <html> it's an innerHTML case
+
+        # processing methods
+        def processEOF(self):
+            if self.tree.openElements[-1].name != "html":
+                self.parser.parseError("eof-in-table")
+            else:
+                assert self.parser.innerHTML
+            # Stop parsing
+
+        def processSpaceCharacters(self, token):
+            originalPhase = self.parser.phase
+            self.parser.phase = self.parser.phases["inTableText"]
+            self.parser.phase.originalPhase = originalPhase
+            self.parser.phase.processSpaceCharacters(token)
+
+        def processCharacters(self, token):
+            originalPhase = self.parser.phase
+            self.parser.phase = self.parser.phases["inTableText"]
+            self.parser.phase.originalPhase = originalPhase
+            self.parser.phase.processCharacters(token)
+
+        def insertText(self, token):
+            # If we get here there must be at least one non-whitespace character
+            # Do the table magic!
+            self.tree.insertFromTable = True
+            self.parser.phases["inBody"].processCharacters(token)
+            self.tree.insertFromTable = False
+
+        def startTagCaption(self, token):
+            self.clearStackToTableContext()
+            self.tree.activeFormattingElements.append(Marker)
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inCaption"]
+
+        def startTagColgroup(self, token):
+            self.clearStackToTableContext()
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inColumnGroup"]
+
+        def startTagCol(self, token):
+            self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
+            return token
+
+        def startTagRowGroup(self, token):
+            self.clearStackToTableContext()
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inTableBody"]
+
+        def startTagImplyTbody(self, token):
+            self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
+            return token
+
+        def startTagTable(self, token):
+            self.parser.parseError("unexpected-start-tag-implies-end-tag",
+                                   {"startName": "table", "endName": "table"})
+            self.parser.phase.processEndTag(impliedTagToken("table"))
+            if not self.parser.innerHTML:
+                return token
+
+        def startTagStyleScript(self, token):
+            return self.parser.phases["inHead"].processStartTag(token)
+
+        def startTagInput(self, token):
+            if ("type" in token["data"] and
+                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
+                self.parser.parseError("unexpected-hidden-input-in-table")
+                self.tree.insertElement(token)
+                # XXX associate with form
+                self.tree.openElements.pop()
+            else:
+                self.startTagOther(token)
+
+        def startTagForm(self, token):
+            self.parser.parseError("unexpected-form-in-table")
+            if self.tree.formPointer is None:
+                self.tree.insertElement(token)
+                self.tree.formPointer = self.tree.openElements[-1]
+                self.tree.openElements.pop()
+
+        def startTagOther(self, token):
+            self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
+            # Do the table magic!
+            self.tree.insertFromTable = True
+            self.parser.phases["inBody"].processStartTag(token)
+            self.tree.insertFromTable = False
+
+        def endTagTable(self, token):
+            if self.tree.elementInScope("table", variant="table"):
+                self.tree.generateImpliedEndTags()
+                if self.tree.openElements[-1].name != "table":
+                    self.parser.parseError("end-tag-too-early-named",
+                                           {"gotName": "table",
+                                            "expectedName": self.tree.openElements[-1].name})
+                while self.tree.openElements[-1].name != "table":
+                    self.tree.openElements.pop()
+                self.tree.openElements.pop()
+                self.parser.resetInsertionMode()
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def endTagIgnore(self, token):
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
+            # Do the table magic!
+            self.tree.insertFromTable = True
+            self.parser.phases["inBody"].processEndTag(token)
+            self.tree.insertFromTable = False
+
+    class InTableTextPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+            self.originalPhase = None
+            self.characterTokens = []
+
+        def flushCharacters(self):
+            data = "".join([item["data"] for item in self.characterTokens])
+            if any([item not in spaceCharacters for item in data]):
+                token = {"type": tokenTypes["Characters"], "data": data}
+                self.parser.phases["inTable"].insertText(token)
+            elif data:
+                self.tree.insertText(data)
+            self.characterTokens = []
+
+        def processComment(self, token):
+            self.flushCharacters()
+            self.parser.phase = self.originalPhase
+            return token
+
+        def processEOF(self):
+            self.flushCharacters()
+            self.parser.phase = self.originalPhase
+            return True
+
+        def processCharacters(self, token):
+            if token["data"] == "\u0000":
+                return
+            self.characterTokens.append(token)
+
+        def processSpaceCharacters(self, token):
+            # pretty sure we should never reach here
+            self.characterTokens.append(token)
+    #        assert False
+
+        def processStartTag(self, token):
+            self.flushCharacters()
+            self.parser.phase = self.originalPhase
+            return token
+
+        def processEndTag(self, token):
+            self.flushCharacters()
+            self.parser.phase = self.originalPhase
+            return token
+
+    class InCaptionPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
+                  "thead", "tr"), self.startTagTableElement)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("caption", self.endTagCaption),
+                ("table", self.endTagTable),
+                (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
+                  "thead", "tr"), self.endTagIgnore)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def ignoreEndTagCaption(self):
+            return not self.tree.elementInScope("caption", variant="table")
+
+        def processEOF(self):
+            self.parser.phases["inBody"].processEOF()
+
+        def processCharacters(self, token):
+            return self.parser.phases["inBody"].processCharacters(token)
+
+        def startTagTableElement(self, token):
+            self.parser.parseError()
+            # XXX Have to duplicate logic here to find out if the tag is ignored
+            ignoreEndTag = self.ignoreEndTagCaption()
+            self.parser.phase.processEndTag(impliedTagToken("caption"))
+            if not ignoreEndTag:
+                return token
+
+        def startTagOther(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def endTagCaption(self, token):
+            if not self.ignoreEndTagCaption():
+                # AT this code is quite similar to endTagTable in "InTable"
+                self.tree.generateImpliedEndTags()
+                if self.tree.openElements[-1].name != "caption":
+                    self.parser.parseError("expected-one-end-tag-but-got-another",
+                                           {"gotName": "caption",
+                                            "expectedName": self.tree.openElements[-1].name})
+                while self.tree.openElements[-1].name != "caption":
+                    self.tree.openElements.pop()
+                self.tree.openElements.pop()
+                self.tree.clearActiveFormattingElements()
+                self.parser.phase = self.parser.phases["inTable"]
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def endTagTable(self, token):
+            self.parser.parseError()
+            ignoreEndTag = self.ignoreEndTagCaption()
+            self.parser.phase.processEndTag(impliedTagToken("caption"))
+            if not ignoreEndTag:
+                return token
+
+        def endTagIgnore(self, token):
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def endTagOther(self, token):
+            return self.parser.phases["inBody"].processEndTag(token)
+
+    class InColumnGroupPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-column
+
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("col", self.startTagCol)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("colgroup", self.endTagColgroup),
+                ("col", self.endTagCol)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def ignoreEndTagColgroup(self):
+            return self.tree.openElements[-1].name == "html"
+
+        def processEOF(self):
+            if self.tree.openElements[-1].name == "html":
+                assert self.parser.innerHTML
+                return
+            else:
+                ignoreEndTag = self.ignoreEndTagColgroup()
+                self.endTagColgroup(impliedTagToken("colgroup"))
+                if not ignoreEndTag:
+                    return True
+
+        def processCharacters(self, token):
+            ignoreEndTag = self.ignoreEndTagColgroup()
+            self.endTagColgroup(impliedTagToken("colgroup"))
+            if not ignoreEndTag:
+                return token
+
+        def startTagCol(self, token):
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+
+        def startTagOther(self, token):
+            ignoreEndTag = self.ignoreEndTagColgroup()
+            self.endTagColgroup(impliedTagToken("colgroup"))
+            if not ignoreEndTag:
+                return token
+
+        def endTagColgroup(self, token):
+            if self.ignoreEndTagColgroup():
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+            else:
+                self.tree.openElements.pop()
+                self.parser.phase = self.parser.phases["inTable"]
+
+        def endTagCol(self, token):
+            self.parser.parseError("no-end-tag", {"name": "col"})
+
+        def endTagOther(self, token):
+            ignoreEndTag = self.ignoreEndTagColgroup()
+            self.endTagColgroup(impliedTagToken("colgroup"))
+            if not ignoreEndTag:
+                return token
+
+    class InTableBodyPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("tr", self.startTagTr),
+                (("td", "th"), self.startTagTableCell),
+                (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
+                 self.startTagTableOther)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
+                ("table", self.endTagTable),
+                (("body", "caption", "col", "colgroup", "html", "td", "th",
+                  "tr"), self.endTagIgnore)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        # helper methods
+        def clearStackToTableBodyContext(self):
+            while self.tree.openElements[-1].name not in ("tbody", "tfoot",
+                                                          "thead", "html"):
+                # self.parser.parseError("unexpected-implied-end-tag-in-table",
+                #  {"name": self.tree.openElements[-1].name})
+                self.tree.openElements.pop()
+            if self.tree.openElements[-1].name == "html":
+                assert self.parser.innerHTML
+
+        # the rest
+        def processEOF(self):
+            self.parser.phases["inTable"].processEOF()
+
+        def processSpaceCharacters(self, token):
+            return self.parser.phases["inTable"].processSpaceCharacters(token)
+
+        def processCharacters(self, token):
+            return self.parser.phases["inTable"].processCharacters(token)
+
+        def startTagTr(self, token):
+            self.clearStackToTableBodyContext()
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inRow"]
+
+        def startTagTableCell(self, token):
+            self.parser.parseError("unexpected-cell-in-table-body",
+                                   {"name": token["name"]})
+            self.startTagTr(impliedTagToken("tr", "StartTag"))
+            return token
+
+        def startTagTableOther(self, token):
+            # XXX AT Any ideas on how to share this with endTagTable?
+            if (self.tree.elementInScope("tbody", variant="table") or
+                self.tree.elementInScope("thead", variant="table") or
+                    self.tree.elementInScope("tfoot", variant="table")):
+                self.clearStackToTableBodyContext()
+                self.endTagTableRowGroup(
+                    impliedTagToken(self.tree.openElements[-1].name))
+                return token
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def startTagOther(self, token):
+            return self.parser.phases["inTable"].processStartTag(token)
+
+        def endTagTableRowGroup(self, token):
+            if self.tree.elementInScope(token["name"], variant="table"):
+                self.clearStackToTableBodyContext()
+                self.tree.openElements.pop()
+                self.parser.phase = self.parser.phases["inTable"]
+            else:
+                self.parser.parseError("unexpected-end-tag-in-table-body",
+                                       {"name": token["name"]})
+
+        def endTagTable(self, token):
+            if (self.tree.elementInScope("tbody", variant="table") or
+                self.tree.elementInScope("thead", variant="table") or
+                    self.tree.elementInScope("tfoot", variant="table")):
+                self.clearStackToTableBodyContext()
+                self.endTagTableRowGroup(
+                    impliedTagToken(self.tree.openElements[-1].name))
+                return token
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def endTagIgnore(self, token):
+            self.parser.parseError("unexpected-end-tag-in-table-body",
+                                   {"name": token["name"]})
+
+        def endTagOther(self, token):
+            return self.parser.phases["inTable"].processEndTag(token)
+
+    class InRowPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-row
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                (("td", "th"), self.startTagTableCell),
+                (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
+                  "tr"), self.startTagTableOther)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("tr", self.endTagTr),
+                ("table", self.endTagTable),
+                (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
+                (("body", "caption", "col", "colgroup", "html", "td", "th"),
+                 self.endTagIgnore)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        # helper methods (XXX unify this with other table helper methods)
+        def clearStackToTableRowContext(self):
+            while self.tree.openElements[-1].name not in ("tr", "html"):
+                self.parser.parseError("unexpected-implied-end-tag-in-table-row",
+                                       {"name": self.tree.openElements[-1].name})
+                self.tree.openElements.pop()
+
+        def ignoreEndTagTr(self):
+            return not self.tree.elementInScope("tr", variant="table")
+
+        # the rest
+        def processEOF(self):
+            self.parser.phases["inTable"].processEOF()
+
+        def processSpaceCharacters(self, token):
+            return self.parser.phases["inTable"].processSpaceCharacters(token)
+
+        def processCharacters(self, token):
+            return self.parser.phases["inTable"].processCharacters(token)
+
+        def startTagTableCell(self, token):
+            self.clearStackToTableRowContext()
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inCell"]
+            self.tree.activeFormattingElements.append(Marker)
+
+        def startTagTableOther(self, token):
+            ignoreEndTag = self.ignoreEndTagTr()
+            self.endTagTr(impliedTagToken("tr"))
+            # XXX how are we sure it's always ignored in the innerHTML case?
+            if not ignoreEndTag:
+                return token
+
+        def startTagOther(self, token):
+            return self.parser.phases["inTable"].processStartTag(token)
+
+        def endTagTr(self, token):
+            if not self.ignoreEndTagTr():
+                self.clearStackToTableRowContext()
+                self.tree.openElements.pop()
+                self.parser.phase = self.parser.phases["inTableBody"]
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def endTagTable(self, token):
+            ignoreEndTag = self.ignoreEndTagTr()
+            self.endTagTr(impliedTagToken("tr"))
+            # Reprocess the current tag if the tr end tag was not ignored
+            # XXX how are we sure it's always ignored in the innerHTML case?
+            if not ignoreEndTag:
+                return token
+
+        def endTagTableRowGroup(self, token):
+            if self.tree.elementInScope(token["name"], variant="table"):
+                self.endTagTr(impliedTagToken("tr"))
+                return token
+            else:
+                self.parser.parseError()
+
+        def endTagIgnore(self, token):
+            self.parser.parseError("unexpected-end-tag-in-table-row",
+                                   {"name": token["name"]})
+
+        def endTagOther(self, token):
+            return self.parser.phases["inTable"].processEndTag(token)
+
+    class InCellPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
+                  "thead", "tr"), self.startTagTableOther)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                (("td", "th"), self.endTagTableCell),
+                (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
+                (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        # helper
+        def closeCell(self):
+            if self.tree.elementInScope("td", variant="table"):
+                self.endTagTableCell(impliedTagToken("td"))
+            elif self.tree.elementInScope("th", variant="table"):
+                self.endTagTableCell(impliedTagToken("th"))
+
+        # the rest
+        def processEOF(self):
+            self.parser.phases["inBody"].processEOF()
+
+        def processCharacters(self, token):
+            return self.parser.phases["inBody"].processCharacters(token)
+
+        def startTagTableOther(self, token):
+            if (self.tree.elementInScope("td", variant="table") or
+                    self.tree.elementInScope("th", variant="table")):
+                self.closeCell()
+                return token
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def startTagOther(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def endTagTableCell(self, token):
+            if self.tree.elementInScope(token["name"], variant="table"):
+                self.tree.generateImpliedEndTags(token["name"])
+                if self.tree.openElements[-1].name != token["name"]:
+                    self.parser.parseError("unexpected-cell-end-tag",
+                                           {"name": token["name"]})
+                    while True:
+                        node = self.tree.openElements.pop()
+                        if node.name == token["name"]:
+                            break
+                else:
+                    self.tree.openElements.pop()
+                self.tree.clearActiveFormattingElements()
+                self.parser.phase = self.parser.phases["inRow"]
+            else:
+                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def endTagIgnore(self, token):
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+        def endTagImply(self, token):
+            if self.tree.elementInScope(token["name"], variant="table"):
+                self.closeCell()
+                return token
+            else:
+                # sometimes innerHTML case
+                self.parser.parseError()
+
+        def endTagOther(self, token):
+            return self.parser.phases["inBody"].processEndTag(token)
+
+    class InSelectPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("option", self.startTagOption),
+                ("optgroup", self.startTagOptgroup),
+                ("select", self.startTagSelect),
+                (("input", "keygen", "textarea"), self.startTagInput),
+                ("script", self.startTagScript)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("option", self.endTagOption),
+                ("optgroup", self.endTagOptgroup),
+                ("select", self.endTagSelect)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-select
+        def processEOF(self):
+            if self.tree.openElements[-1].name != "html":
+                self.parser.parseError("eof-in-select")
+            else:
+                assert self.parser.innerHTML
+
+        def processCharacters(self, token):
+            if token["data"] == "\u0000":
+                return
+            self.tree.insertText(token["data"])
+
+        def startTagOption(self, token):
+            # We need to imply </option> if <option> is the current node.
+            if self.tree.openElements[-1].name == "option":
+                self.tree.openElements.pop()
+            self.tree.insertElement(token)
+
+        def startTagOptgroup(self, token):
+            if self.tree.openElements[-1].name == "option":
+                self.tree.openElements.pop()
+            if self.tree.openElements[-1].name == "optgroup":
+                self.tree.openElements.pop()
+            self.tree.insertElement(token)
+
+        def startTagSelect(self, token):
+            self.parser.parseError("unexpected-select-in-select")
+            self.endTagSelect(impliedTagToken("select"))
+
+        def startTagInput(self, token):
+            self.parser.parseError("unexpected-input-in-select")
+            if self.tree.elementInScope("select", variant="select"):
+                self.endTagSelect(impliedTagToken("select"))
+                return token
+            else:
+                assert self.parser.innerHTML
+
+        def startTagScript(self, token):
+            return self.parser.phases["inHead"].processStartTag(token)
+
+        def startTagOther(self, token):
+            self.parser.parseError("unexpected-start-tag-in-select",
+                                   {"name": token["name"]})
+
+        def endTagOption(self, token):
+            if self.tree.openElements[-1].name == "option":
+                self.tree.openElements.pop()
+            else:
+                self.parser.parseError("unexpected-end-tag-in-select",
+                                       {"name": "option"})
+
+        def endTagOptgroup(self, token):
+            # </optgroup> implicitly closes <option>
+            if (self.tree.openElements[-1].name == "option" and
+                    self.tree.openElements[-2].name == "optgroup"):
+                self.tree.openElements.pop()
+            # It also closes </optgroup>
+            if self.tree.openElements[-1].name == "optgroup":
+                self.tree.openElements.pop()
+            # But nothing else
+            else:
+                self.parser.parseError("unexpected-end-tag-in-select",
+                                       {"name": "optgroup"})
+
+        def endTagSelect(self, token):
+            if self.tree.elementInScope("select", variant="select"):
+                node = self.tree.openElements.pop()
+                while node.name != "select":
+                    node = self.tree.openElements.pop()
+                self.parser.resetInsertionMode()
+            else:
+                # innerHTML case
+                assert self.parser.innerHTML
+                self.parser.parseError()
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag-in-select",
+                                   {"name": token["name"]})
+
+    class InSelectInTablePhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
+                 self.startTagTable)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
+                 self.endTagTable)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            self.parser.phases["inSelect"].processEOF()
+
+        def processCharacters(self, token):
+            return self.parser.phases["inSelect"].processCharacters(token)
+
+        def startTagTable(self, token):
+            self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
+            self.endTagOther(impliedTagToken("select"))
+            return token
+
+        def startTagOther(self, token):
+            return self.parser.phases["inSelect"].processStartTag(token)
+
+        def endTagTable(self, token):
+            self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
+            if self.tree.elementInScope(token["name"], variant="table"):
+                self.endTagOther(impliedTagToken("select"))
+                return token
+
+        def endTagOther(self, token):
+            return self.parser.phases["inSelect"].processEndTag(token)
+
+    class InForeignContentPhase(Phase):
+        breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
+                                      "center", "code", "dd", "div", "dl", "dt",
+                                      "em", "embed", "h1", "h2", "h3",
+                                      "h4", "h5", "h6", "head", "hr", "i", "img",
+                                      "li", "listing", "menu", "meta", "nobr",
+                                      "ol", "p", "pre", "ruby", "s", "small",
+                                      "span", "strong", "strike", "sub", "sup",
+                                      "table", "tt", "u", "ul", "var"])
+
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+        def adjustSVGTagNames(self, token):
+            replacements = {"altglyph": "altGlyph",
+                            "altglyphdef": "altGlyphDef",
+                            "altglyphitem": "altGlyphItem",
+                            "animatecolor": "animateColor",
+                            "animatemotion": "animateMotion",
+                            "animatetransform": "animateTransform",
+                            "clippath": "clipPath",
+                            "feblend": "feBlend",
+                            "fecolormatrix": "feColorMatrix",
+                            "fecomponenttransfer": "feComponentTransfer",
+                            "fecomposite": "feComposite",
+                            "feconvolvematrix": "feConvolveMatrix",
+                            "fediffuselighting": "feDiffuseLighting",
+                            "fedisplacementmap": "feDisplacementMap",
+                            "fedistantlight": "feDistantLight",
+                            "feflood": "feFlood",
+                            "fefunca": "feFuncA",
+                            "fefuncb": "feFuncB",
+                            "fefuncg": "feFuncG",
+                            "fefuncr": "feFuncR",
+                            "fegaussianblur": "feGaussianBlur",
+                            "feimage": "feImage",
+                            "femerge": "feMerge",
+                            "femergenode": "feMergeNode",
+                            "femorphology": "feMorphology",
+                            "feoffset": "feOffset",
+                            "fepointlight": "fePointLight",
+                            "fespecularlighting": "feSpecularLighting",
+                            "fespotlight": "feSpotLight",
+                            "fetile": "feTile",
+                            "feturbulence": "feTurbulence",
+                            "foreignobject": "foreignObject",
+                            "glyphref": "glyphRef",
+                            "lineargradient": "linearGradient",
+                            "radialgradient": "radialGradient",
+                            "textpath": "textPath"}
+
+            if token["name"] in replacements:
+                token["name"] = replacements[token["name"]]
+
+        def processCharacters(self, token):
+            if token["data"] == "\u0000":
+                token["data"] = "\uFFFD"
+            elif (self.parser.framesetOK and
+                  any(char not in spaceCharacters for char in token["data"])):
+                self.parser.framesetOK = False
+            Phase.processCharacters(self, token)
+
+        def processStartTag(self, token):
+            currentNode = self.tree.openElements[-1]
+            if (token["name"] in self.breakoutElements or
+                (token["name"] == "font" and
+                 set(token["data"].keys()) & set(["color", "face", "size"]))):
+                self.parser.parseError("unexpected-html-element-in-foreign-content",
+                                       {"name": token["name"]})
+                while (self.tree.openElements[-1].namespace !=
+                       self.tree.defaultNamespace and
+                       not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
+                       not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
+                    self.tree.openElements.pop()
+                return token
+
+            else:
+                if currentNode.namespace == namespaces["mathml"]:
+                    self.parser.adjustMathMLAttributes(token)
+                elif currentNode.namespace == namespaces["svg"]:
+                    self.adjustSVGTagNames(token)
+                    self.parser.adjustSVGAttributes(token)
+                self.parser.adjustForeignAttributes(token)
+                token["namespace"] = currentNode.namespace
+                self.tree.insertElement(token)
+                if token["selfClosing"]:
+                    self.tree.openElements.pop()
+                    token["selfClosingAcknowledged"] = True
+
+        def processEndTag(self, token):
+            nodeIndex = len(self.tree.openElements) - 1
+            node = self.tree.openElements[-1]
+            if node.name.translate(asciiUpper2Lower) != token["name"]:
+                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+            while True:
+                if node.name.translate(asciiUpper2Lower) == token["name"]:
+                    # XXX this isn't in the spec but it seems necessary
+                    if self.parser.phase == self.parser.phases["inTableText"]:
+                        self.parser.phase.flushCharacters()
+                        self.parser.phase = self.parser.phase.originalPhase
+                    while self.tree.openElements.pop() != node:
+                        assert self.tree.openElements
+                    new_token = None
+                    break
+                nodeIndex -= 1
+
+                node = self.tree.openElements[nodeIndex]
+                if node.namespace != self.tree.defaultNamespace:
+                    continue
+                else:
+                    new_token = self.parser.phase.processEndTag(token)
+                    break
+            return new_token
+
+    class AfterBodyPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            # Stop parsing
+            pass
+
+        def processComment(self, token):
+            # This is needed because data is to be appended to the <html> element
+            # here and not to whatever is currently open.
+            self.tree.insertComment(token, self.tree.openElements[0])
+
+        def processCharacters(self, token):
+            self.parser.parseError("unexpected-char-after-body")
+            self.parser.phase = self.parser.phases["inBody"]
+            return token
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagOther(self, token):
+            self.parser.parseError("unexpected-start-tag-after-body",
+                                   {"name": token["name"]})
+            self.parser.phase = self.parser.phases["inBody"]
+            return token
+
+        def endTagHtml(self, name):
+            if self.parser.innerHTML:
+                self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
+            else:
+                self.parser.phase = self.parser.phases["afterAfterBody"]
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag-after-body",
+                                   {"name": token["name"]})
+            self.parser.phase = self.parser.phases["inBody"]
+            return token
+
+    class InFramesetPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("frameset", self.startTagFrameset),
+                ("frame", self.startTagFrame),
+                ("noframes", self.startTagNoframes)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("frameset", self.endTagFrameset)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            if self.tree.openElements[-1].name != "html":
+                self.parser.parseError("eof-in-frameset")
+            else:
+                assert self.parser.innerHTML
+
+        def processCharacters(self, token):
+            self.parser.parseError("unexpected-char-in-frameset")
+
+        def startTagFrameset(self, token):
+            self.tree.insertElement(token)
+
+        def startTagFrame(self, token):
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()
+
+        def startTagNoframes(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagOther(self, token):
+            self.parser.parseError("unexpected-start-tag-in-frameset",
+                                   {"name": token["name"]})
+
+        def endTagFrameset(self, token):
+            if self.tree.openElements[-1].name == "html":
+                # innerHTML case
+                self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
+            else:
+                self.tree.openElements.pop()
+            if (not self.parser.innerHTML and
+                    self.tree.openElements[-1].name != "frameset"):
+                # If we're not in innerHTML mode and the current node is not a
+                # "frameset" element (anymore) then switch.
+                self.parser.phase = self.parser.phases["afterFrameset"]
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag-in-frameset",
+                                   {"name": token["name"]})
+
+    class AfterFramesetPhase(Phase):
+        # http://www.whatwg.org/specs/web-apps/current-work/#after3
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("noframes", self.startTagNoframes)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+            self.endTagHandler = _utils.MethodDispatcher([
+                ("html", self.endTagHtml)
+            ])
+            self.endTagHandler.default = self.endTagOther
+
+        def processEOF(self):
+            # Stop parsing
+            pass
+
+        def processCharacters(self, token):
+            self.parser.parseError("unexpected-char-after-frameset")
+
+        def startTagNoframes(self, token):
+            return self.parser.phases["inHead"].processStartTag(token)
+
+        def startTagOther(self, token):
+            self.parser.parseError("unexpected-start-tag-after-frameset",
+                                   {"name": token["name"]})
+
+        def endTagHtml(self, token):
+            self.parser.phase = self.parser.phases["afterAfterFrameset"]
+
+        def endTagOther(self, token):
+            self.parser.parseError("unexpected-end-tag-after-frameset",
+                                   {"name": token["name"]})
+
+    class AfterAfterBodyPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+        def processEOF(self):
+            pass
+
+        def processComment(self, token):
+            self.tree.insertComment(token, self.tree.document)
+
+        def processSpaceCharacters(self, token):
+            return self.parser.phases["inBody"].processSpaceCharacters(token)
+
+        def processCharacters(self, token):
+            self.parser.parseError("expected-eof-but-got-char")
+            self.parser.phase = self.parser.phases["inBody"]
+            return token
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagOther(self, token):
+            self.parser.parseError("expected-eof-but-got-start-tag",
+                                   {"name": token["name"]})
+            self.parser.phase = self.parser.phases["inBody"]
+            return token
+
+        def processEndTag(self, token):
+            self.parser.parseError("expected-eof-but-got-end-tag",
+                                   {"name": token["name"]})
+            self.parser.phase = self.parser.phases["inBody"]
+            return token
+
+    class AfterAfterFramesetPhase(Phase):
+        def __init__(self, parser, tree):
+            Phase.__init__(self, parser, tree)
+
+            self.startTagHandler = _utils.MethodDispatcher([
+                ("html", self.startTagHtml),
+                ("noframes", self.startTagNoFrames)
+            ])
+            self.startTagHandler.default = self.startTagOther
+
+        def processEOF(self):
+            pass
+
+        def processComment(self, token):
+            self.tree.insertComment(token, self.tree.document)
+
+        def processSpaceCharacters(self, token):
+            return self.parser.phases["inBody"].processSpaceCharacters(token)
+
+        def processCharacters(self, token):
+            self.parser.parseError("expected-eof-but-got-char")
+
+        def startTagHtml(self, token):
+            return self.parser.phases["inBody"].processStartTag(token)
+
+        def startTagNoFrames(self, token):
+            return self.parser.phases["inHead"].processStartTag(token)
+
+        def startTagOther(self, token):
+            self.parser.parseError("expected-eof-but-got-start-tag",
+                                   {"name": token["name"]})
+
+        def processEndTag(self, token):
+            self.parser.parseError("expected-eof-but-got-end-tag",
+                                   {"name": token["name"]})
+    # pylint:enable=unused-argument
+
+    return {
+        "initial": InitialPhase,
+        "beforeHtml": BeforeHtmlPhase,
+        "beforeHead": BeforeHeadPhase,
+        "inHead": InHeadPhase,
+        "inHeadNoscript": InHeadNoscriptPhase,
+        "afterHead": AfterHeadPhase,
+        "inBody": InBodyPhase,
+        "text": TextPhase,
+        "inTable": InTablePhase,
+        "inTableText": InTableTextPhase,
+        "inCaption": InCaptionPhase,
+        "inColumnGroup": InColumnGroupPhase,
+        "inTableBody": InTableBodyPhase,
+        "inRow": InRowPhase,
+        "inCell": InCellPhase,
+        "inSelect": InSelectPhase,
+        "inSelectInTable": InSelectInTablePhase,
+        "inForeignContent": InForeignContentPhase,
+        "afterBody": AfterBodyPhase,
+        "inFrameset": InFramesetPhase,
+        "afterFrameset": AfterFramesetPhase,
+        "afterAfterBody": AfterAfterBodyPhase,
+        "afterAfterFrameset": AfterAfterFramesetPhase,
+        # XXX after after frameset
+    }
+
+
+def adjust_attributes(token, replacements):
+    if PY3 or _utils.PY27:
+        needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
+    else:
+        needs_adjustment = frozenset(token['data']) & frozenset(replacements)
+    if needs_adjustment:
+        token['data'] = OrderedDict((replacements.get(k, k), v)
+                                    for k, v in token['data'].items())
+
+
+def impliedTagToken(name, type="EndTag", attributes=None,
+                    selfClosing=False):
+    if attributes is None:
+        attributes = {}
+    return {"type": tokenTypes[type], "name": name, "data": attributes,
+            "selfClosing": selfClosing}
+
+
+class ParseError(Exception):
+    """Error in parsed document"""
+    pass
diff --git a/html_cleaner/html5lib/serializer.py b/html_cleaner/html5lib/serializer.py
new file mode 100644
index 0000000..103dd20
--- /dev/null
+++ b/html_cleaner/html5lib/serializer.py
@@ -0,0 +1,334 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+import re
+
+from codecs import register_error, xmlcharrefreplace_errors
+
+from .constants import voidElements, booleanAttributes, spaceCharacters
+from .constants import rcdataElements, entities, xmlEntities
+from . import treewalkers, _utils
+from xml.sax.saxutils import escape
+
+_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
+_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
+_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
+                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+                                   "\u3000]")
+
+
+_encode_entity_map = {}
+_is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+    # skip multi-character entities
+    if ((_is_ucs4 and len(v) > 1) or
+            (not _is_ucs4 and len(v) > 2)):
+        continue
+    if v != "&":
+        if len(v) == 2:
+            v = _utils.surrogatePairToCodepoint(v)
+        else:
+            v = ord(v)
+        if v not in _encode_entity_map or k.islower():
+            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
+            _encode_entity_map[v] = k
+
+
+def htmlentityreplace_errors(exc):
+    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+        res = []
+        codepoints = []
+        skip = False
+        for i, c in enumerate(exc.object[exc.start:exc.end]):
+            if skip:
+                skip = False
+                continue
+            index = i + exc.start
+            if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
+                codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
+                skip = True
+            else:
+                codepoint = ord(c)
+            codepoints.append(codepoint)
+        for cp in codepoints:
+            e = _encode_entity_map.get(cp)
+            if e:
+                res.append("&")
+                res.append(e)
+                if not e.endswith(";"):
+                    res.append(";")
+            else:
+                res.append("&#x%s;" % (hex(cp)[2:]))
+        return ("".join(res), exc.end)
+    else:
+        return xmlcharrefreplace_errors(exc)
+
+register_error("htmlentityreplace", htmlentityreplace_errors)
+
+
+def serialize(input, tree="etree", encoding=None, **serializer_opts):
+    # XXX: Should we cache this?
+    walker = treewalkers.getTreeWalker(tree)
+    s = HTMLSerializer(**serializer_opts)
+    return s.render(walker(input), encoding)
+
+
+class HTMLSerializer(object):
+
+    # attribute quoting options
+    quote_attr_values = "legacy"  # be secure by default
+    quote_char = '"'
+    use_best_quote_char = True
+
+    # tag syntax options
+    omit_optional_tags = True
+    minimize_boolean_attributes = True
+    use_trailing_solidus = False
+    space_before_trailing_solidus = True
+
+    # escaping options
+    escape_lt_in_attrs = False
+    escape_rcdata = False
+    resolve_entities = True
+
+    # miscellaneous options
+    alphabetical_attributes = False
+    inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+
+    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
+               "omit_optional_tags", "minimize_boolean_attributes",
+               "use_trailing_solidus", "space_before_trailing_solidus",
+               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
+               "alphabetical_attributes", "inject_meta_charset",
+               "strip_whitespace", "sanitize")
+
+    def __init__(self, **kwargs):
+        """Initialize HTMLSerializer.
+
+        Keyword options (default given first unless specified) include:
+
+        inject_meta_charset=True|False
+          Whether it insert a meta element to define the character set of the
+          document.
+        quote_attr_values="legacy"|"spec"|"always"
+          Whether to quote attribute values that don't require quoting
+          per legacy browser behaviour, when required by the standard, or always.
+        quote_char=u'"'|u"'"
+          Use given quote character for attribute quoting. Default is to
+          use double quote unless attribute value contains a double quote,
+          in which case single quotes are used instead.
+        escape_lt_in_attrs=False|True
+          Whether to escape < in attribute values.
+        escape_rcdata=False|True
+          Whether to escape characters that need to be escaped within normal
+          elements within rcdata elements such as style.
+        resolve_entities=True|False
+          Whether to resolve named character entities that appear in the
+          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
+          are unaffected by this setting.
+        strip_whitespace=False|True
+          Whether to remove semantically meaningless whitespace. (This
+          compresses all whitespace to a single space except within pre.)
+        minimize_boolean_attributes=True|False
+          Shortens boolean attributes to give just the attribute value,
+          for example <input disabled="disabled"> becomes <input disabled>.
+        use_trailing_solidus=False|True
+          Includes a close-tag slash at the end of the start tag of void
+          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
+        space_before_trailing_solidus=True|False
+          Places a space immediately before the closing slash in a tag
+          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
+        sanitize=False|True
+          Strip all unsafe or unknown constructs from output.
+          See `html5lib user documentation`_
+        omit_optional_tags=True|False
+          Omit start/end tags that are optional.
+        alphabetical_attributes=False|True
+          Reorder attributes to be in alphabetical order.
+
+        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
+        """
+        unexpected_args = frozenset(kwargs) - frozenset(self.options)
+        if len(unexpected_args) > 0:
+            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
+        if 'quote_char' in kwargs:
+            self.use_best_quote_char = False
+        for attr in self.options:
+            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
+        self.errors = []
+        self.strict = False
+
+    def encode(self, string):
+        assert(isinstance(string, text_type))
+        if self.encoding:
+            return string.encode(self.encoding, "htmlentityreplace")
+        else:
+            return string
+
+    def encodeStrict(self, string):
+        assert(isinstance(string, text_type))
+        if self.encoding:
+            return string.encode(self.encoding, "strict")
+        else:
+            return string
+
+    def serialize(self, treewalker, encoding=None):
+        # pylint:disable=too-many-nested-blocks
+        self.encoding = encoding
+        in_cdata = False
+        self.errors = []
+
+        if encoding and self.inject_meta_charset:
+            from .filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
+        # Alphabetical attributes is here under the assumption that none of
+        # the later filters add or change order of attributes; it needs to be
+        # before the sanitizer so escaped elements come out correctly
+        if self.alphabetical_attributes:
+            from .filters.alphabeticalattributes import Filter
+            treewalker = Filter(treewalker)
+        # WhitespaceFilter should be used before OptionalTagFilter
+        # for maximum efficiently of this latter filter
+        if self.strip_whitespace:
+            from .filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from .filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
+        if self.omit_optional_tags:
+            from .filters.optionaltags import Filter
+            treewalker = Filter(treewalker)
+
+        for token in treewalker:
+            type = token["type"]
+            if type == "Doctype":
+                doctype = "<!DOCTYPE %s" % token["name"]
+
+                if token["publicId"]:
+                    doctype += ' PUBLIC "%s"' % token["publicId"]
+                elif token["systemId"]:
+                    doctype += " SYSTEM"
+                if token["systemId"]:
+                    if token["systemId"].find('"') >= 0:
+                        if token["systemId"].find("'") >= 0:
+                            self.serializeError("System identifer contains both single and double quote characters")
+                        quote_char = "'"
+                    else:
+                        quote_char = '"'
+                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
+
+                doctype += ">"
+                yield self.encodeStrict(doctype)
+
+            elif type in ("Characters", "SpaceCharacters"):
+                if type == "SpaceCharacters" or in_cdata:
+                    if in_cdata and token["data"].find("</") >= 0:
+                        self.serializeError("Unexpected </ in CDATA")
+                    yield self.encode(token["data"])
+                else:
+                    yield self.encode(escape(token["data"]))
+
+            elif type in ("StartTag", "EmptyTag"):
+                name = token["name"]
+                yield self.encodeStrict("<%s" % name)
+                if name in rcdataElements and not self.escape_rcdata:
+                    in_cdata = True
+                elif in_cdata:
+                    self.serializeError("Unexpected child element of a CDATA element")
+                for (_, attr_name), attr_value in token["data"].items():
+                    # TODO: Add namespace support here
+                    k = attr_name
+                    v = attr_value
+                    yield self.encodeStrict(' ')
+
+                    yield self.encodeStrict(k)
+                    if not self.minimize_boolean_attributes or \
+                        (k not in booleanAttributes.get(name, tuple()) and
+                         k not in booleanAttributes.get("", tuple())):
+                        yield self.encodeStrict("=")
+                        if self.quote_attr_values == "always" or len(v) == 0:
+                            quote_attr = True
+                        elif self.quote_attr_values == "spec":
+                            quote_attr = _quoteAttributeSpec.search(v) is not None
+                        elif self.quote_attr_values == "legacy":
+                            quote_attr = _quoteAttributeLegacy.search(v) is not None
+                        else:
+                            raise ValueError("quote_attr_values must be one of: "
+                                             "'always', 'spec', or 'legacy'")
+                        v = v.replace("&", "&amp;")
+                        if self.escape_lt_in_attrs:
+                            v = v.replace("<", "&lt;")
+                        if quote_attr:
+                            quote_char = self.quote_char
+                            if self.use_best_quote_char:
+                                if "'" in v and '"' not in v:
+                                    quote_char = '"'
+                                elif '"' in v and "'" not in v:
+                                    quote_char = "'"
+                            if quote_char == "'":
+                                v = v.replace("'", "&#39;")
+                            else:
+                                v = v.replace('"', "&quot;")
+                            yield self.encodeStrict(quote_char)
+                            yield self.encode(v)
+                            yield self.encodeStrict(quote_char)
+                        else:
+                            yield self.encode(v)
+                if name in voidElements and self.use_trailing_solidus:
+                    if self.space_before_trailing_solidus:
+                        yield self.encodeStrict(" /")
+                    else:
+                        yield self.encodeStrict("/")
+                yield self.encode(">")
+
+            elif type == "EndTag":
+                name = token["name"]
+                if name in rcdataElements:
+                    in_cdata = False
+                elif in_cdata:
+                    self.serializeError("Unexpected child element of a CDATA element")
+                yield self.encodeStrict("</%s>" % name)
+
+            elif type == "Comment":
+                data = token["data"]
+                if data.find("--") >= 0:
+                    self.serializeError("Comment contains --")
+                yield self.encodeStrict("<!--%s-->" % token["data"])
+
+            elif type == "Entity":
+                name = token["name"]
+                key = name + ";"
+                if key not in entities:
+                    self.serializeError("Entity %s not recognized" % name)
+                if self.resolve_entities and key not in xmlEntities:
+                    data = entities[key]
+                else:
+                    data = "&%s;" % name
+                yield self.encodeStrict(data)
+
+            else:
+                self.serializeError(token["data"])
+
+    def render(self, treewalker, encoding=None):
+        if encoding:
+            return b"".join(list(self.serialize(treewalker, encoding)))
+        else:
+            return "".join(list(self.serialize(treewalker)))
+
+    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
+        # XXX The idea is to make data mandatory.
+        self.errors.append(data)
+        if self.strict:
+            raise SerializeError
+
+
+class SerializeError(Exception):
+    """Error in serialized tree"""
+    pass
diff --git a/html_cleaner/html5lib/treeadapters/__init__.py b/html_cleaner/html5lib/treeadapters/__init__.py
new file mode 100644
index 0000000..4f97846
--- /dev/null
+++ b/html_cleaner/html5lib/treeadapters/__init__.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import sax
+
+__all__ = ["sax"]
+
+try:
+    from . import genshi  # noqa
+except ImportError:
+    pass
+else:
+    __all__.append("genshi")
diff --git a/html_cleaner/html5lib/treeadapters/genshi.py b/html_cleaner/html5lib/treeadapters/genshi.py
new file mode 100644
index 0000000..04e316d
--- /dev/null
+++ b/html_cleaner/html5lib/treeadapters/genshi.py
@@ -0,0 +1,47 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from genshi.core import QName, Attrs
+from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
+
+
+def to_genshi(walker):
+    text = []
+    for token in walker:
+        type = token["type"]
+        if type in ("Characters", "SpaceCharacters"):
+            text.append(token["data"])
+        elif text:
+            yield TEXT, "".join(text), (None, -1, -1)
+            text = []
+
+        if type in ("StartTag", "EmptyTag"):
+            if token["namespace"]:
+                name = "{%s}%s" % (token["namespace"], token["name"])
+            else:
+                name = token["name"]
+            attrs = Attrs([(QName("{%s}%s" % attr if attr[0] is not None else attr[1]), value)
+                           for attr, value in token["data"].items()])
+            yield (START, (QName(name), attrs), (None, -1, -1))
+            if type == "EmptyTag":
+                type = "EndTag"
+
+        if type == "EndTag":
+            if token["namespace"]:
+                name = "{%s}%s" % (token["namespace"], token["name"])
+            else:
+                name = token["name"]
+
+            yield END, QName(name), (None, -1, -1)
+
+        elif type == "Comment":
+            yield COMMENT, token["data"], (None, -1, -1)
+
+        elif type == "Doctype":
+            yield DOCTYPE, (token["name"], token["publicId"],
+                            token["systemId"]), (None, -1, -1)
+
+        else:
+            pass  # FIXME: What to do?
+
+    if text:
+        yield TEXT, "".join(text), (None, -1, -1)
diff --git a/html_cleaner/html5lib/treeadapters/sax.py b/html_cleaner/html5lib/treeadapters/sax.py
new file mode 100644
index 0000000..ad47df9
--- /dev/null
+++ b/html_cleaner/html5lib/treeadapters/sax.py
@@ -0,0 +1,44 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.sax.xmlreader import AttributesNSImpl
+
+from ..constants import adjustForeignAttributes, unadjustForeignAttributes
+
+prefix_mapping = {}
+for prefix, localName, namespace in adjustForeignAttributes.values():
+    if prefix is not None:
+        prefix_mapping[prefix] = namespace
+
+
+def to_sax(walker, handler):
+    """Call SAX-like content handler based on treewalker walker"""
+    handler.startDocument()
+    for prefix, namespace in prefix_mapping.items():
+        handler.startPrefixMapping(prefix, namespace)
+
+    for token in walker:
+        type = token["type"]
+        if type == "Doctype":
+            continue
+        elif type in ("StartTag", "EmptyTag"):
+            attrs = AttributesNSImpl(token["data"],
+                                     unadjustForeignAttributes)
+            handler.startElementNS((token["namespace"], token["name"]),
+                                   token["name"],
+                                   attrs)
+            if type == "EmptyTag":
+                handler.endElementNS((token["namespace"], token["name"]),
+                                     token["name"])
+        elif type == "EndTag":
+            handler.endElementNS((token["namespace"], token["name"]),
+                                 token["name"])
+        elif type in ("Characters", "SpaceCharacters"):
+            handler.characters(token["data"])
+        elif type == "Comment":
+            pass
+        else:
+            assert False, "Unknown token type"
+
+    for prefix, namespace in prefix_mapping.items():
+        handler.endPrefixMapping(prefix)
+    handler.endDocument()
diff --git a/html_cleaner/html5lib/treebuilders/__init__.py b/html_cleaner/html5lib/treebuilders/__init__.py
new file mode 100644
index 0000000..e232884
--- /dev/null
+++ b/html_cleaner/html5lib/treebuilders/__init__.py
@@ -0,0 +1,76 @@
+"""A collection of modules for building different kinds of tree from
+HTML documents.
+
+To create a treebuilder for a new type of tree, you need to do
+implement several things:
+
+1) A set of classes for various types of elements: Document, Doctype,
+Comment, Element. These must implement the interface of
+_base.treebuilders.Node (although comment nodes have a different
+signature for their constructor, see treebuilders.etree.Comment)
+Textual content may also be implemented as another node type, or not, as
+your tree implementation requires.
+
+2) A treebuilder object (called TreeBuilder by convention) that
+inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
+documentClass - the class to use for the bottommost node of a document
+elementClass - the class to use for HTML Elements
+commentClass - the class to use for comments
+doctypeClass - the class to use for doctypes
+It also has one required method:
+getDocument - Returns the root node of the complete document tree
+
+3) If you wish to run the unit tests, you must also create a
+testSerializer method on your treebuilder which accepts a node and
+returns a string containing Node and its children serialized according
+to the format used in the unittests
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .._utils import default_etree
+
+treeBuilderCache = {}
+
+
+def getTreeBuilder(treeType, implementation=None, **kwargs):
+    """Get a TreeBuilder class for various types of tree with built-in support
+
+    treeType - the name of the tree type required (case-insensitive). Supported
+               values are:
+
+               "dom" - A generic builder for DOM implementations, defaulting to
+                       a xml.dom.minidom based implementation.
+               "etree" - A generic builder for tree implementations exposing an
+                         ElementTree-like interface, defaulting to
+                         xml.etree.cElementTree if available and
+                         xml.etree.ElementTree if not.
+               "lxml" - A etree-based builder for lxml.etree, handling
+                        limitations of lxml's implementation.
+
+    implementation - (Currently applies to the "etree" and "dom" tree types). A
+                      module implementing the tree type e.g.
+                      xml.etree.ElementTree or xml.etree.cElementTree."""
+
+    treeType = treeType.lower()
+    if treeType not in treeBuilderCache:
+        if treeType == "dom":
+            from . import dom
+            # Come up with a sane default (pref. from the stdlib)
+            if implementation is None:
+                from xml.dom import minidom
+                implementation = minidom
+            # NEVER cache here, caching is done in the dom submodule
+            return dom.getDomModule(implementation, **kwargs).TreeBuilder
+        elif treeType == "lxml":
+            from . import etree_lxml
+            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
+        elif treeType == "etree":
+            from . import etree
+            if implementation is None:
+                implementation = default_etree
+            # NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
+        else:
+            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
+    return treeBuilderCache.get(treeType)
diff --git a/html_cleaner/html5lib/treebuilders/base.py b/html_cleaner/html5lib/treebuilders/base.py
new file mode 100644
index 0000000..a4b2792
--- /dev/null
+++ b/html_cleaner/html5lib/treebuilders/base.py
@@ -0,0 +1,383 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+from ..constants import scopingElements, tableInsertModeElements, namespaces
+
+# The scope markers are inserted when entering object elements,
+# marquees, table cells, and table captions, and are used to prevent formatting
+# from "leaking" into tables, object elements, and marquees.
+Marker = None
+
+listElementsMap = {
+    None: (frozenset(scopingElements), False),
+    "button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
+    "list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
+                                              (namespaces["html"], "ul")])), False),
+    "table": (frozenset([(namespaces["html"], "html"),
+                         (namespaces["html"], "table")]), False),
+    "select": (frozenset([(namespaces["html"], "optgroup"),
+                          (namespaces["html"], "option")]), True)
+}
+
+
+class Node(object):
+    def __init__(self, name):
+        """Node representing an item in the tree.
+        name - The tag name associated with the node
+        parent - The parent of the current node (or None for the document node)
+        value - The value of the current node (applies to text nodes and
+        comments
+        attributes - a dict holding name, value pairs for attributes of the node
+        childNodes - a list of child nodes of the current node. This must
+        include all elements but not necessarily other node types
+        _flags - A list of miscellaneous flags that can be set on the node
+        """
+        self.name = name
+        self.parent = None
+        self.value = None
+        self.attributes = {}
+        self.childNodes = []
+        self._flags = []
+
+    def __str__(self):
+        attributesStr = " ".join(["%s=\"%s\"" % (name, value)
+                                  for name, value in
+                                  self.attributes.items()])
+        if attributesStr:
+            return "<%s %s>" % (self.name, attributesStr)
+        else:
+            return "<%s>" % (self.name)
+
+    def __repr__(self):
+        return "<%s>" % (self.name)
+
+    def appendChild(self, node):
+        """Insert node as a child of the current node
+        """
+        raise NotImplementedError
+
+    def insertText(self, data, insertBefore=None):
+        """Insert data as text in the current node, positioned before the
+        start of node insertBefore or to the end of the node's text.
+        """
+        raise NotImplementedError
+
+    def insertBefore(self, node, refNode):
+        """Insert node as a child of the current node, before refNode in the
+        list of child nodes. Raises ValueError if refNode is not a child of
+        the current node"""
+        raise NotImplementedError
+
+    def removeChild(self, node):
+        """Remove node from the children of the current node
+        """
+        raise NotImplementedError
+
+    def reparentChildren(self, newParent):
+        """Move all the children of the current node to newParent.
+        This is needed so that trees that don't store text as nodes move the
+        text in the correct way
+        """
+        # XXX - should this method be made more general?
+        for child in self.childNodes:
+            newParent.appendChild(child)
+        self.childNodes = []
+
+    def cloneNode(self):
+        """Return a shallow copy of the current node i.e. a node with the same
+        name and attributes but with no parent or child nodes
+        """
+        raise NotImplementedError
+
+    def hasContent(self):
+        """Return true if the node has children or text, false otherwise
+        """
+        raise NotImplementedError
+
+
+class ActiveFormattingElements(list):
+    def append(self, node):
+        equalCount = 0
+        if node != Marker:
+            for element in self[::-1]:
+                if element == Marker:
+                    break
+                if self.nodesEqual(element, node):
+                    equalCount += 1
+                if equalCount == 3:
+                    self.remove(element)
+                    break
+        list.append(self, node)
+
+    def nodesEqual(self, node1, node2):
+        if not node1.nameTuple == node2.nameTuple:
+            return False
+
+        if not node1.attributes == node2.attributes:
+            return False
+
+        return True
+
+
+class TreeBuilder(object):
+    """Base treebuilder implementation
+    documentClass - the class to use for the bottommost node of a document
+    elementClass - the class to use for HTML Elements
+    commentClass - the class to use for comments
+    doctypeClass - the class to use for doctypes
+    """
+    # pylint:disable=not-callable
+
+    # Document class
+    documentClass = None
+
+    # The class to use for creating a node
+    elementClass = None
+
+    # The class to use for creating comments
+    commentClass = None
+
+    # The class to use for creating doctypes
+    doctypeClass = None
+
+    # Fragment class
+    fragmentClass = None
+
+    def __init__(self, namespaceHTMLElements):
+        if namespaceHTMLElements:
+            self.defaultNamespace = "http://www.w3.org/1999/xhtml"
+        else:
+            self.defaultNamespace = None
+        self.reset()
+
+    def reset(self):
+        self.openElements = []
+        self.activeFormattingElements = ActiveFormattingElements()
+
+        # XXX - rename these to headElement, formElement
+        self.headPointer = None
+        self.formPointer = None
+
+        self.insertFromTable = False
+
+        self.document = self.documentClass()
+
+    def elementInScope(self, target, variant=None):
+
+        # If we pass a node in we match that. if we pass a string
+        # match any node with that name
+        exactNode = hasattr(target, "nameTuple")
+        if not exactNode:
+            if isinstance(target, text_type):
+                target = (namespaces["html"], target)
+            assert isinstance(target, tuple)
+
+        listElements, invert = listElementsMap[variant]
+
+        for node in reversed(self.openElements):
+            if exactNode and node == target:
+                return True
+            elif not exactNode and node.nameTuple == target:
+                return True
+            elif (invert ^ (node.nameTuple in listElements)):
+                return False
+
+        assert False  # We should never reach this point
+
+    def reconstructActiveFormattingElements(self):
+        # Within this algorithm the order of steps described in the
+        # specification is not quite the same as the order of steps in the
+        # code. It should still do the same though.
+
+        # Step 1: stop the algorithm when there's nothing to do.
+        if not self.activeFormattingElements:
+            return
+
+        # Step 2 and step 3: we start with the last element. So i is -1.
+        i = len(self.activeFormattingElements) - 1
+        entry = self.activeFormattingElements[i]
+        if entry == Marker or entry in self.openElements:
+            return
+
+        # Step 6
+        while entry != Marker and entry not in self.openElements:
+            if i == 0:
+                # This will be reset to 0 below
+                i = -1
+                break
+            i -= 1
+            # Step 5: let entry be one earlier in the list.
+            entry = self.activeFormattingElements[i]
+
+        while True:
+            # Step 7
+            i += 1
+
+            # Step 8
+            entry = self.activeFormattingElements[i]
+            clone = entry.cloneNode()  # Mainly to get a new copy of the attributes
+
+            # Step 9
+            element = self.insertElement({"type": "StartTag",
+                                          "name": clone.name,
+                                          "namespace": clone.namespace,
+                                          "data": clone.attributes})
+
+            # Step 10
+            self.activeFormattingElements[i] = element
+
+            # Step 11
+            if element == self.activeFormattingElements[-1]:
+                break
+
+    def clearActiveFormattingElements(self):
+        entry = self.activeFormattingElements.pop()
+        while self.activeFormattingElements and entry != Marker:
+            entry = self.activeFormattingElements.pop()
+
+    def elementInActiveFormattingElements(self, name):
+        """Check if an element exists between the end of the active
+        formatting elements and the last marker. If it does, return it, else
+        return false"""
+
+        for item in self.activeFormattingElements[::-1]:
+            # Check for Marker first because if it's a Marker it doesn't have a
+            # name attribute.
+            if item == Marker:
+                break
+            elif item.name == name:
+                return item
+        return False
+
+    def insertRoot(self, token):
+        element = self.createElement(token)
+        self.openElements.append(element)
+        self.document.appendChild(element)
+
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        doctype = self.doctypeClass(name, publicId, systemId)
+        self.document.appendChild(doctype)
+
+    def insertComment(self, token, parent=None):
+        if parent is None:
+            parent = self.openElements[-1]
+        parent.appendChild(self.commentClass(token["data"]))
+
+    def createElement(self, token):
+        """Create an element but don't insert it anywhere"""
+        name = token["name"]
+        namespace = token.get("namespace", self.defaultNamespace)
+        element = self.elementClass(name, namespace)
+        element.attributes = token["data"]
+        return element
+
+    def _getInsertFromTable(self):
+        return self._insertFromTable
+
+    def _setInsertFromTable(self, value):
+        """Switch the function used to insert an element from the
+        normal one to the misnested table one and back again"""
+        self._insertFromTable = value
+        if value:
+            self.insertElement = self.insertElementTable
+        else:
+            self.insertElement = self.insertElementNormal
+
+    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
+
+    def insertElementNormal(self, token):
+        name = token["name"]
+        assert isinstance(name, text_type), "Element %s not unicode" % name
+        namespace = token.get("namespace", self.defaultNamespace)
+        element = self.elementClass(name, namespace)
+        element.attributes = token["data"]
+        self.openElements[-1].appendChild(element)
+        self.openElements.append(element)
+        return element
+
+    def insertElementTable(self, token):
+        """Create an element and insert it into the tree"""
+        element = self.createElement(token)
+        if self.openElements[-1].name not in tableInsertModeElements:
+            return self.insertElementNormal(token)
+        else:
+            # We should be in the InTable mode. This means we want to do
+            # special magic element rearranging
+            parent, insertBefore = self.getTableMisnestedNodePosition()
+            if insertBefore is None:
+                parent.appendChild(element)
+            else:
+                parent.insertBefore(element, insertBefore)
+            self.openElements.append(element)
+        return element
+
+    def insertText(self, data, parent=None):
+        """Insert text data."""
+        if parent is None:
+            parent = self.openElements[-1]
+
+        if (not self.insertFromTable or (self.insertFromTable and
+                                         self.openElements[-1].name
+                                         not in tableInsertModeElements)):
+            parent.insertText(data)
+        else:
+            # We should be in the InTable mode. This means we want to do
+            # special magic element rearranging
+            parent, insertBefore = self.getTableMisnestedNodePosition()
+            parent.insertText(data, insertBefore)
+
+    def getTableMisnestedNodePosition(self):
+        """Get the foster parent element, and sibling to insert before
+        (or None) when inserting a misnested table node"""
+        # The foster parent element is the one which comes before the most
+        # recently opened table element
+        # XXX - this is really inelegant
+        lastTable = None
+        fosterParent = None
+        insertBefore = None
+        for elm in self.openElements[::-1]:
+            if elm.name == "table":
+                lastTable = elm
+                break
+        if lastTable:
+            # XXX - we should really check that this parent is actually a
+            # node here
+            if lastTable.parent:
+                fosterParent = lastTable.parent
+                insertBefore = lastTable
+            else:
+                fosterParent = self.openElements[
+                    self.openElements.index(lastTable) - 1]
+        else:
+            fosterParent = self.openElements[0]
+        return fosterParent, insertBefore
+
+    def generateImpliedEndTags(self, exclude=None):
+        name = self.openElements[-1].name
+        # XXX td, th and tr are not actually needed
+        if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
+                name != exclude):
+            self.openElements.pop()
+            # XXX This is not entirely what the specification says. We should
+            # investigate it more closely.
+            self.generateImpliedEndTags(exclude)
+
+    def getDocument(self):
+        "Return the final tree"
+        return self.document
+
+    def getFragment(self):
+        "Return the final fragment"
+        # assert self.innerHTML
+        fragment = self.fragmentClass()
+        self.openElements[0].reparentChildren(fragment)
+        return fragment
+
+    def testSerializer(self, node):
+        """Serialize the subtree of node in the format required by unit tests
+        node - the node from which to start serializing"""
+        raise NotImplementedError
diff --git a/html_cleaner/html5lib/treebuilders/dom.py b/html_cleaner/html5lib/treebuilders/dom.py
new file mode 100644
index 0000000..dcfac22
--- /dev/null
+++ b/html_cleaner/html5lib/treebuilders/dom.py
@@ -0,0 +1,236 @@
+from __future__ import absolute_import, division, unicode_literals
+
+
+from collections import MutableMapping
+from xml.dom import minidom, Node
+import weakref
+
+from . import base
+from .. import constants
+from ..constants import namespaces
+from .._utils import moduleFactoryFactory
+
+
+def getDomBuilder(DomImplementation):
+    Dom = DomImplementation
+
+    class AttrList(MutableMapping):
+        def __init__(self, element):
+            self.element = element
+
+        def __iter__(self):
+            return iter(self.element.attributes.keys())
+
+        def __setitem__(self, name, value):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            else:
+                attr = self.element.ownerDocument.createAttribute(name)
+                attr.value = value
+                self.element.attributes[name] = attr
+
+        def __len__(self):
+            return len(self.element.attributes)
+
+        def items(self):
+            return list(self.element.attributes.items())
+
+        def values(self):
+            return list(self.element.attributes.values())
+
+        def __getitem__(self, name):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            else:
+                return self.element.attributes[name].value
+
+        def __delitem__(self, name):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            else:
+                del self.element.attributes[name]
+
+    class NodeBuilder(base.Node):
+        def __init__(self, element):
+            base.Node.__init__(self, element.nodeName)
+            self.element = element
+
+        namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
+                             self.element.namespaceURI or None)
+
+        def appendChild(self, node):
+            node.parent = self
+            self.element.appendChild(node.element)
+
+        def insertText(self, data, insertBefore=None):
+            text = self.element.ownerDocument.createTextNode(data)
+            if insertBefore:
+                self.element.insertBefore(text, insertBefore.element)
+            else:
+                self.element.appendChild(text)
+
+        def insertBefore(self, node, refNode):
+            self.element.insertBefore(node.element, refNode.element)
+            node.parent = self
+
+        def removeChild(self, node):
+            if node.element.parentNode == self.element:
+                self.element.removeChild(node.element)
+            node.parent = None
+
+        def reparentChildren(self, newParent):
+            while self.element.hasChildNodes():
+                child = self.element.firstChild
+                self.element.removeChild(child)
+                newParent.element.appendChild(child)
+            self.childNodes = []
+
+        def getAttributes(self):
+            return AttrList(self.element)
+
+        def setAttributes(self, attributes):
+            if attributes:
+                for name, value in list(attributes.items()):
+                    if isinstance(name, tuple):
+                        if name[0] is not None:
+                            qualifiedName = (name[0] + ":" + name[1])
+                        else:
+                            qualifiedName = name[1]
+                        self.element.setAttributeNS(name[2], qualifiedName,
+                                                    value)
+                    else:
+                        self.element.setAttribute(
+                            name, value)
+        attributes = property(getAttributes, setAttributes)
+
+        def cloneNode(self):
+            return NodeBuilder(self.element.cloneNode(False))
+
+        def hasContent(self):
+            return self.element.hasChildNodes()
+
+        def getNameTuple(self):
+            if self.namespace is None:
+                return namespaces["html"], self.name
+            else:
+                return self.namespace, self.name
+
+        nameTuple = property(getNameTuple)
+
+    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
+        def documentClass(self):
+            self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
+            return weakref.proxy(self)
+
+        def insertDoctype(self, token):
+            name = token["name"]
+            publicId = token["publicId"]
+            systemId = token["systemId"]
+
+            domimpl = Dom.getDOMImplementation()
+            doctype = domimpl.createDocumentType(name, publicId, systemId)
+            self.document.appendChild(NodeBuilder(doctype))
+            if Dom == minidom:
+                doctype.ownerDocument = self.dom
+
+        def elementClass(self, name, namespace=None):
+            if namespace is None and self.defaultNamespace is None:
+                node = self.dom.createElement(name)
+            else:
+                node = self.dom.createElementNS(namespace, name)
+
+            return NodeBuilder(node)
+
+        def commentClass(self, data):
+            return NodeBuilder(self.dom.createComment(data))
+
+        def fragmentClass(self):
+            return NodeBuilder(self.dom.createDocumentFragment())
+
+        def appendChild(self, node):
+            self.dom.appendChild(node.element)
+
+        def testSerializer(self, element):
+            return testSerializer(element)
+
+        def getDocument(self):
+            return self.dom
+
+        def getFragment(self):
+            return base.TreeBuilder.getFragment(self).element
+
+        def insertText(self, data, parent=None):
+            data = data
+            if parent != self:
+                base.TreeBuilder.insertText(self, data, parent)
+            else:
+                # HACK: allow text nodes as children of the document node
+                if hasattr(self.dom, '_child_node_types'):
+                    # pylint:disable=protected-access
+                    if Node.TEXT_NODE not in self.dom._child_node_types:
+                        self.dom._child_node_types = list(self.dom._child_node_types)
+                        self.dom._child_node_types.append(Node.TEXT_NODE)
+                self.dom.appendChild(self.dom.createTextNode(data))
+
+        implementation = DomImplementation
+        name = None
+
+    def testSerializer(element):
+        element.normalize()
+        rv = []
+
+        def serializeElement(element, indent=0):
+            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
+                if element.name:
+                    if element.publicId or element.systemId:
+                        publicId = element.publicId or ""
+                        systemId = element.systemId or ""
+                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (' ' * indent, element.name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
+                else:
+                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+            elif element.nodeType == Node.DOCUMENT_NODE:
+                rv.append("#document")
+            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+                rv.append("#document-fragment")
+            elif element.nodeType == Node.COMMENT_NODE:
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
+            elif element.nodeType == Node.TEXT_NODE:
+                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
+            else:
+                if (hasattr(element, "namespaceURI") and
+                        element.namespaceURI is not None):
+                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
+                                      element.nodeName)
+                else:
+                    name = element.nodeName
+                rv.append("|%s<%s>" % (' ' * indent, name))
+                if element.hasAttributes():
+                    attributes = []
+                    for i in range(len(element.attributes)):
+                        attr = element.attributes.item(i)
+                        name = attr.nodeName
+                        value = attr.value
+                        ns = attr.namespaceURI
+                        if ns:
+                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
+                        else:
+                            name = attr.nodeName
+                        attributes.append((name, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+            indent += 2
+            for child in element.childNodes:
+                serializeElement(child, indent)
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
+
+    return locals()
+
+
+# The actual means to get a module!
+getDomModule = moduleFactoryFactory(getDomBuilder)
diff --git a/html_cleaner/html5lib/treebuilders/etree.py b/html_cleaner/html5lib/treebuilders/etree.py
new file mode 100644
index 0000000..cb1d4ae
--- /dev/null
+++ b/html_cleaner/html5lib/treebuilders/etree.py
@@ -0,0 +1,340 @@
+from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
+from six import text_type
+
+import re
+
+from . import base
+from .. import _ihatexml
+from .. import constants
+from ..constants import namespaces
+from .._utils import moduleFactoryFactory
+
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+
+def getETreeBuilder(ElementTreeImplementation, fullTree=False):
+    ElementTree = ElementTreeImplementation
+    ElementTreeCommentType = ElementTree.Comment("asd").tag
+
+    class Element(base.Node):
+        def __init__(self, name, namespace=None):
+            self._name = name
+            self._namespace = namespace
+            self._element = ElementTree.Element(self._getETreeTag(name,
+                                                                  namespace))
+            if namespace is None:
+                self.nameTuple = namespaces["html"], self._name
+            else:
+                self.nameTuple = self._namespace, self._name
+            self.parent = None
+            self._childNodes = []
+            self._flags = []
+
+        def _getETreeTag(self, name, namespace):
+            if namespace is None:
+                etree_tag = name
+            else:
+                etree_tag = "{%s}%s" % (namespace, name)
+            return etree_tag
+
+        def _setName(self, name):
+            self._name = name
+            self._element.tag = self._getETreeTag(self._name, self._namespace)
+
+        def _getName(self):
+            return self._name
+
+        name = property(_getName, _setName)
+
+        def _setNamespace(self, namespace):
+            self._namespace = namespace
+            self._element.tag = self._getETreeTag(self._name, self._namespace)
+
+        def _getNamespace(self):
+            return self._namespace
+
+        namespace = property(_getNamespace, _setNamespace)
+
+        def _getAttributes(self):
+            return self._element.attrib
+
+        def _setAttributes(self, attributes):
+            # Delete existing attributes first
+            # XXX - there may be a better way to do this...
+            for key in list(self._element.attrib.keys()):
+                del self._element.attrib[key]
+            for key, value in attributes.items():
+                if isinstance(key, tuple):
+                    name = "{%s}%s" % (key[2], key[1])
+                else:
+                    name = key
+                self._element.set(name, value)
+
+        attributes = property(_getAttributes, _setAttributes)
+
+        def _getChildNodes(self):
+            return self._childNodes
+
+        def _setChildNodes(self, value):
+            del self._element[:]
+            self._childNodes = []
+            for element in value:
+                self.insertChild(element)
+
+        childNodes = property(_getChildNodes, _setChildNodes)
+
+        def hasContent(self):
+            """Return true if the node has children or text"""
+            return bool(self._element.text or len(self._element))
+
+        def appendChild(self, node):
+            self._childNodes.append(node)
+            self._element.append(node._element)
+            node.parent = self
+
+        def insertBefore(self, node, refNode):
+            index = list(self._element).index(refNode._element)
+            self._element.insert(index, node._element)
+            node.parent = self
+
+        def removeChild(self, node):
+            self._childNodes.remove(node)
+            self._element.remove(node._element)
+            node.parent = None
+
+        def insertText(self, data, insertBefore=None):
+            if not(len(self._element)):
+                if not self._element.text:
+                    self._element.text = ""
+                self._element.text += data
+            elif insertBefore is None:
+                # Insert the text as the tail of the last child element
+                if not self._element[-1].tail:
+                    self._element[-1].tail = ""
+                self._element[-1].tail += data
+            else:
+                # Insert the text before the specified node
+                children = list(self._element)
+                index = children.index(insertBefore._element)
+                if index > 0:
+                    if not self._element[index - 1].tail:
+                        self._element[index - 1].tail = ""
+                    self._element[index - 1].tail += data
+                else:
+                    if not self._element.text:
+                        self._element.text = ""
+                    self._element.text += data
+
+        def cloneNode(self):
+            element = type(self)(self.name, self.namespace)
+            for name, value in self.attributes.items():
+                element.attributes[name] = value
+            return element
+
+        def reparentChildren(self, newParent):
+            if newParent.childNodes:
+                newParent.childNodes[-1]._element.tail += self._element.text
+            else:
+                if not newParent._element.text:
+                    newParent._element.text = ""
+                if self._element.text is not None:
+                    newParent._element.text += self._element.text
+            self._element.text = ""
+            base.Node.reparentChildren(self, newParent)
+
+    class Comment(Element):
+        def __init__(self, data):
+            # Use the superclass constructor to set all properties on the
+            # wrapper element
+            self._element = ElementTree.Comment(data)
+            self.parent = None
+            self._childNodes = []
+            self._flags = []
+
+        def _getData(self):
+            return self._element.text
+
+        def _setData(self, value):
+            self._element.text = value
+
+        data = property(_getData, _setData)
+
+    class DocumentType(Element):
+        def __init__(self, name, publicId, systemId):
+            Element.__init__(self, "<!DOCTYPE>")
+            self._element.text = name
+            self.publicId = publicId
+            self.systemId = systemId
+
+        def _getPublicId(self):
+            return self._element.get("publicId", "")
+
+        def _setPublicId(self, value):
+            if value is not None:
+                self._element.set("publicId", value)
+
+        publicId = property(_getPublicId, _setPublicId)
+
+        def _getSystemId(self):
+            return self._element.get("systemId", "")
+
+        def _setSystemId(self, value):
+            if value is not None:
+                self._element.set("systemId", value)
+
+        systemId = property(_getSystemId, _setSystemId)
+
+    class Document(Element):
+        def __init__(self):
+            Element.__init__(self, "DOCUMENT_ROOT")
+
+    class DocumentFragment(Element):
+        def __init__(self):
+            Element.__init__(self, "DOCUMENT_FRAGMENT")
+
+    def testSerializer(element):
+        rv = []
+
+        def serializeElement(element, indent=0):
+            if not(hasattr(element, "tag")):
+                element = element.getroot()
+            if element.tag == "<!DOCTYPE>":
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append("""<!DOCTYPE %s "%s" "%s">""" %
+                              (element.text, publicId, systemId))
+                else:
+                    rv.append("<!DOCTYPE %s>" % (element.text,))
+            elif element.tag == "DOCUMENT_ROOT":
+                rv.append("#document")
+                if element.text is not None:
+                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
+                if element.tail is not None:
+                    raise TypeError("Document node cannot have tail")
+                if hasattr(element, "attrib") and len(element.attrib):
+                    raise TypeError("Document node cannot have attributes")
+            elif element.tag == ElementTreeCommentType:
+                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
+            else:
+                assert isinstance(element.tag, text_type), \
+                    "Expected unicode, got %s, %s" % (type(element.tag), element.tag)
+                nsmatch = tag_regexp.match(element.tag)
+
+                if nsmatch is None:
+                    name = element.tag
+                else:
+                    ns, name = nsmatch.groups()
+                    prefix = constants.prefixes[ns]
+                    name = "%s %s" % (prefix, name)
+                rv.append("|%s<%s>" % (' ' * indent, name))
+
+                if hasattr(element, "attrib"):
+                    attributes = []
+                    for name, value in element.attrib.items():
+                        nsmatch = tag_regexp.match(name)
+                        if nsmatch is not None:
+                            ns, name = nsmatch.groups()
+                            prefix = constants.prefixes[ns]
+                            attr_string = "%s %s" % (prefix, name)
+                        else:
+                            attr_string = name
+                        attributes.append((attr_string, value))
+
+                    for name, value in sorted(attributes):
+                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+                if element.text:
+                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
+            indent += 2
+            for child in element:
+                serializeElement(child, indent)
+            if element.tail:
+                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
+        serializeElement(element, 0)
+
+        return "\n".join(rv)
+
+    def tostring(element):  # pylint:disable=unused-variable
+        """Serialize an element and its child nodes to a string"""
+        rv = []
+        filter = _ihatexml.InfosetFilter()
+
+        def serializeElement(element):
+            if isinstance(element, ElementTree.ElementTree):
+                element = element.getroot()
+
+            if element.tag == "<!DOCTYPE>":
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
+                              (element.text, publicId, systemId))
+                else:
+                    rv.append("<!DOCTYPE %s>" % (element.text,))
+            elif element.tag == "DOCUMENT_ROOT":
+                if element.text is not None:
+                    rv.append(element.text)
+                if element.tail is not None:
+                    raise TypeError("Document node cannot have tail")
+                if hasattr(element, "attrib") and len(element.attrib):
+                    raise TypeError("Document node cannot have attributes")
+
+                for child in element:
+                    serializeElement(child)
+
+            elif element.tag == ElementTreeCommentType:
+                rv.append("<!--%s-->" % (element.text,))
+            else:
+                # This is assumed to be an ordinary element
+                if not element.attrib:
+                    rv.append("<%s>" % (filter.fromXmlName(element.tag),))
+                else:
+                    attr = " ".join(["%s=\"%s\"" % (
+                        filter.fromXmlName(name), value)
+                        for name, value in element.attrib.items()])
+                    rv.append("<%s %s>" % (element.tag, attr))
+                if element.text:
+                    rv.append(element.text)
+
+                for child in element:
+                    serializeElement(child)
+
+                rv.append("</%s>" % (element.tag,))
+
+            if element.tail:
+                rv.append(element.tail)
+
+        serializeElement(element)
+
+        return "".join(rv)
+
+    class TreeBuilder(base.TreeBuilder):  # pylint:disable=unused-variable
+        documentClass = Document
+        doctypeClass = DocumentType
+        elementClass = Element
+        commentClass = Comment
+        fragmentClass = DocumentFragment
+        implementation = ElementTreeImplementation
+
+        def testSerializer(self, element):
+            return testSerializer(element)
+
+        def getDocument(self):
+            if fullTree:
+                return self.document._element
+            else:
+                if self.defaultNamespace is not None:
+                    return self.document._element.find(
+                        "{%s}html" % self.defaultNamespace)
+                else:
+                    return self.document._element.find("html")
+
+        def getFragment(self):
+            return base.TreeBuilder.getFragment(self)._element
+
+    return locals()
+
+
+getETreeModule = moduleFactoryFactory(getETreeBuilder)
diff --git a/html_cleaner/html5lib/treebuilders/etree_lxml.py b/html_cleaner/html5lib/treebuilders/etree_lxml.py
new file mode 100644
index 0000000..908820c
--- /dev/null
+++ b/html_cleaner/html5lib/treebuilders/etree_lxml.py
@@ -0,0 +1,367 @@
+"""Module for supporting the lxml.etree library. The idea here is to use as much
+of the native library as possible, without using fragile hacks like custom element
+names that break between releases. The downside of this is that we cannot represent
+all possible trees; specifically the following are known to cause problems:
+
+Text or comments as siblings of the root element
+Docypes with no name
+
+When any of these things occur, we emit a DataLossWarning
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
+import warnings
+import re
+import sys
+
+from . import base
+from ..constants import DataLossWarning
+from .. import constants
+from . import etree as etree_builders
+from .. import _ihatexml
+
+import lxml.etree as etree
+
+
+fullTree = True
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+comment_type = etree.Comment("asd").tag
+
+
+class DocumentType(object):
+    def __init__(self, name, publicId, systemId):
+        self.name = name
+        self.publicId = publicId
+        self.systemId = systemId
+
+
+class Document(object):
+    def __init__(self):
+        self._elementTree = None
+        self._childNodes = []
+
+    def appendChild(self, element):
+        self._elementTree.getroot().addnext(element._element)
+
+    def _getChildNodes(self):
+        return self._childNodes
+
+    childNodes = property(_getChildNodes)
+
+
+def testSerializer(element):
+    rv = []
+    infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
+
+    def serializeElement(element, indent=0):
+        if not hasattr(element, "tag"):
+            if hasattr(element, "getroot"):
+                # Full tree case
+                rv.append("#document")
+                if element.docinfo.internalDTD:
+                    if not (element.docinfo.public_id or
+                            element.docinfo.system_url):
+                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
+                    else:
+                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
+                            element.docinfo.root_name,
+                            element.docinfo.public_id,
+                            element.docinfo.system_url)
+                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
+                next_element = element.getroot()
+                while next_element.getprevious() is not None:
+                    next_element = next_element.getprevious()
+                while next_element is not None:
+                    serializeElement(next_element, indent + 2)
+                    next_element = next_element.getnext()
+            elif isinstance(element, str) or isinstance(element, bytes):
+                # Text in a fragment
+                assert isinstance(element, str) or sys.version_info[0] == 2
+                rv.append("|%s\"%s\"" % (' ' * indent, element))
+            else:
+                # Fragment case
+                rv.append("#document-fragment")
+                for next_element in element:
+                    serializeElement(next_element, indent + 2)
+        elif element.tag == comment_type:
+            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
+            if hasattr(element, "tail") and element.tail:
+                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
+        else:
+            assert isinstance(element, etree._Element)
+            nsmatch = etree_builders.tag_regexp.match(element.tag)
+            if nsmatch is not None:
+                ns = nsmatch.group(1)
+                tag = nsmatch.group(2)
+                prefix = constants.prefixes[ns]
+                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
+                                          infosetFilter.fromXmlName(tag)))
+            else:
+                rv.append("|%s<%s>" % (' ' * indent,
+                                       infosetFilter.fromXmlName(element.tag)))
+
+            if hasattr(element, "attrib"):
+                attributes = []
+                for name, value in element.attrib.items():
+                    nsmatch = tag_regexp.match(name)
+                    if nsmatch is not None:
+                        ns, name = nsmatch.groups()
+                        name = infosetFilter.fromXmlName(name)
+                        prefix = constants.prefixes[ns]
+                        attr_string = "%s %s" % (prefix, name)
+                    else:
+                        attr_string = infosetFilter.fromXmlName(name)
+                    attributes.append((attr_string, value))
+
+                for name, value in sorted(attributes):
+                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+
+            if element.text:
+                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
+            indent += 2
+            for child in element:
+                serializeElement(child, indent)
+            if hasattr(element, "tail") and element.tail:
+                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
+    serializeElement(element, 0)
+
+    return "\n".join(rv)
+
+
+def tostring(element):
+    """Serialize an element and its child nodes to a string"""
+    rv = []
+
+    def serializeElement(element):
+        if not hasattr(element, "tag"):
+            if element.docinfo.internalDTD:
+                if element.docinfo.doctype:
+                    dtd_str = element.docinfo.doctype
+                else:
+                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
+                rv.append(dtd_str)
+            serializeElement(element.getroot())
+
+        elif element.tag == comment_type:
+            rv.append("<!--%s-->" % (element.text,))
+
+        else:
+            # This is assumed to be an ordinary element
+            if not element.attrib:
+                rv.append("<%s>" % (element.tag,))
+            else:
+                attr = " ".join(["%s=\"%s\"" % (name, value)
+                                 for name, value in element.attrib.items()])
+                rv.append("<%s %s>" % (element.tag, attr))
+            if element.text:
+                rv.append(element.text)
+
+            for child in element:
+                serializeElement(child)
+
+            rv.append("</%s>" % (element.tag,))
+
+        if hasattr(element, "tail") and element.tail:
+            rv.append(element.tail)
+
+    serializeElement(element)
+
+    return "".join(rv)
+
+
+class TreeBuilder(base.TreeBuilder):
+    documentClass = Document
+    doctypeClass = DocumentType
+    elementClass = None
+    commentClass = None
+    fragmentClass = Document
+    implementation = etree
+
+    def __init__(self, namespaceHTMLElements, fullTree=False):
+        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
+        infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
+        self.namespaceHTMLElements = namespaceHTMLElements
+
+        class Attributes(dict):
+            def __init__(self, element, value=None):
+                if value is None:
+                    value = {}
+                self._element = element
+                dict.__init__(self, value)  # pylint:disable=non-parent-init-called
+                for key, value in self.items():
+                    if isinstance(key, tuple):
+                        name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
+                    else:
+                        name = infosetFilter.coerceAttribute(key)
+                    self._element._element.attrib[name] = value
+
+            def __setitem__(self, key, value):
+                dict.__setitem__(self, key, value)
+                if isinstance(key, tuple):
+                    name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
+                else:
+                    name = infosetFilter.coerceAttribute(key)
+                self._element._element.attrib[name] = value
+
+        class Element(builder.Element):
+            def __init__(self, name, namespace):
+                name = infosetFilter.coerceElement(name)
+                builder.Element.__init__(self, name, namespace=namespace)
+                self._attributes = Attributes(self)
+
+            def _setName(self, name):
+                self._name = infosetFilter.coerceElement(name)
+                self._element.tag = self._getETreeTag(
+                    self._name, self._namespace)
+
+            def _getName(self):
+                return infosetFilter.fromXmlName(self._name)
+
+            name = property(_getName, _setName)
+
+            def _getAttributes(self):
+                return self._attributes
+
+            def _setAttributes(self, attributes):
+                self._attributes = Attributes(self, attributes)
+
+            attributes = property(_getAttributes, _setAttributes)
+
+            def insertText(self, data, insertBefore=None):
+                data = infosetFilter.coerceCharacters(data)
+                builder.Element.insertText(self, data, insertBefore)
+
+            def appendChild(self, child):
+                builder.Element.appendChild(self, child)
+
+        class Comment(builder.Comment):
+            def __init__(self, data):
+                data = infosetFilter.coerceComment(data)
+                builder.Comment.__init__(self, data)
+
+            def _setData(self, data):
+                data = infosetFilter.coerceComment(data)
+                self._element.text = data
+
+            def _getData(self):
+                return self._element.text
+
+            data = property(_getData, _setData)
+
+        self.elementClass = Element
+        self.commentClass = Comment
+        # self.fragmentClass = builder.DocumentFragment
+        base.TreeBuilder.__init__(self, namespaceHTMLElements)
+
+    def reset(self):
+        base.TreeBuilder.reset(self)
+        self.insertComment = self.insertCommentInitial
+        self.initial_comments = []
+        self.doctype = None
+
+    def testSerializer(self, element):
+        return testSerializer(element)
+
+    def getDocument(self):
+        if fullTree:
+            return self.document._elementTree
+        else:
+            return self.document._elementTree.getroot()
+
+    def getFragment(self):
+        fragment = []
+        element = self.openElements[0]._element
+        if element.text:
+            fragment.append(element.text)
+        fragment.extend(list(element))
+        if element.tail:
+            fragment.append(element.tail)
+        return fragment
+
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        if not name:
+            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
+            self.doctype = None
+        else:
+            coercedName = self.infosetFilter.coerceElement(name)
+            if coercedName != name:
+                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
+
+            doctype = self.doctypeClass(coercedName, publicId, systemId)
+            self.doctype = doctype
+
+    def insertCommentInitial(self, data, parent=None):
+        assert parent is None or parent is self.document
+        assert self.document._elementTree is None
+        self.initial_comments.append(data)
+
+    def insertCommentMain(self, data, parent=None):
+        if (parent == self.document and
+                self.document._elementTree.getroot()[-1].tag == comment_type):
+            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+        super(TreeBuilder, self).insertComment(data, parent)
+
+    def insertRoot(self, token):
+        """Create the document root"""
+        # Because of the way libxml2 works, it doesn't seem to be possible to
+        # alter information like the doctype after the tree has been parsed.
+        # Therefore we need to use the built-in parser to create our initial
+        # tree, after which we can add elements like normal
+        docStr = ""
+        if self.doctype:
+            assert self.doctype.name
+            docStr += "<!DOCTYPE %s" % self.doctype.name
+            if (self.doctype.publicId is not None or
+                    self.doctype.systemId is not None):
+                docStr += (' PUBLIC "%s" ' %
+                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
+                if self.doctype.systemId:
+                    sysid = self.doctype.systemId
+                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
+                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
+                        sysid = sysid.replace("'", 'U00027')
+                    if sysid.find("'") >= 0:
+                        docStr += '"%s"' % sysid
+                    else:
+                        docStr += "'%s'" % sysid
+                else:
+                    docStr += "''"
+            docStr += ">"
+            if self.doctype.name != token["name"]:
+                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
+        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
+        root = etree.fromstring(docStr)
+
+        # Append the initial comments:
+        for comment_token in self.initial_comments:
+            comment = self.commentClass(comment_token["data"])
+            root.addprevious(comment._element)
+
+        # Create the root document and add the ElementTree to it
+        self.document = self.documentClass()
+        self.document._elementTree = root.getroottree()
+
+        # Give the root element the right name
+        name = token["name"]
+        namespace = token.get("namespace", self.defaultNamespace)
+        if namespace is None:
+            etree_tag = name
+        else:
+            etree_tag = "{%s}%s" % (namespace, name)
+        root.tag = etree_tag
+
+        # Add the root element to the internal child/open data structures
+        root_element = self.elementClass(name, namespace)
+        root_element._element = root
+        self.document._childNodes.append(root_element)
+        self.openElements.append(root_element)
+
+        # Reset to the default insert comment function
+        self.insertComment = self.insertCommentMain
diff --git a/html_cleaner/html5lib/treewalkers/__init__.py b/html_cleaner/html5lib/treewalkers/__init__.py
new file mode 100644
index 0000000..9e19a55
--- /dev/null
+++ b/html_cleaner/html5lib/treewalkers/__init__.py
@@ -0,0 +1,143 @@
+"""A collection of modules for iterating through different kinds of
+tree, generating tokens identical to those produced by the tokenizer
+module.
+
+To create a tree walker for a new type of tree, you need to do
+implement a tree walker object (called TreeWalker by convention) that
+implements a 'serialize' method taking a tree as sole argument and
+returning an iterator generating tokens.
+"""
+
+from __future__ import absolute_import, division, unicode_literals
+
+from .. import constants
+from .._utils import default_etree
+
+__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
+
+treeWalkerCache = {}
+
+
+def getTreeWalker(treeType, implementation=None, **kwargs):
+    """Get a TreeWalker class for various types of tree with built-in support
+
+    Args:
+        treeType (str): the name of the tree type required (case-insensitive).
+            Supported values are:
+
+            - "dom": The xml.dom.minidom DOM implementation
+            - "etree": A generic walker for tree implementations exposing an
+                       elementtree-like interface (known to work with
+                       ElementTree, cElementTree and lxml.etree).
+            - "lxml": Optimized walker for lxml.etree
+            - "genshi": a Genshi stream
+
+        Implementation: A module implementing the tree type e.g.
+            xml.etree.ElementTree or cElementTree (Currently applies to the
+            "etree" tree type only).
+    """
+
+    treeType = treeType.lower()
+    if treeType not in treeWalkerCache:
+        if treeType == "dom":
+            from . import dom
+            treeWalkerCache[treeType] = dom.TreeWalker
+        elif treeType == "genshi":
+            from . import genshi
+            treeWalkerCache[treeType] = genshi.TreeWalker
+        elif treeType == "lxml":
+            from . import etree_lxml
+            treeWalkerCache[treeType] = etree_lxml.TreeWalker
+        elif treeType == "etree":
+            from . import etree
+            if implementation is None:
+                implementation = default_etree
+            # XXX: NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeWalker
+    return treeWalkerCache.get(treeType)
+
+
+def concatenateCharacterTokens(tokens):
+    pendingCharacters = []
+    for token in tokens:
+        type = token["type"]
+        if type in ("Characters", "SpaceCharacters"):
+            pendingCharacters.append(token["data"])
+        else:
+            if pendingCharacters:
+                yield {"type": "Characters", "data": "".join(pendingCharacters)}
+                pendingCharacters = []
+            yield token
+    if pendingCharacters:
+        yield {"type": "Characters", "data": "".join(pendingCharacters)}
+
+
+def pprint(walker):
+    """Pretty printer for tree walkers"""
+    output = []
+    indent = 0
+    for token in concatenateCharacterTokens(walker):
+        type = token["type"]
+        if type in ("StartTag", "EmptyTag"):
+            # tag name
+            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
+                if token["namespace"] in constants.prefixes:
+                    ns = constants.prefixes[token["namespace"]]
+                else:
+                    ns = token["namespace"]
+                name = "%s %s" % (ns, token["name"])
+            else:
+                name = token["name"]
+            output.append("%s<%s>" % (" " * indent, name))
+            indent += 2
+            # attributes (sorted for consistent ordering)
+            attrs = token["data"]
+            for (namespace, localname), value in sorted(attrs.items()):
+                if namespace:
+                    if namespace in constants.prefixes:
+                        ns = constants.prefixes[namespace]
+                    else:
+                        ns = namespace
+                    name = "%s %s" % (ns, localname)
+                else:
+                    name = localname
+                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
+            # self-closing
+            if type == "EmptyTag":
+                indent -= 2
+
+        elif type == "EndTag":
+            indent -= 2
+
+        elif type == "Comment":
+            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
+
+        elif type == "Doctype":
+            if token["name"]:
+                if token["publicId"]:
+                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["publicId"],
+                                   token["systemId"] if token["systemId"] else ""))
+                elif token["systemId"]:
+                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
+                                  (" " * indent,
+                                   token["name"],
+                                   token["systemId"]))
+                else:
+                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
+                                                       token["name"]))
+            else:
+                output.append("%s<!DOCTYPE >" % (" " * indent,))
+
+        elif type == "Characters":
+            output.append("%s\"%s\"" % (" " * indent, token["data"]))
+
+        elif type == "SpaceCharacters":
+            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
+
+        else:
+            raise ValueError("Unknown token type, %s" % type)
+
+    return "\n".join(output)
diff --git a/html_cleaner/html5lib/treewalkers/base.py b/html_cleaner/html5lib/treewalkers/base.py
new file mode 100644
index 0000000..36e1ba2
--- /dev/null
+++ b/html_cleaner/html5lib/treewalkers/base.py
@@ -0,0 +1,150 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.dom import Node
+from ..constants import namespaces, voidElements, spaceCharacters
+
+__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
+           "TreeWalker", "NonRecursiveTreeWalker"]
+
+DOCUMENT = Node.DOCUMENT_NODE
+DOCTYPE = Node.DOCUMENT_TYPE_NODE
+TEXT = Node.TEXT_NODE
+ELEMENT = Node.ELEMENT_NODE
+COMMENT = Node.COMMENT_NODE
+ENTITY = Node.ENTITY_NODE
+UNKNOWN = "<#UNKNOWN#>"
+
+spaceCharacters = "".join(spaceCharacters)
+
+
+class TreeWalker(object):
+    def __init__(self, tree):
+        self.tree = tree
+
+    def __iter__(self):
+        raise NotImplementedError
+
+    def error(self, msg):
+        return {"type": "SerializeError", "data": msg}
+
+    def emptyTag(self, namespace, name, attrs, hasChildren=False):
+        yield {"type": "EmptyTag", "name": name,
+               "namespace": namespace,
+               "data": attrs}
+        if hasChildren:
+            yield self.error("Void element has children")
+
+    def startTag(self, namespace, name, attrs):
+        return {"type": "StartTag",
+                "name": name,
+                "namespace": namespace,
+                "data": attrs}
+
+    def endTag(self, namespace, name):
+        return {"type": "EndTag",
+                "name": name,
+                "namespace": namespace}
+
+    def text(self, data):
+        data = data
+        middle = data.lstrip(spaceCharacters)
+        left = data[:len(data) - len(middle)]
+        if left:
+            yield {"type": "SpaceCharacters", "data": left}
+        data = middle
+        middle = data.rstrip(spaceCharacters)
+        right = data[len(middle):]
+        if middle:
+            yield {"type": "Characters", "data": middle}
+        if right:
+            yield {"type": "SpaceCharacters", "data": right}
+
+    def comment(self, data):
+        return {"type": "Comment", "data": data}
+
+    def doctype(self, name, publicId=None, systemId=None):
+        return {"type": "Doctype",
+                "name": name,
+                "publicId": publicId,
+                "systemId": systemId}
+
+    def entity(self, name):
+        return {"type": "Entity", "name": name}
+
+    def unknown(self, nodeType):
+        return self.error("Unknown node type: " + nodeType)
+
+
+class NonRecursiveTreeWalker(TreeWalker):
+    def getNodeDetails(self, node):
+        raise NotImplementedError
+
+    def getFirstChild(self, node):
+        raise NotImplementedError
+
+    def getNextSibling(self, node):
+        raise NotImplementedError
+
+    def getParentNode(self, node):
+        raise NotImplementedError
+
+    def __iter__(self):
+        currentNode = self.tree
+        while currentNode is not None:
+            details = self.getNodeDetails(currentNode)
+            type, details = details[0], details[1:]
+            hasChildren = False
+
+            if type == DOCTYPE:
+                yield self.doctype(*details)
+
+            elif type == TEXT:
+                for token in self.text(*details):
+                    yield token
+
+            elif type == ELEMENT:
+                namespace, name, attributes, hasChildren = details
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    for token in self.emptyTag(namespace, name, attributes,
+                                               hasChildren):
+                        yield token
+                    hasChildren = False
+                else:
+                    yield self.startTag(namespace, name, attributes)
+
+            elif type == COMMENT:
+                yield self.comment(details[0])
+
+            elif type == ENTITY:
+                yield self.entity(details[0])
+
+            elif type == DOCUMENT:
+                hasChildren = True
+
+            else:
+                yield self.unknown(details[0])
+
+            if hasChildren:
+                firstChild = self.getFirstChild(currentNode)
+            else:
+                firstChild = None
+
+            if firstChild is not None:
+                currentNode = firstChild
+            else:
+                while currentNode is not None:
+                    details = self.getNodeDetails(currentNode)
+                    type, details = details[0], details[1:]
+                    if type == ELEMENT:
+                        namespace, name, attributes, hasChildren = details
+                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
+                            yield self.endTag(namespace, name)
+                    if self.tree is currentNode:
+                        currentNode = None
+                        break
+                    nextSibling = self.getNextSibling(currentNode)
+                    if nextSibling is not None:
+                        currentNode = nextSibling
+                        break
+                    else:
+                        currentNode = self.getParentNode(currentNode)
diff --git a/html_cleaner/html5lib/treewalkers/dom.py b/html_cleaner/html5lib/treewalkers/dom.py
new file mode 100644
index 0000000..b0c89b0
--- /dev/null
+++ b/html_cleaner/html5lib/treewalkers/dom.py
@@ -0,0 +1,43 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.dom import Node
+
+from . import base
+
+
+class TreeWalker(base.NonRecursiveTreeWalker):
+    def getNodeDetails(self, node):
+        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
+            return base.DOCTYPE, node.name, node.publicId, node.systemId
+
+        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
+            return base.TEXT, node.nodeValue
+
+        elif node.nodeType == Node.ELEMENT_NODE:
+            attrs = {}
+            for attr in list(node.attributes.keys()):
+                attr = node.getAttributeNode(attr)
+                if attr.namespaceURI:
+                    attrs[(attr.namespaceURI, attr.localName)] = attr.value
+                else:
+                    attrs[(None, attr.name)] = attr.value
+            return (base.ELEMENT, node.namespaceURI, node.nodeName,
+                    attrs, node.hasChildNodes())
+
+        elif node.nodeType == Node.COMMENT_NODE:
+            return base.COMMENT, node.nodeValue
+
+        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
+            return (base.DOCUMENT,)
+
+        else:
+            return base.UNKNOWN, node.nodeType
+
+    def getFirstChild(self, node):
+        return node.firstChild
+
+    def getNextSibling(self, node):
+        return node.nextSibling
+
+    def getParentNode(self, node):
+        return node.parentNode
diff --git a/html_cleaner/html5lib/treewalkers/etree.py b/html_cleaner/html5lib/treewalkers/etree.py
new file mode 100644
index 0000000..8f30f07
--- /dev/null
+++ b/html_cleaner/html5lib/treewalkers/etree.py
@@ -0,0 +1,137 @@
+from __future__ import absolute_import, division, unicode_literals
+
+try:
+    from collections import OrderedDict
+except ImportError:
+    try:
+        from ordereddict import OrderedDict
+    except ImportError:
+        OrderedDict = dict
+
+import re
+
+from six import string_types
+
+from . import base
+from .._utils import moduleFactoryFactory
+
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
+
+def getETreeBuilder(ElementTreeImplementation):
+    ElementTree = ElementTreeImplementation
+    ElementTreeCommentType = ElementTree.Comment("asd").tag
+
+    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
+        """Given the particular ElementTree representation, this implementation,
+        to avoid using recursion, returns "nodes" as tuples with the following
+        content:
+
+        1. The current element
+
+        2. The index of the element relative to its parent
+
+        3. A stack of ancestor elements
+
+        4. A flag "text", "tail" or None to indicate if the current node is a
+           text node; either the text or tail of the current element (1)
+        """
+        def getNodeDetails(self, node):
+            if isinstance(node, tuple):  # It might be the root Element
+                elt, _, _, flag = node
+                if flag in ("text", "tail"):
+                    return base.TEXT, getattr(elt, flag)
+                else:
+                    node = elt
+
+            if not(hasattr(node, "tag")):
+                node = node.getroot()
+
+            if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
+                return (base.DOCUMENT,)
+
+            elif node.tag == "<!DOCTYPE>":
+                return (base.DOCTYPE, node.text,
+                        node.get("publicId"), node.get("systemId"))
+
+            elif node.tag == ElementTreeCommentType:
+                return base.COMMENT, node.text
+
+            else:
+                assert isinstance(node.tag, string_types), type(node.tag)
+                # This is assumed to be an ordinary element
+                match = tag_regexp.match(node.tag)
+                if match:
+                    namespace, tag = match.groups()
+                else:
+                    namespace = None
+                    tag = node.tag
+                attrs = OrderedDict()
+                for name, value in list(node.attrib.items()):
+                    match = tag_regexp.match(name)
+                    if match:
+                        attrs[(match.group(1), match.group(2))] = value
+                    else:
+                        attrs[(None, name)] = value
+                return (base.ELEMENT, namespace, tag,
+                        attrs, len(node) or node.text)
+
+        def getFirstChild(self, node):
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
+            else:
+                element, key, parents, flag = node, None, [], None
+
+            if flag in ("text", "tail"):
+                return None
+            else:
+                if element.text:
+                    return element, key, parents, "text"
+                elif len(element):
+                    parents.append(element)
+                    return element[0], 0, parents, None
+                else:
+                    return None
+
+        def getNextSibling(self, node):
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
+            else:
+                return None
+
+            if flag == "text":
+                if len(element):
+                    parents.append(element)
+                    return element[0], 0, parents, None
+                else:
+                    return None
+            else:
+                if element.tail and flag != "tail":
+                    return element, key, parents, "tail"
+                elif key < len(parents[-1]) - 1:
+                    return parents[-1][key + 1], key + 1, parents, None
+                else:
+                    return None
+
+        def getParentNode(self, node):
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
+            else:
+                return None
+
+            if flag == "text":
+                if not parents:
+                    return element
+                else:
+                    return element, key, parents, None
+            else:
+                parent = parents.pop()
+                if not parents:
+                    return parent
+                else:
+                    assert list(parents[-1]).count(parent) == 1
+                    return parent, list(parents[-1]).index(parent), parents, None
+
+    return locals()
+
+getETreeModule = moduleFactoryFactory(getETreeBuilder)
diff --git a/html_cleaner/html5lib/treewalkers/etree_lxml.py b/html_cleaner/html5lib/treewalkers/etree_lxml.py
new file mode 100644
index 0000000..fb23631
--- /dev/null
+++ b/html_cleaner/html5lib/treewalkers/etree_lxml.py
@@ -0,0 +1,213 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
+from lxml import etree
+from ..treebuilders.etree import tag_regexp
+
+from . import base
+
+from .. import _ihatexml
+
+
+def ensure_str(s):
+    if s is None:
+        return None
+    elif isinstance(s, text_type):
+        return s
+    else:
+        return s.decode("ascii", "strict")
+
+
+class Root(object):
+    def __init__(self, et):
+        self.elementtree = et
+        self.children = []
+
+        try:
+            if et.docinfo.internalDTD:
+                self.children.append(Doctype(self,
+                                             ensure_str(et.docinfo.root_name),
+                                             ensure_str(et.docinfo.public_id),
+                                             ensure_str(et.docinfo.system_url)))
+        except AttributeError:
+            pass
+
+        try:
+            node = et.getroot()
+        except AttributeError:
+            node = et
+
+        while node.getprevious() is not None:
+            node = node.getprevious()
+        while node is not None:
+            self.children.append(node)
+            node = node.getnext()
+
+        self.text = None
+        self.tail = None
+
+    def __getitem__(self, key):
+        return self.children[key]
+
+    def getnext(self):
+        return None
+
+    def __len__(self):
+        return 1
+
+
+class Doctype(object):
+    def __init__(self, root_node, name, public_id, system_id):
+        self.root_node = root_node
+        self.name = name
+        self.public_id = public_id
+        self.system_id = system_id
+
+        self.text = None
+        self.tail = None
+
+    def getnext(self):
+        return self.root_node.children[1]
+
+
+class FragmentRoot(Root):
+    def __init__(self, children):
+        self.children = [FragmentWrapper(self, child) for child in children]
+        self.text = self.tail = None
+
+    def getnext(self):
+        return None
+
+
+class FragmentWrapper(object):
+    def __init__(self, fragment_root, obj):
+        self.root_node = fragment_root
+        self.obj = obj
+        if hasattr(self.obj, 'text'):
+            self.text = ensure_str(self.obj.text)
+        else:
+            self.text = None
+        if hasattr(self.obj, 'tail'):
+            self.tail = ensure_str(self.obj.tail)
+        else:
+            self.tail = None
+
+    def __getattr__(self, name):
+        return getattr(self.obj, name)
+
+    def getnext(self):
+        siblings = self.root_node.children
+        idx = siblings.index(self)
+        if idx < len(siblings) - 1:
+            return siblings[idx + 1]
+        else:
+            return None
+
+    def __getitem__(self, key):
+        return self.obj[key]
+
+    def __bool__(self):
+        return bool(self.obj)
+
+    def getparent(self):
+        return None
+
+    def __str__(self):
+        return str(self.obj)
+
+    def __unicode__(self):
+        return str(self.obj)
+
+    def __len__(self):
+        return len(self.obj)
+
+
+class TreeWalker(base.NonRecursiveTreeWalker):
+    def __init__(self, tree):
+        # pylint:disable=redefined-variable-type
+        if isinstance(tree, list):
+            self.fragmentChildren = set(tree)
+            tree = FragmentRoot(tree)
+        else:
+            self.fragmentChildren = set()
+            tree = Root(tree)
+        base.NonRecursiveTreeWalker.__init__(self, tree)
+        self.filter = _ihatexml.InfosetFilter()
+
+    def getNodeDetails(self, node):
+        if isinstance(node, tuple):  # Text node
+            node, key = node
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
+            return base.TEXT, ensure_str(getattr(node, key))
+
+        elif isinstance(node, Root):
+            return (base.DOCUMENT,)
+
+        elif isinstance(node, Doctype):
+            return base.DOCTYPE, node.name, node.public_id, node.system_id
+
+        elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
+            return base.TEXT, ensure_str(node.obj)
+
+        elif node.tag == etree.Comment:
+            return base.COMMENT, ensure_str(node.text)
+
+        elif node.tag == etree.Entity:
+            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;
+
+        else:
+            # This is assumed to be an ordinary element
+            match = tag_regexp.match(ensure_str(node.tag))
+            if match:
+                namespace, tag = match.groups()
+            else:
+                namespace = None
+                tag = ensure_str(node.tag)
+            attrs = {}
+            for name, value in list(node.attrib.items()):
+                name = ensure_str(name)
+                value = ensure_str(value)
+                match = tag_regexp.match(name)
+                if match:
+                    attrs[(match.group(1), match.group(2))] = value
+                else:
+                    attrs[(None, name)] = value
+            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
+                    attrs, len(node) > 0 or node.text)
+
+    def getFirstChild(self, node):
+        assert not isinstance(node, tuple), "Text nodes have no children"
+
+        assert len(node) or node.text, "Node has no children"
+        if node.text:
+            return (node, "text")
+        else:
+            return node[0]
+
+    def getNextSibling(self, node):
+        if isinstance(node, tuple):  # Text node
+            node, key = node
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
+            if key == "text":
+                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
+                # because node[0] might evaluate to False if it has no child element
+                if len(node):
+                    return node[0]
+                else:
+                    return None
+            else:  # tail
+                return node.getnext()
+
+        return (node, "tail") if node.tail else node.getnext()
+
+    def getParentNode(self, node):
+        if isinstance(node, tuple):  # Text node
+            node, key = node
+            assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
+            if key == "text":
+                return node
+            # else: fallback to "normal" processing
+        elif node in self.fragmentChildren:
+            return None
+
+        return node.getparent()
diff --git a/html_cleaner/html5lib/treewalkers/genshi.py b/html_cleaner/html5lib/treewalkers/genshi.py
new file mode 100644
index 0000000..7483be2
--- /dev/null
+++ b/html_cleaner/html5lib/treewalkers/genshi.py
@@ -0,0 +1,69 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from genshi.core import QName
+from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
+from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
+
+from . import base
+
+from ..constants import voidElements, namespaces
+
+
+class TreeWalker(base.TreeWalker):
+    def __iter__(self):
+        # Buffer the events so we can pass in the following one
+        previous = None
+        for event in self.tree:
+            if previous is not None:
+                for token in self.tokens(previous, event):
+                    yield token
+            previous = event
+
+        # Don't forget the final event!
+        if previous is not None:
+            for token in self.tokens(previous, None):
+                yield token
+
+    def tokens(self, event, next):
+        kind, data, _ = event
+        if kind == START:
+            tag, attribs = data
+            name = tag.localname
+            namespace = tag.namespace
+            converted_attribs = {}
+            for k, v in attribs:
+                if isinstance(k, QName):
+                    converted_attribs[(k.namespace, k.localname)] = v
+                else:
+                    converted_attribs[(None, k)] = v
+
+            if namespace == namespaces["html"] and name in voidElements:
+                for token in self.emptyTag(namespace, name, converted_attribs,
+                                           not next or next[0] != END or
+                                           next[1] != tag):
+                    yield token
+            else:
+                yield self.startTag(namespace, name, converted_attribs)
+
+        elif kind == END:
+            name = data.localname
+            namespace = data.namespace
+            if namespace != namespaces["html"] or name not in voidElements:
+                yield self.endTag(namespace, name)
+
+        elif kind == COMMENT:
+            yield self.comment(data)
+
+        elif kind == TEXT:
+            for token in self.text(data):
+                yield token
+
+        elif kind == DOCTYPE:
+            yield self.doctype(*data)
+
+        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS,
+                      START_CDATA, END_CDATA, PI):
+            pass
+
+        else:
+            yield self.unknown(kind)
diff --git a/html_cleaner/main.py b/html_cleaner/main.py
new file mode 100644
index 0000000..3fb1743
--- /dev/null
+++ b/html_cleaner/main.py
@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+
+"""
+Anki Add-on: HTML Cleaner
+
+Main module
+
+Cleans and minifies HTML content of current field, removing extraneous
+tags and attributes copied over from apps like Word, etc.
+
+Copyright: (c) Glutanimate 2017
+License: GNU AGPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
+"""
+
+### USER CONFIGURATION START ###
+
+## BASIC
+
+html_clean_hotkey = "Alt+H"
+html_paste_hotkey = "Alt+V"
+
+## ADVANCED
+
+# Html tags to preserve
+keep_tags = ['blockquote', 'a', 'img', 'em', 'b', 'p', 'strong',
+        'h1', 'h2', 'h3', 'h4', 'h5', 'ul', 'ol', 'li', 'sub', 'sup',
+        'abbr', 'acronym', 'dl', 'dt', 'dd', 'cite',
+        'dft', 'br', 'table', 'tr', 'td', 'th', 'thead',
+        'tbody', 'tfoot', 'div', 'span', 'u', 'i']
+
+# Tag attributes to preserve
+keep_attrs = [ 'style', 'rev', 'prompt', 'color', 'colspan', 
+        'usemap', 'cols', 'accept', 'datetime', 'char', 
+        'accept-charset', 'shape', 'href', 'hreflang', 
+        'selected', 'frame', 'type', 'alt', 'nowrap', 
+        'border', 'axis', 'compact', 'rows', 'checked',
+        'for', 'start', 'hspace', 'charset', 'ismap', 'label',
+        'target', 'method', 'readonly', 'rel', 'valign', 'scope',
+        'size', 'cellspacing', 'cite', 'media', 'multiple', 'src',
+        'rules', 'nohref', 'action', 'rowspan', 'abbr', 'span', 'height',
+        'enctype', 'lang', 'disabled', 'name', 'charoff', 'clear', 'summary',
+        'value', 'longdesc', 'headers', 'vspace', 'noshade', 'coords', 'width',
+        'maxlength', 'cellpadding', 'title', 'align', 'dir', 'tabindex']
+
+# Styles to preserve in the style attribute
+keep_styles = ["color", "background", "font-weight", "font-family",
+                "font-style", "font-size", "text-decoration"]
+
+### USER CONFIGURATION END ###
+
+import sys
+import os
+import re
+
+# import modules from local path
+# (insert needed in order to skip system packages)
+sys.path.insert(0, os.path.dirname(__file__))
+import bleach
+
+from aqt.qt import *
+from aqt.editor import Editor
+from anki.hooks import wrap
+from anki.utils import json
+
+
+# insert linebreak after regex match
+brtags = (r"(</(div|p|br|li|ul|ol|blockquote|tr|"
+            "table|thead|tfoot|tbody|h[1-9]|)>)([^\n])")
+
+
+def cleanHtml(html):
+    cleaned = bleach.clean(html.replace("\n", ""),
+        tags = keep_tags,
+        attributes = keep_attrs,
+        styles = keep_styles,
+        strip = True
+        )
+    # remove empty style attributes, try to pretty-format tags
+    cleaned = cleaned.replace(' style=""', '').replace("\n", "")
+    cleaned = re.sub(brtags, r"\1\n\3", cleaned)
+    return cleaned
+
+def onHtmlClean(self):
+    modifiers = self.mw.app.queryKeyboardModifiers()
+    shift_and_click = modifiers == Qt.ShiftModifier
+    if shift_and_click:
+        self.onFieldUndo()
+        return
+    self.saveNow()
+    n = self.currentField
+    html = self.note.fields[n]
+    if not html:
+        return 
+    self._fieldUndo = (n, html)
+    cleaned = cleanHtml(html)
+    self.note.fields[n] = cleaned
+    self.loadNote()
+    self.web.setFocus()
+    self.web.eval("focusField(%d);" % n)
+
+
+def onFieldUndo(self):
+    if not hasattr(self, "_fieldUndo") or not self._fieldUndo:
+        return
+    n, html = self._fieldUndo
+    self.note.fields[n] = html
+    self.loadNote()
+    self.web.setFocus()
+    self.web.eval("focusField(%d);" % n)
+
+
+def onSetNote(self, note, hide, focus):
+    self._fieldUndo = None
+
+
+def onHtmlPaste(self):
+    mime = self.web.mungeClip(mode=QClipboard.Clipboard)
+    html = mime.html()
+    if not html:
+        return
+    html = cleanHtml(html)
+    self.web.eval("""
+        var pasteHTML = function(html) {
+            setFormat("inserthtml", html);
+        };
+        var filterHTML = function(html) {
+            // wrap it in <top> as we aren't allowed to change top level elements
+            var top = $.parseHTML("<ankitop>" + html + "</ankitop>")[0];
+            filterNode(top);
+            var outHtml = top.innerHTML;
+            // get rid of nbsp
+            outHtml = outHtml.replace(/&nbsp;/ig, " ");
+            //console.log(`input html: ${html}`);
+            //console.log(`outpt html: ${outHtml}`);
+            return outHtml;
+        };
+        pasteHTML(%s);
+        """ % json.dumps(html))
+
+def setupButtons(self):
+    self._addButton("CleanHtml", lambda: self.onHtmlClean(),
+        text="cH",
+        tip="Clean HTML ({})<br>Undo with shift-click".format(html_clean_hotkey),
+        key=html_clean_hotkey)
+    t = QShortcut(QKeySequence("Shift+"+html_clean_hotkey), self.parentWindow)
+    t.activated.connect(lambda: self.onFieldUndo())
+    t = QShortcut(QKeySequence(html_paste_hotkey), self.parentWindow)
+    t.activated.connect(lambda: self.onHtmlPaste())
+
+
+Editor.onHtmlPaste = onHtmlPaste
+Editor.onFieldUndo = onFieldUndo
+Editor.onHtmlClean = onHtmlClean
+Editor.setupButtons = wrap(Editor.setupButtons, setupButtons)
\ No newline at end of file
diff --git a/html_cleaner/six.py b/html_cleaner/six.py
new file mode 100644
index 0000000..190c023
--- /dev/null
+++ b/html_cleaner/six.py
@@ -0,0 +1,868 @@
+"""Utilities for writing code that runs on Python 2 and 3"""
+
+# Copyright (c) 2010-2015 Benjamin Peterson
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import absolute_import
+
+import functools
+import itertools
+import operator
+import sys
+import types
+
+__author__ = "Benjamin Peterson <benjamin@python.org>"
+__version__ = "1.10.0"
+
+
+# Useful for very coarse version differentiation.
+PY2 = sys.version_info[0] == 2
+PY3 = sys.version_info[0] == 3
+PY34 = sys.version_info[0:2] >= (3, 4)
+
+if PY3:
+    string_types = str,
+    integer_types = int,
+    class_types = type,
+    text_type = str
+    binary_type = bytes
+
+    MAXSIZE = sys.maxsize
+else:
+    string_types = basestring,
+    integer_types = (int, long)
+    class_types = (type, types.ClassType)
+    text_type = unicode
+    binary_type = str
+
+    if sys.platform.startswith("java"):
+        # Jython always uses 32 bits.
+        MAXSIZE = int((1 << 31) - 1)
+    else:
+        # It's possible to have sizeof(long) != sizeof(Py_ssize_t).
+        class X(object):
+
+            def __len__(self):
+                return 1 << 31
+        try:
+            len(X())
+        except OverflowError:
+            # 32-bit
+            MAXSIZE = int((1 << 31) - 1)
+        else:
+            # 64-bit
+            MAXSIZE = int((1 << 63) - 1)
+        del X
+
+
+def _add_doc(func, doc):
+    """Add documentation to a function."""
+    func.__doc__ = doc
+
+
+def _import_module(name):
+    """Import module, returning the module after the last dot."""
+    __import__(name)
+    return sys.modules[name]
+
+
+class _LazyDescr(object):
+
+    def __init__(self, name):
+        self.name = name
+
+    def __get__(self, obj, tp):
+        result = self._resolve()
+        setattr(obj, self.name, result)  # Invokes __set__.
+        try:
+            # This is a bit ugly, but it avoids running this again by
+            # removing this descriptor.
+            delattr(obj.__class__, self.name)
+        except AttributeError:
+            pass
+        return result
+
+
+class MovedModule(_LazyDescr):
+
+    def __init__(self, name, old, new=None):
+        super(MovedModule, self).__init__(name)
+        if PY3:
+            if new is None:
+                new = name
+            self.mod = new
+        else:
+            self.mod = old
+
+    def _resolve(self):
+        return _import_module(self.mod)
+
+    def __getattr__(self, attr):
+        _module = self._resolve()
+        value = getattr(_module, attr)
+        setattr(self, attr, value)
+        return value
+
+
+class _LazyModule(types.ModuleType):
+
+    def __init__(self, name):
+        super(_LazyModule, self).__init__(name)
+        self.__doc__ = self.__class__.__doc__
+
+    def __dir__(self):
+        attrs = ["__doc__", "__name__"]
+        attrs += [attr.name for attr in self._moved_attributes]
+        return attrs
+
+    # Subclasses should override this
+    _moved_attributes = []
+
+
+class MovedAttribute(_LazyDescr):
+
+    def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None):
+        super(MovedAttribute, self).__init__(name)
+        if PY3:
+            if new_mod is None:
+                new_mod = name
+            self.mod = new_mod
+            if new_attr is None:
+                if old_attr is None:
+                    new_attr = name
+                else:
+                    new_attr = old_attr
+            self.attr = new_attr
+        else:
+            self.mod = old_mod
+            if old_attr is None:
+                old_attr = name
+            self.attr = old_attr
+
+    def _resolve(self):
+        module = _import_module(self.mod)
+        return getattr(module, self.attr)
+
+
+class _SixMetaPathImporter(object):
+
+    """
+    A meta path importer to import six.moves and its submodules.
+
+    This class implements a PEP302 finder and loader. It should be compatible
+    with Python 2.5 and all existing versions of Python3
+    """
+
+    def __init__(self, six_module_name):
+        self.name = six_module_name
+        self.known_modules = {}
+
+    def _add_module(self, mod, *fullnames):
+        for fullname in fullnames:
+            self.known_modules[self.name + "." + fullname] = mod
+
+    def _get_module(self, fullname):
+        return self.known_modules[self.name + "." + fullname]
+
+    def find_module(self, fullname, path=None):
+        if fullname in self.known_modules:
+            return self
+        return None
+
+    def __get_module(self, fullname):
+        try:
+            return self.known_modules[fullname]
+        except KeyError:
+            raise ImportError("This loader does not know module " + fullname)
+
+    def load_module(self, fullname):
+        try:
+            # in case of a reload
+            return sys.modules[fullname]
+        except KeyError:
+            pass
+        mod = self.__get_module(fullname)
+        if isinstance(mod, MovedModule):
+            mod = mod._resolve()
+        else:
+            mod.__loader__ = self
+        sys.modules[fullname] = mod
+        return mod
+
+    def is_package(self, fullname):
+        """
+        Return true, if the named module is a package.
+
+        We need this method to get correct spec objects with
+        Python 3.4 (see PEP451)
+        """
+        return hasattr(self.__get_module(fullname), "__path__")
+
+    def get_code(self, fullname):
+        """Return None
+
+        Required, if is_package is implemented"""
+        self.__get_module(fullname)  # eventually raises ImportError
+        return None
+    get_source = get_code  # same as get_code
+
+_importer = _SixMetaPathImporter(__name__)
+
+
+class _MovedItems(_LazyModule):
+
+    """Lazy loading of moved objects"""
+    __path__ = []  # mark as package
+
+
+_moved_attributes = [
+    MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"),
+    MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"),
+    MovedAttribute("filterfalse", "itertools", "itertools", "ifilterfalse", "filterfalse"),
+    MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"),
+    MovedAttribute("intern", "__builtin__", "sys"),
+    MovedAttribute("map", "itertools", "builtins", "imap", "map"),
+    MovedAttribute("getcwd", "os", "os", "getcwdu", "getcwd"),
+    MovedAttribute("getcwdb", "os", "os", "getcwd", "getcwdb"),
+    MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"),
+    MovedAttribute("reload_module", "__builtin__", "importlib" if PY34 else "imp", "reload"),
+    MovedAttribute("reduce", "__builtin__", "functools"),
+    MovedAttribute("shlex_quote", "pipes", "shlex", "quote"),
+    MovedAttribute("StringIO", "StringIO", "io"),
+    MovedAttribute("UserDict", "UserDict", "collections"),
+    MovedAttribute("UserList", "UserList", "collections"),
+    MovedAttribute("UserString", "UserString", "collections"),
+    MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"),
+    MovedAttribute("zip", "itertools", "builtins", "izip", "zip"),
+    MovedAttribute("zip_longest", "itertools", "itertools", "izip_longest", "zip_longest"),
+    MovedModule("builtins", "__builtin__"),
+    MovedModule("configparser", "ConfigParser"),
+    MovedModule("copyreg", "copy_reg"),
+    MovedModule("dbm_gnu", "gdbm", "dbm.gnu"),
+    MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread"),
+    MovedModule("http_cookiejar", "cookielib", "http.cookiejar"),
+    MovedModule("http_cookies", "Cookie", "http.cookies"),
+    MovedModule("html_entities", "htmlentitydefs", "html.entities"),
+    MovedModule("html_parser", "HTMLParser", "html.parser"),
+    MovedModule("http_client", "httplib", "http.client"),
+    MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"),
+    MovedModule("email_mime_nonmultipart", "email.MIMENonMultipart", "email.mime.nonmultipart"),
+    MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"),
+    MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"),
+    MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"),
+    MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"),
+    MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"),
+    MovedModule("cPickle", "cPickle", "pickle"),
+    MovedModule("queue", "Queue"),
+    MovedModule("reprlib", "repr"),
+    MovedModule("socketserver", "SocketServer"),
+    MovedModule("_thread", "thread", "_thread"),
+    MovedModule("tkinter", "Tkinter"),
+    MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"),
+    MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"),
+    MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"),
+    MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"),
+    MovedModule("tkinter_tix", "Tix", "tkinter.tix"),
+    MovedModule("tkinter_ttk", "ttk", "tkinter.ttk"),
+    MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"),
+    MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"),
+    MovedModule("tkinter_colorchooser", "tkColorChooser",
+                "tkinter.colorchooser"),
+    MovedModule("tkinter_commondialog", "tkCommonDialog",
+                "tkinter.commondialog"),
+    MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"),
+    MovedModule("tkinter_font", "tkFont", "tkinter.font"),
+    MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"),
+    MovedModule("tkinter_tksimpledialog", "tkSimpleDialog",
+                "tkinter.simpledialog"),
+    MovedModule("urllib_parse", __name__ + ".moves.urllib_parse", "urllib.parse"),
+    MovedModule("urllib_error", __name__ + ".moves.urllib_error", "urllib.error"),
+    MovedModule("urllib", __name__ + ".moves.urllib", __name__ + ".moves.urllib"),
+    MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"),
+    MovedModule("xmlrpc_client", "xmlrpclib", "xmlrpc.client"),
+    MovedModule("xmlrpc_server", "SimpleXMLRPCServer", "xmlrpc.server"),
+]
+# Add windows specific modules.
+if sys.platform == "win32":
+    _moved_attributes += [
+        MovedModule("winreg", "_winreg"),
+    ]
+
+for attr in _moved_attributes:
+    setattr(_MovedItems, attr.name, attr)
+    if isinstance(attr, MovedModule):
+        _importer._add_module(attr, "moves." + attr.name)
+del attr
+
+_MovedItems._moved_attributes = _moved_attributes
+
+moves = _MovedItems(__name__ + ".moves")
+_importer._add_module(moves, "moves")
+
+
+class Module_six_moves_urllib_parse(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_parse"""
+
+
+_urllib_parse_moved_attributes = [
+    MovedAttribute("ParseResult", "urlparse", "urllib.parse"),
+    MovedAttribute("SplitResult", "urlparse", "urllib.parse"),
+    MovedAttribute("parse_qs", "urlparse", "urllib.parse"),
+    MovedAttribute("parse_qsl", "urlparse", "urllib.parse"),
+    MovedAttribute("urldefrag", "urlparse", "urllib.parse"),
+    MovedAttribute("urljoin", "urlparse", "urllib.parse"),
+    MovedAttribute("urlparse", "urlparse", "urllib.parse"),
+    MovedAttribute("urlsplit", "urlparse", "urllib.parse"),
+    MovedAttribute("urlunparse", "urlparse", "urllib.parse"),
+    MovedAttribute("urlunsplit", "urlparse", "urllib.parse"),
+    MovedAttribute("quote", "urllib", "urllib.parse"),
+    MovedAttribute("quote_plus", "urllib", "urllib.parse"),
+    MovedAttribute("unquote", "urllib", "urllib.parse"),
+    MovedAttribute("unquote_plus", "urllib", "urllib.parse"),
+    MovedAttribute("urlencode", "urllib", "urllib.parse"),
+    MovedAttribute("splitquery", "urllib", "urllib.parse"),
+    MovedAttribute("splittag", "urllib", "urllib.parse"),
+    MovedAttribute("splituser", "urllib", "urllib.parse"),
+    MovedAttribute("uses_fragment", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_netloc", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_params", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_query", "urlparse", "urllib.parse"),
+    MovedAttribute("uses_relative", "urlparse", "urllib.parse"),
+]
+for attr in _urllib_parse_moved_attributes:
+    setattr(Module_six_moves_urllib_parse, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_parse._moved_attributes = _urllib_parse_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_parse(__name__ + ".moves.urllib_parse"),
+                      "moves.urllib_parse", "moves.urllib.parse")
+
+
+class Module_six_moves_urllib_error(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_error"""
+
+
+_urllib_error_moved_attributes = [
+    MovedAttribute("URLError", "urllib2", "urllib.error"),
+    MovedAttribute("HTTPError", "urllib2", "urllib.error"),
+    MovedAttribute("ContentTooShortError", "urllib", "urllib.error"),
+]
+for attr in _urllib_error_moved_attributes:
+    setattr(Module_six_moves_urllib_error, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_error._moved_attributes = _urllib_error_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_error(__name__ + ".moves.urllib.error"),
+                      "moves.urllib_error", "moves.urllib.error")
+
+
+class Module_six_moves_urllib_request(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_request"""
+
+
+_urllib_request_moved_attributes = [
+    MovedAttribute("urlopen", "urllib2", "urllib.request"),
+    MovedAttribute("install_opener", "urllib2", "urllib.request"),
+    MovedAttribute("build_opener", "urllib2", "urllib.request"),
+    MovedAttribute("pathname2url", "urllib", "urllib.request"),
+    MovedAttribute("url2pathname", "urllib", "urllib.request"),
+    MovedAttribute("getproxies", "urllib", "urllib.request"),
+    MovedAttribute("Request", "urllib2", "urllib.request"),
+    MovedAttribute("OpenerDirector", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPDefaultErrorHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPRedirectHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPCookieProcessor", "urllib2", "urllib.request"),
+    MovedAttribute("ProxyHandler", "urllib2", "urllib.request"),
+    MovedAttribute("BaseHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPPasswordMgr", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPPasswordMgrWithDefaultRealm", "urllib2", "urllib.request"),
+    MovedAttribute("AbstractBasicAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPBasicAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("ProxyBasicAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("AbstractDigestAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPDigestAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("ProxyDigestAuthHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPSHandler", "urllib2", "urllib.request"),
+    MovedAttribute("FileHandler", "urllib2", "urllib.request"),
+    MovedAttribute("FTPHandler", "urllib2", "urllib.request"),
+    MovedAttribute("CacheFTPHandler", "urllib2", "urllib.request"),
+    MovedAttribute("UnknownHandler", "urllib2", "urllib.request"),
+    MovedAttribute("HTTPErrorProcessor", "urllib2", "urllib.request"),
+    MovedAttribute("urlretrieve", "urllib", "urllib.request"),
+    MovedAttribute("urlcleanup", "urllib", "urllib.request"),
+    MovedAttribute("URLopener", "urllib", "urllib.request"),
+    MovedAttribute("FancyURLopener", "urllib", "urllib.request"),
+    MovedAttribute("proxy_bypass", "urllib", "urllib.request"),
+]
+for attr in _urllib_request_moved_attributes:
+    setattr(Module_six_moves_urllib_request, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_request._moved_attributes = _urllib_request_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_request(__name__ + ".moves.urllib.request"),
+                      "moves.urllib_request", "moves.urllib.request")
+
+
+class Module_six_moves_urllib_response(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_response"""
+
+
+_urllib_response_moved_attributes = [
+    MovedAttribute("addbase", "urllib", "urllib.response"),
+    MovedAttribute("addclosehook", "urllib", "urllib.response"),
+    MovedAttribute("addinfo", "urllib", "urllib.response"),
+    MovedAttribute("addinfourl", "urllib", "urllib.response"),
+]
+for attr in _urllib_response_moved_attributes:
+    setattr(Module_six_moves_urllib_response, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_response._moved_attributes = _urllib_response_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_response(__name__ + ".moves.urllib.response"),
+                      "moves.urllib_response", "moves.urllib.response")
+
+
+class Module_six_moves_urllib_robotparser(_LazyModule):
+
+    """Lazy loading of moved objects in six.moves.urllib_robotparser"""
+
+
+_urllib_robotparser_moved_attributes = [
+    MovedAttribute("RobotFileParser", "robotparser", "urllib.robotparser"),
+]
+for attr in _urllib_robotparser_moved_attributes:
+    setattr(Module_six_moves_urllib_robotparser, attr.name, attr)
+del attr
+
+Module_six_moves_urllib_robotparser._moved_attributes = _urllib_robotparser_moved_attributes
+
+_importer._add_module(Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib.robotparser"),
+                      "moves.urllib_robotparser", "moves.urllib.robotparser")
+
+
+class Module_six_moves_urllib(types.ModuleType):
+
+    """Create a six.moves.urllib namespace that resembles the Python 3 namespace"""
+    __path__ = []  # mark as package
+    parse = _importer._get_module("moves.urllib_parse")
+    error = _importer._get_module("moves.urllib_error")
+    request = _importer._get_module("moves.urllib_request")
+    response = _importer._get_module("moves.urllib_response")
+    robotparser = _importer._get_module("moves.urllib_robotparser")
+
+    def __dir__(self):
+        return ['parse', 'error', 'request', 'response', 'robotparser']
+
+_importer._add_module(Module_six_moves_urllib(__name__ + ".moves.urllib"),
+                      "moves.urllib")
+
+
+def add_move(move):
+    """Add an item to six.moves."""
+    setattr(_MovedItems, move.name, move)
+
+
+def remove_move(name):
+    """Remove item from six.moves."""
+    try:
+        delattr(_MovedItems, name)
+    except AttributeError:
+        try:
+            del moves.__dict__[name]
+        except KeyError:
+            raise AttributeError("no such move, %r" % (name,))
+
+
+if PY3:
+    _meth_func = "__func__"
+    _meth_self = "__self__"
+
+    _func_closure = "__closure__"
+    _func_code = "__code__"
+    _func_defaults = "__defaults__"
+    _func_globals = "__globals__"
+else:
+    _meth_func = "im_func"
+    _meth_self = "im_self"
+
+    _func_closure = "func_closure"
+    _func_code = "func_code"
+    _func_defaults = "func_defaults"
+    _func_globals = "func_globals"
+
+
+try:
+    advance_iterator = next
+except NameError:
+    def advance_iterator(it):
+        return it.next()
+next = advance_iterator
+
+
+try:
+    callable = callable
+except NameError:
+    def callable(obj):
+        return any("__call__" in klass.__dict__ for klass in type(obj).__mro__)
+
+
+if PY3:
+    def get_unbound_function(unbound):
+        return unbound
+
+    create_bound_method = types.MethodType
+
+    def create_unbound_method(func, cls):
+        return func
+
+    Iterator = object
+else:
+    def get_unbound_function(unbound):
+        return unbound.im_func
+
+    def create_bound_method(func, obj):
+        return types.MethodType(func, obj, obj.__class__)
+
+    def create_unbound_method(func, cls):
+        return types.MethodType(func, None, cls)
+
+    class Iterator(object):
+
+        def next(self):
+            return type(self).__next__(self)
+
+    callable = callable
+_add_doc(get_unbound_function,
+         """Get the function out of a possibly unbound function""")
+
+
+get_method_function = operator.attrgetter(_meth_func)
+get_method_self = operator.attrgetter(_meth_self)
+get_function_closure = operator.attrgetter(_func_closure)
+get_function_code = operator.attrgetter(_func_code)
+get_function_defaults = operator.attrgetter(_func_defaults)
+get_function_globals = operator.attrgetter(_func_globals)
+
+
+if PY3:
+    def iterkeys(d, **kw):
+        return iter(d.keys(**kw))
+
+    def itervalues(d, **kw):
+        return iter(d.values(**kw))
+
+    def iteritems(d, **kw):
+        return iter(d.items(**kw))
+
+    def iterlists(d, **kw):
+        return iter(d.lists(**kw))
+
+    viewkeys = operator.methodcaller("keys")
+
+    viewvalues = operator.methodcaller("values")
+
+    viewitems = operator.methodcaller("items")
+else:
+    def iterkeys(d, **kw):
+        return d.iterkeys(**kw)
+
+    def itervalues(d, **kw):
+        return d.itervalues(**kw)
+
+    def iteritems(d, **kw):
+        return d.iteritems(**kw)
+
+    def iterlists(d, **kw):
+        return d.iterlists(**kw)
+
+    viewkeys = operator.methodcaller("viewkeys")
+
+    viewvalues = operator.methodcaller("viewvalues")
+
+    viewitems = operator.methodcaller("viewitems")
+
+_add_doc(iterkeys, "Return an iterator over the keys of a dictionary.")
+_add_doc(itervalues, "Return an iterator over the values of a dictionary.")
+_add_doc(iteritems,
+         "Return an iterator over the (key, value) pairs of a dictionary.")
+_add_doc(iterlists,
+         "Return an iterator over the (key, [values]) pairs of a dictionary.")
+
+
+if PY3:
+    def b(s):
+        return s.encode("latin-1")
+
+    def u(s):
+        return s
+    unichr = chr
+    import struct
+    int2byte = struct.Struct(">B").pack
+    del struct
+    byte2int = operator.itemgetter(0)
+    indexbytes = operator.getitem
+    iterbytes = iter
+    import io
+    StringIO = io.StringIO
+    BytesIO = io.BytesIO
+    _assertCountEqual = "assertCountEqual"
+    if sys.version_info[1] <= 1:
+        _assertRaisesRegex = "assertRaisesRegexp"
+        _assertRegex = "assertRegexpMatches"
+    else:
+        _assertRaisesRegex = "assertRaisesRegex"
+        _assertRegex = "assertRegex"
+else:
+    def b(s):
+        return s
+    # Workaround for standalone backslash
+
+    def u(s):
+        return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
+    unichr = unichr
+    int2byte = chr
+
+    def byte2int(bs):
+        return ord(bs[0])
+
+    def indexbytes(buf, i):
+        return ord(buf[i])
+    iterbytes = functools.partial(itertools.imap, ord)
+    import StringIO
+    StringIO = BytesIO = StringIO.StringIO
+    _assertCountEqual = "assertItemsEqual"
+    _assertRaisesRegex = "assertRaisesRegexp"
+    _assertRegex = "assertRegexpMatches"
+_add_doc(b, """Byte literal""")
+_add_doc(u, """Text literal""")
+
+
+def assertCountEqual(self, *args, **kwargs):
+    return getattr(self, _assertCountEqual)(*args, **kwargs)
+
+
+def assertRaisesRegex(self, *args, **kwargs):
+    return getattr(self, _assertRaisesRegex)(*args, **kwargs)
+
+
+def assertRegex(self, *args, **kwargs):
+    return getattr(self, _assertRegex)(*args, **kwargs)
+
+
+if PY3:
+    exec_ = getattr(moves.builtins, "exec")
+
+    def reraise(tp, value, tb=None):
+        if value is None:
+            value = tp()
+        if value.__traceback__ is not tb:
+            raise value.with_traceback(tb)
+        raise value
+
+else:
+    def exec_(_code_, _globs_=None, _locs_=None):
+        """Execute code in a namespace."""
+        if _globs_ is None:
+            frame = sys._getframe(1)
+            _globs_ = frame.f_globals
+            if _locs_ is None:
+                _locs_ = frame.f_locals
+            del frame
+        elif _locs_ is None:
+            _locs_ = _globs_
+        exec("""exec _code_ in _globs_, _locs_""")
+
+    exec_("""def reraise(tp, value, tb=None):
+    raise tp, value, tb
+""")
+
+
+if sys.version_info[:2] == (3, 2):
+    exec_("""def raise_from(value, from_value):
+    if from_value is None:
+        raise value
+    raise value from from_value
+""")
+elif sys.version_info[:2] > (3, 2):
+    exec_("""def raise_from(value, from_value):
+    raise value from from_value
+""")
+else:
+    def raise_from(value, from_value):
+        raise value
+
+
+print_ = getattr(moves.builtins, "print", None)
+if print_ is None:
+    def print_(*args, **kwargs):
+        """The new-style print function for Python 2.4 and 2.5."""
+        fp = kwargs.pop("file", sys.stdout)
+        if fp is None:
+            return
+
+        def write(data):
+            if not isinstance(data, basestring):
+                data = str(data)
+            # If the file has an encoding, encode unicode with it.
+            if (isinstance(fp, file) and
+                    isinstance(data, unicode) and
+                    fp.encoding is not None):
+                errors = getattr(fp, "errors", None)
+                if errors is None:
+                    errors = "strict"
+                data = data.encode(fp.encoding, errors)
+            fp.write(data)
+        want_unicode = False
+        sep = kwargs.pop("sep", None)
+        if sep is not None:
+            if isinstance(sep, unicode):
+                want_unicode = True
+            elif not isinstance(sep, str):
+                raise TypeError("sep must be None or a string")
+        end = kwargs.pop("end", None)
+        if end is not None:
+            if isinstance(end, unicode):
+                want_unicode = True
+            elif not isinstance(end, str):
+                raise TypeError("end must be None or a string")
+        if kwargs:
+            raise TypeError("invalid keyword arguments to print()")
+        if not want_unicode:
+            for arg in args:
+                if isinstance(arg, unicode):
+                    want_unicode = True
+                    break
+        if want_unicode:
+            newline = unicode("\n")
+            space = unicode(" ")
+        else:
+            newline = "\n"
+            space = " "
+        if sep is None:
+            sep = space
+        if end is None:
+            end = newline
+        for i, arg in enumerate(args):
+            if i:
+                write(sep)
+            write(arg)
+        write(end)
+if sys.version_info[:2] < (3, 3):
+    _print = print_
+
+    def print_(*args, **kwargs):
+        fp = kwargs.get("file", sys.stdout)
+        flush = kwargs.pop("flush", False)
+        _print(*args, **kwargs)
+        if flush and fp is not None:
+            fp.flush()
+
+_add_doc(reraise, """Reraise an exception.""")
+
+if sys.version_info[0:2] < (3, 4):
+    def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS,
+              updated=functools.WRAPPER_UPDATES):
+        def wrapper(f):
+            f = functools.wraps(wrapped, assigned, updated)(f)
+            f.__wrapped__ = wrapped
+            return f
+        return wrapper
+else:
+    wraps = functools.wraps
+
+
+def with_metaclass(meta, *bases):
+    """Create a base class with a metaclass."""
+    # This requires a bit of explanation: the basic idea is to make a dummy
+    # metaclass for one level of class instantiation that replaces itself with
+    # the actual metaclass.
+    class metaclass(meta):
+
+        def __new__(cls, name, this_bases, d):
+            return meta(name, bases, d)
+    return type.__new__(metaclass, 'temporary_class', (), {})
+
+
+def add_metaclass(metaclass):
+    """Class decorator for creating a class with a metaclass."""
+    def wrapper(cls):
+        orig_vars = cls.__dict__.copy()
+        slots = orig_vars.get('__slots__')
+        if slots is not None:
+            if isinstance(slots, str):
+                slots = [slots]
+            for slots_var in slots:
+                orig_vars.pop(slots_var)
+        orig_vars.pop('__dict__', None)
+        orig_vars.pop('__weakref__', None)
+        return metaclass(cls.__name__, cls.__bases__, orig_vars)
+    return wrapper
+
+
+def python_2_unicode_compatible(klass):
+    """
+    A decorator that defines __unicode__ and __str__ methods under Python 2.
+    Under Python 3 it does nothing.
+
+    To support Python 2 and 3 with a single code base, define a __str__ method
+    returning text and apply this decorator to the class.
+    """
+    if PY2:
+        if '__str__' not in klass.__dict__:
+            raise ValueError("@python_2_unicode_compatible cannot be applied "
+                             "to %s because it doesn't define __str__()." %
+                             klass.__name__)
+        klass.__unicode__ = klass.__str__
+        klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
+    return klass
+
+
+# Complete the moves implementation.
+# This code is at the end of this module to speed up module loading.
+# Turn this module into a package.
+__path__ = []  # required for PEP 302 and PEP 451
+__package__ = __name__  # see PEP 366 @ReservedAssignment
+if globals().get("__spec__") is not None:
+    __spec__.submodule_search_locations = []  # PEP 451 @UndefinedVariable
+# Remove other six meta path importers, since they cause problems. This can
+# happen if six is removed from sys.modules and then reloaded. (Setuptools does
+# this for some reason.)
+if sys.meta_path:
+    for i, importer in enumerate(sys.meta_path):
+        # Here's some real nastiness: Another "instance" of the six module might
+        # be floating around. Therefore, we can't use isinstance() to check for
+        # the six meta path importer, since the other six instance will have
+        # inserted an importer with different class.
+        if (type(importer).__name__ == "_SixMetaPathImporter" and
+                importer.name == __name__):
+            del sys.meta_path[i]
+            break
+    del i, importer
+# Finally, add the importer to the meta path import hook.
+sys.meta_path.append(_importer)
diff --git a/html_cleaner/webencodings/__init__.py b/html_cleaner/webencodings/__init__.py
new file mode 100644
index 0000000..d21d697
--- /dev/null
+++ b/html_cleaner/webencodings/__init__.py
@@ -0,0 +1,342 @@
+# coding: utf-8
+"""
+
+    webencodings
+    ~~~~~~~~~~~~
+
+    This is a Python implementation of the `WHATWG Encoding standard
+    <http://encoding.spec.whatwg.org/>`. See README for details.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+from .labels import LABELS
+
+
+VERSION = '0.5.1'
+
+
+# Some names in Encoding are not valid Python aliases. Remap these.
+PYTHON_NAMES = {
+    'iso-8859-8-i': 'iso-8859-8',
+    'x-mac-cyrillic': 'mac-cyrillic',
+    'macintosh': 'mac-roman',
+    'windows-874': 'cp874'}
+
+CACHE = {}
+
+
+def ascii_lower(string):
+    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
+
+    :param string: An Unicode string.
+    :returns: A new Unicode string.
+
+    This is used for `ASCII case-insensitive
+    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
+    matching of encoding labels.
+    The same matching is also used, among other things,
+    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
+
+    This is different from the :meth:`~py:str.lower` method of Unicode strings
+    which also affect non-ASCII characters,
+    sometimes mapping them into the ASCII range:
+
+        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
+        >>> assert keyword.lower() == u'background'
+        >>> assert ascii_lower(keyword) != keyword.lower()
+        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
+
+    """
+    # This turns out to be faster than unicode.translate()
+    return string.encode('utf8').lower().decode('utf8')
+
+
+def lookup(label):
+    """
+    Look for an encoding by its label.
+    This is the spec’s `get an encoding
+    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
+    Supported labels are listed there.
+
+    :param label: A string.
+    :returns:
+        An :class:`Encoding` object, or :obj:`None` for an unknown label.
+
+    """
+    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
+    label = ascii_lower(label.strip('\t\n\f\r '))
+    name = LABELS.get(label)
+    if name is None:
+        return None
+    encoding = CACHE.get(name)
+    if encoding is None:
+        if name == 'x-user-defined':
+            from .x_user_defined import codec_info
+        else:
+            python_name = PYTHON_NAMES.get(name, name)
+            # Any python_name value that gets to here should be valid.
+            codec_info = codecs.lookup(python_name)
+        encoding = Encoding(name, codec_info)
+        CACHE[name] = encoding
+    return encoding
+
+
+def _get_encoding(encoding_or_label):
+    """
+    Accept either an encoding object or label.
+
+    :param encoding: An :class:`Encoding` object or a label string.
+    :returns: An :class:`Encoding` object.
+    :raises: :exc:`~exceptions.LookupError` for an unknown label.
+
+    """
+    if hasattr(encoding_or_label, 'codec_info'):
+        return encoding_or_label
+
+    encoding = lookup(encoding_or_label)
+    if encoding is None:
+        raise LookupError('Unknown encoding label: %r' % encoding_or_label)
+    return encoding
+
+
+class Encoding(object):
+    """Reresents a character encoding such as UTF-8,
+    that can be used for decoding or encoding.
+
+    .. attribute:: name
+
+        Canonical name of the encoding
+
+    .. attribute:: codec_info
+
+        The actual implementation of the encoding,
+        a stdlib :class:`~codecs.CodecInfo` object.
+        See :func:`codecs.register`.
+
+    """
+    def __init__(self, name, codec_info):
+        self.name = name
+        self.codec_info = codec_info
+
+    def __repr__(self):
+        return '<Encoding %s>' % self.name
+
+
+#: The UTF-8 encoding. Should be used for new content and formats.
+UTF8 = lookup('utf-8')
+
+_UTF16LE = lookup('utf-16le')
+_UTF16BE = lookup('utf-16be')
+
+
+def decode(input, fallback_encoding, errors='replace'):
+    """
+    Decode a single string.
+
+    :param input: A byte string
+    :param fallback_encoding:
+        An :class:`Encoding` object or a label string.
+        The encoding to use if :obj:`input` does note have a BOM.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :return:
+        A ``(output, encoding)`` tuple of an Unicode string
+        and an :obj:`Encoding`.
+
+    """
+    # Fail early if `encoding` is an invalid label.
+    fallback_encoding = _get_encoding(fallback_encoding)
+    bom_encoding, input = _detect_bom(input)
+    encoding = bom_encoding or fallback_encoding
+    return encoding.codec_info.decode(input, errors)[0], encoding
+
+
+def _detect_bom(input):
+    """Return (bom_encoding, input), with any BOM removed from the input."""
+    if input.startswith(b'\xFF\xFE'):
+        return _UTF16LE, input[2:]
+    if input.startswith(b'\xFE\xFF'):
+        return _UTF16BE, input[2:]
+    if input.startswith(b'\xEF\xBB\xBF'):
+        return UTF8, input[3:]
+    return None, input
+
+
+def encode(input, encoding=UTF8, errors='strict'):
+    """
+    Encode a single string.
+
+    :param input: An Unicode string.
+    :param encoding: An :class:`Encoding` object or a label string.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :return: A byte string.
+
+    """
+    return _get_encoding(encoding).codec_info.encode(input, errors)[0]
+
+
+def iter_decode(input, fallback_encoding, errors='replace'):
+    """
+    "Pull"-based decoder.
+
+    :param input:
+        An iterable of byte strings.
+
+        The input is first consumed just enough to determine the encoding
+        based on the precense of a BOM,
+        then consumed on demand when the return value is.
+    :param fallback_encoding:
+        An :class:`Encoding` object or a label string.
+        The encoding to use if :obj:`input` does note have a BOM.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :returns:
+        An ``(output, encoding)`` tuple.
+        :obj:`output` is an iterable of Unicode strings,
+        :obj:`encoding` is the :obj:`Encoding` that is being used.
+
+    """
+
+    decoder = IncrementalDecoder(fallback_encoding, errors)
+    generator = _iter_decode_generator(input, decoder)
+    encoding = next(generator)
+    return generator, encoding
+
+
+def _iter_decode_generator(input, decoder):
+    """Return a generator that first yields the :obj:`Encoding`,
+    then yields output chukns as Unicode strings.
+
+    """
+    decode = decoder.decode
+    input = iter(input)
+    for chunck in input:
+        output = decode(chunck)
+        if output:
+            assert decoder.encoding is not None
+            yield decoder.encoding
+            yield output
+            break
+    else:
+        # Input exhausted without determining the encoding
+        output = decode(b'', final=True)
+        assert decoder.encoding is not None
+        yield decoder.encoding
+        if output:
+            yield output
+        return
+
+    for chunck in input:
+        output = decode(chunck)
+        if output:
+            yield output
+    output = decode(b'', final=True)
+    if output:
+        yield output
+
+
+def iter_encode(input, encoding=UTF8, errors='strict'):
+    """
+    “Pull”-based encoder.
+
+    :param input: An iterable of Unicode strings.
+    :param encoding: An :class:`Encoding` object or a label string.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+    :returns: An iterable of byte strings.
+
+    """
+    # Fail early if `encoding` is an invalid label.
+    encode = IncrementalEncoder(encoding, errors).encode
+    return _iter_encode_generator(input, encode)
+
+
+def _iter_encode_generator(input, encode):
+    for chunck in input:
+        output = encode(chunck)
+        if output:
+            yield output
+    output = encode('', final=True)
+    if output:
+        yield output
+
+
+class IncrementalDecoder(object):
+    """
+    “Push”-based decoder.
+
+    :param fallback_encoding:
+        An :class:`Encoding` object or a label string.
+        The encoding to use if :obj:`input` does note have a BOM.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+    """
+    def __init__(self, fallback_encoding, errors='replace'):
+        # Fail early if `encoding` is an invalid label.
+        self._fallback_encoding = _get_encoding(fallback_encoding)
+        self._errors = errors
+        self._buffer = b''
+        self._decoder = None
+        #: The actual :class:`Encoding` that is being used,
+        #: or :obj:`None` if that is not determined yet.
+        #: (Ie. if there is not enough input yet to determine
+        #: if there is a BOM.)
+        self.encoding = None  # Not known yet.
+
+    def decode(self, input, final=False):
+        """Decode one chunk of the input.
+
+        :param input: A byte string.
+        :param final:
+            Indicate that no more input is available.
+            Must be :obj:`True` if this is the last call.
+        :returns: An Unicode string.
+
+        """
+        decoder = self._decoder
+        if decoder is not None:
+            return decoder(input, final)
+
+        input = self._buffer + input
+        encoding, input = _detect_bom(input)
+        if encoding is None:
+            if len(input) < 3 and not final:  # Not enough data yet.
+                self._buffer = input
+                return ''
+            else:  # No BOM
+                encoding = self._fallback_encoding
+        decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
+        self._decoder = decoder
+        self.encoding = encoding
+        return decoder(input, final)
+
+
+class IncrementalEncoder(object):
+    """
+    “Push”-based encoder.
+
+    :param encoding: An :class:`Encoding` object or a label string.
+    :param errors: Type of error handling. See :func:`codecs.register`.
+    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
+
+    .. method:: encode(input, final=False)
+
+        :param input: An Unicode string.
+        :param final:
+            Indicate that no more input is available.
+            Must be :obj:`True` if this is the last call.
+        :returns: A byte string.
+
+    """
+    def __init__(self, encoding=UTF8, errors='strict'):
+        encoding = _get_encoding(encoding)
+        self.encode = encoding.codec_info.incrementalencoder(errors).encode
diff --git a/html_cleaner/webencodings/labels.py b/html_cleaner/webencodings/labels.py
new file mode 100644
index 0000000..29cbf91
--- /dev/null
+++ b/html_cleaner/webencodings/labels.py
@@ -0,0 +1,231 @@
+"""
+
+    webencodings.labels
+    ~~~~~~~~~~~~~~~~~~~
+
+    Map encoding labels to their name.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+    'unicode-1-1-utf-8':   'utf-8',
+    'utf-8':               'utf-8',
+    'utf8':                'utf-8',
+    '866':                 'ibm866',
+    'cp866':               'ibm866',
+    'csibm866':            'ibm866',
+    'ibm866':              'ibm866',
+    'csisolatin2':         'iso-8859-2',
+    'iso-8859-2':          'iso-8859-2',
+    'iso-ir-101':          'iso-8859-2',
+    'iso8859-2':           'iso-8859-2',
+    'iso88592':            'iso-8859-2',
+    'iso_8859-2':          'iso-8859-2',
+    'iso_8859-2:1987':     'iso-8859-2',
+    'l2':                  'iso-8859-2',
+    'latin2':              'iso-8859-2',
+    'csisolatin3':         'iso-8859-3',
+    'iso-8859-3':          'iso-8859-3',
+    'iso-ir-109':          'iso-8859-3',
+    'iso8859-3':           'iso-8859-3',
+    'iso88593':            'iso-8859-3',
+    'iso_8859-3':          'iso-8859-3',
+    'iso_8859-3:1988':     'iso-8859-3',
+    'l3':                  'iso-8859-3',
+    'latin3':              'iso-8859-3',
+    'csisolatin4':         'iso-8859-4',
+    'iso-8859-4':          'iso-8859-4',
+    'iso-ir-110':          'iso-8859-4',
+    'iso8859-4':           'iso-8859-4',
+    'iso88594':            'iso-8859-4',
+    'iso_8859-4':          'iso-8859-4',
+    'iso_8859-4:1988':     'iso-8859-4',
+    'l4':                  'iso-8859-4',
+    'latin4':              'iso-8859-4',
+    'csisolatincyrillic':  'iso-8859-5',
+    'cyrillic':            'iso-8859-5',
+    'iso-8859-5':          'iso-8859-5',
+    'iso-ir-144':          'iso-8859-5',
+    'iso8859-5':           'iso-8859-5',
+    'iso88595':            'iso-8859-5',
+    'iso_8859-5':          'iso-8859-5',
+    'iso_8859-5:1988':     'iso-8859-5',
+    'arabic':              'iso-8859-6',
+    'asmo-708':            'iso-8859-6',
+    'csiso88596e':         'iso-8859-6',
+    'csiso88596i':         'iso-8859-6',
+    'csisolatinarabic':    'iso-8859-6',
+    'ecma-114':            'iso-8859-6',
+    'iso-8859-6':          'iso-8859-6',
+    'iso-8859-6-e':        'iso-8859-6',
+    'iso-8859-6-i':        'iso-8859-6',
+    'iso-ir-127':          'iso-8859-6',
+    'iso8859-6':           'iso-8859-6',
+    'iso88596':            'iso-8859-6',
+    'iso_8859-6':          'iso-8859-6',
+    'iso_8859-6:1987':     'iso-8859-6',
+    'csisolatingreek':     'iso-8859-7',
+    'ecma-118':            'iso-8859-7',
+    'elot_928':            'iso-8859-7',
+    'greek':               'iso-8859-7',
+    'greek8':              'iso-8859-7',
+    'iso-8859-7':          'iso-8859-7',
+    'iso-ir-126':          'iso-8859-7',
+    'iso8859-7':           'iso-8859-7',
+    'iso88597':            'iso-8859-7',
+    'iso_8859-7':          'iso-8859-7',
+    'iso_8859-7:1987':     'iso-8859-7',
+    'sun_eu_greek':        'iso-8859-7',
+    'csiso88598e':         'iso-8859-8',
+    'csisolatinhebrew':    'iso-8859-8',
+    'hebrew':              'iso-8859-8',
+    'iso-8859-8':          'iso-8859-8',
+    'iso-8859-8-e':        'iso-8859-8',
+    'iso-ir-138':          'iso-8859-8',
+    'iso8859-8':           'iso-8859-8',
+    'iso88598':            'iso-8859-8',
+    'iso_8859-8':          'iso-8859-8',
+    'iso_8859-8:1988':     'iso-8859-8',
+    'visual':              'iso-8859-8',
+    'csiso88598i':         'iso-8859-8-i',
+    'iso-8859-8-i':        'iso-8859-8-i',
+    'logical':             'iso-8859-8-i',
+    'csisolatin6':         'iso-8859-10',
+    'iso-8859-10':         'iso-8859-10',
+    'iso-ir-157':          'iso-8859-10',
+    'iso8859-10':          'iso-8859-10',
+    'iso885910':           'iso-8859-10',
+    'l6':                  'iso-8859-10',
+    'latin6':              'iso-8859-10',
+    'iso-8859-13':         'iso-8859-13',
+    'iso8859-13':          'iso-8859-13',
+    'iso885913':           'iso-8859-13',
+    'iso-8859-14':         'iso-8859-14',
+    'iso8859-14':          'iso-8859-14',
+    'iso885914':           'iso-8859-14',
+    'csisolatin9':         'iso-8859-15',
+    'iso-8859-15':         'iso-8859-15',
+    'iso8859-15':          'iso-8859-15',
+    'iso885915':           'iso-8859-15',
+    'iso_8859-15':         'iso-8859-15',
+    'l9':                  'iso-8859-15',
+    'iso-8859-16':         'iso-8859-16',
+    'cskoi8r':             'koi8-r',
+    'koi':                 'koi8-r',
+    'koi8':                'koi8-r',
+    'koi8-r':              'koi8-r',
+    'koi8_r':              'koi8-r',
+    'koi8-u':              'koi8-u',
+    'csmacintosh':         'macintosh',
+    'mac':                 'macintosh',
+    'macintosh':           'macintosh',
+    'x-mac-roman':         'macintosh',
+    'dos-874':             'windows-874',
+    'iso-8859-11':         'windows-874',
+    'iso8859-11':          'windows-874',
+    'iso885911':           'windows-874',
+    'tis-620':             'windows-874',
+    'windows-874':         'windows-874',
+    'cp1250':              'windows-1250',
+    'windows-1250':        'windows-1250',
+    'x-cp1250':            'windows-1250',
+    'cp1251':              'windows-1251',
+    'windows-1251':        'windows-1251',
+    'x-cp1251':            'windows-1251',
+    'ansi_x3.4-1968':      'windows-1252',
+    'ascii':               'windows-1252',
+    'cp1252':              'windows-1252',
+    'cp819':               'windows-1252',
+    'csisolatin1':         'windows-1252',
+    'ibm819':              'windows-1252',
+    'iso-8859-1':          'windows-1252',
+    'iso-ir-100':          'windows-1252',
+    'iso8859-1':           'windows-1252',
+    'iso88591':            'windows-1252',
+    'iso_8859-1':          'windows-1252',
+    'iso_8859-1:1987':     'windows-1252',
+    'l1':                  'windows-1252',
+    'latin1':              'windows-1252',
+    'us-ascii':            'windows-1252',
+    'windows-1252':        'windows-1252',
+    'x-cp1252':            'windows-1252',
+    'cp1253':              'windows-1253',
+    'windows-1253':        'windows-1253',
+    'x-cp1253':            'windows-1253',
+    'cp1254':              'windows-1254',
+    'csisolatin5':         'windows-1254',
+    'iso-8859-9':          'windows-1254',
+    'iso-ir-148':          'windows-1254',
+    'iso8859-9':           'windows-1254',
+    'iso88599':            'windows-1254',
+    'iso_8859-9':          'windows-1254',
+    'iso_8859-9:1989':     'windows-1254',
+    'l5':                  'windows-1254',
+    'latin5':              'windows-1254',
+    'windows-1254':        'windows-1254',
+    'x-cp1254':            'windows-1254',
+    'cp1255':              'windows-1255',
+    'windows-1255':        'windows-1255',
+    'x-cp1255':            'windows-1255',
+    'cp1256':              'windows-1256',
+    'windows-1256':        'windows-1256',
+    'x-cp1256':            'windows-1256',
+    'cp1257':              'windows-1257',
+    'windows-1257':        'windows-1257',
+    'x-cp1257':            'windows-1257',
+    'cp1258':              'windows-1258',
+    'windows-1258':        'windows-1258',
+    'x-cp1258':            'windows-1258',
+    'x-mac-cyrillic':      'x-mac-cyrillic',
+    'x-mac-ukrainian':     'x-mac-cyrillic',
+    'chinese':             'gbk',
+    'csgb2312':            'gbk',
+    'csiso58gb231280':     'gbk',
+    'gb2312':              'gbk',
+    'gb_2312':             'gbk',
+    'gb_2312-80':          'gbk',
+    'gbk':                 'gbk',
+    'iso-ir-58':           'gbk',
+    'x-gbk':               'gbk',
+    'gb18030':             'gb18030',
+    'hz-gb-2312':          'hz-gb-2312',
+    'big5':                'big5',
+    'big5-hkscs':          'big5',
+    'cn-big5':             'big5',
+    'csbig5':              'big5',
+    'x-x-big5':            'big5',
+    'cseucpkdfmtjapanese': 'euc-jp',
+    'euc-jp':              'euc-jp',
+    'x-euc-jp':            'euc-jp',
+    'csiso2022jp':         'iso-2022-jp',
+    'iso-2022-jp':         'iso-2022-jp',
+    'csshiftjis':          'shift_jis',
+    'ms_kanji':            'shift_jis',
+    'shift-jis':           'shift_jis',
+    'shift_jis':           'shift_jis',
+    'sjis':                'shift_jis',
+    'windows-31j':         'shift_jis',
+    'x-sjis':              'shift_jis',
+    'cseuckr':             'euc-kr',
+    'csksc56011987':       'euc-kr',
+    'euc-kr':              'euc-kr',
+    'iso-ir-149':          'euc-kr',
+    'korean':              'euc-kr',
+    'ks_c_5601-1987':      'euc-kr',
+    'ks_c_5601-1989':      'euc-kr',
+    'ksc5601':             'euc-kr',
+    'ksc_5601':            'euc-kr',
+    'windows-949':         'euc-kr',
+    'csiso2022kr':         'iso-2022-kr',
+    'iso-2022-kr':         'iso-2022-kr',
+    'utf-16be':            'utf-16be',
+    'utf-16':              'utf-16le',
+    'utf-16le':            'utf-16le',
+    'x-user-defined':      'x-user-defined',
+}
diff --git a/html_cleaner/webencodings/mklabels.py b/html_cleaner/webencodings/mklabels.py
new file mode 100644
index 0000000..295dc92
--- /dev/null
+++ b/html_cleaner/webencodings/mklabels.py
@@ -0,0 +1,59 @@
+"""
+
+    webencodings.mklabels
+    ~~~~~~~~~~~~~~~~~~~~~
+
+    Regenarate the webencodings.labels module.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+import json
+try:
+    from urllib import urlopen
+except ImportError:
+    from urllib.request import urlopen
+
+
+def assert_lower(string):
+    assert string == string.lower()
+    return string
+
+
+def generate(url):
+    parts = ['''\
+"""
+
+    webencodings.labels
+    ~~~~~~~~~~~~~~~~~~~
+
+    Map encoding labels to their name.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+# XXX Do not edit!
+# This file is automatically generated by mklabels.py
+
+LABELS = {
+''']
+    labels = [
+        (repr(assert_lower(label)).lstrip('u'),
+         repr(encoding['name']).lstrip('u'))
+        for category in json.loads(urlopen(url).read().decode('ascii'))
+        for encoding in category['encodings']
+        for label in encoding['labels']]
+    max_len = max(len(label) for label, name in labels)
+    parts.extend(
+        '    %s:%s %s,\n' % (label, ' ' * (max_len - len(label)), name)
+        for label, name in labels)
+    parts.append('}')
+    return ''.join(parts)
+
+
+if __name__ == '__main__':
+    print(generate('http://encoding.spec.whatwg.org/encodings.json'))
diff --git a/html_cleaner/webencodings/x_user_defined.py b/html_cleaner/webencodings/x_user_defined.py
new file mode 100644
index 0000000..d16e326
--- /dev/null
+++ b/html_cleaner/webencodings/x_user_defined.py
@@ -0,0 +1,325 @@
+# coding: utf-8
+"""
+
+    webencodings.x_user_defined
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    An implementation of the x-user-defined encoding.
+
+    :copyright: Copyright 2012 by Simon Sapin
+    :license: BSD, see LICENSE for details.
+
+"""
+
+from __future__ import unicode_literals
+
+import codecs
+
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+    def encode(self, input, errors='strict'):
+        return codecs.charmap_encode(input, errors, encoding_table)
+
+    def decode(self, input, errors='strict'):
+        return codecs.charmap_decode(input, errors, decoding_table)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def encode(self, input, final=False):
+        return codecs.charmap_encode(input, self.errors, encoding_table)[0]
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+    def decode(self, input, final=False):
+        return codecs.charmap_decode(input, self.errors, decoding_table)[0]
+
+
+class StreamWriter(Codec, codecs.StreamWriter):
+    pass
+
+
+class StreamReader(Codec, codecs.StreamReader):
+    pass
+
+
+### encodings module API
+
+codec_info = codecs.CodecInfo(
+    name='x-user-defined',
+    encode=Codec().encode,
+    decode=Codec().decode,
+    incrementalencoder=IncrementalEncoder,
+    incrementaldecoder=IncrementalDecoder,
+    streamreader=StreamReader,
+    streamwriter=StreamWriter,
+)
+
+
+### Decoding Table
+
+# Python 3:
+# for c in range(256): print('    %r' % chr(c if c < 128 else c + 0xF700))
+decoding_table = (
+    '\x00'
+    '\x01'
+    '\x02'
+    '\x03'
+    '\x04'
+    '\x05'
+    '\x06'
+    '\x07'
+    '\x08'
+    '\t'
+    '\n'
+    '\x0b'
+    '\x0c'
+    '\r'
+    '\x0e'
+    '\x0f'
+    '\x10'
+    '\x11'
+    '\x12'
+    '\x13'
+    '\x14'
+    '\x15'
+    '\x16'
+    '\x17'
+    '\x18'
+    '\x19'
+    '\x1a'
+    '\x1b'
+    '\x1c'
+    '\x1d'
+    '\x1e'
+    '\x1f'
+    ' '
+    '!'
+    '"'
+    '#'
+    '$'
+    '%'
+    '&'
+    "'"
+    '('
+    ')'
+    '*'
+    '+'
+    ','
+    '-'
+    '.'
+    '/'
+    '0'
+    '1'
+    '2'
+    '3'
+    '4'
+    '5'
+    '6'
+    '7'
+    '8'
+    '9'
+    ':'
+    ';'
+    '<'
+    '='
+    '>'
+    '?'
+    '@'
+    'A'
+    'B'
+    'C'
+    'D'
+    'E'
+    'F'
+    'G'
+    'H'
+    'I'
+    'J'
+    'K'
+    'L'
+    'M'
+    'N'
+    'O'
+    'P'
+    'Q'
+    'R'
+    'S'
+    'T'
+    'U'
+    'V'
+    'W'
+    'X'
+    'Y'
+    'Z'
+    '['
+    '\\'
+    ']'
+    '^'
+    '_'
+    '`'
+    'a'
+    'b'
+    'c'
+    'd'
+    'e'
+    'f'
+    'g'
+    'h'
+    'i'
+    'j'
+    'k'
+    'l'
+    'm'
+    'n'
+    'o'
+    'p'
+    'q'
+    'r'
+    's'
+    't'
+    'u'
+    'v'
+    'w'
+    'x'
+    'y'
+    'z'
+    '{'
+    '|'
+    '}'
+    '~'
+    '\x7f'
+    '\uf780'
+    '\uf781'
+    '\uf782'
+    '\uf783'
+    '\uf784'
+    '\uf785'
+    '\uf786'
+    '\uf787'
+    '\uf788'
+    '\uf789'
+    '\uf78a'
+    '\uf78b'
+    '\uf78c'
+    '\uf78d'
+    '\uf78e'
+    '\uf78f'
+    '\uf790'
+    '\uf791'
+    '\uf792'
+    '\uf793'
+    '\uf794'
+    '\uf795'
+    '\uf796'
+    '\uf797'
+    '\uf798'
+    '\uf799'
+    '\uf79a'
+    '\uf79b'
+    '\uf79c'
+    '\uf79d'
+    '\uf79e'
+    '\uf79f'
+    '\uf7a0'
+    '\uf7a1'
+    '\uf7a2'
+    '\uf7a3'
+    '\uf7a4'
+    '\uf7a5'
+    '\uf7a6'
+    '\uf7a7'
+    '\uf7a8'
+    '\uf7a9'
+    '\uf7aa'
+    '\uf7ab'
+    '\uf7ac'
+    '\uf7ad'
+    '\uf7ae'
+    '\uf7af'
+    '\uf7b0'
+    '\uf7b1'
+    '\uf7b2'
+    '\uf7b3'
+    '\uf7b4'
+    '\uf7b5'
+    '\uf7b6'
+    '\uf7b7'
+    '\uf7b8'
+    '\uf7b9'
+    '\uf7ba'
+    '\uf7bb'
+    '\uf7bc'
+    '\uf7bd'
+    '\uf7be'
+    '\uf7bf'
+    '\uf7c0'
+    '\uf7c1'
+    '\uf7c2'
+    '\uf7c3'
+    '\uf7c4'
+    '\uf7c5'
+    '\uf7c6'
+    '\uf7c7'
+    '\uf7c8'
+    '\uf7c9'
+    '\uf7ca'
+    '\uf7cb'
+    '\uf7cc'
+    '\uf7cd'
+    '\uf7ce'
+    '\uf7cf'
+    '\uf7d0'
+    '\uf7d1'
+    '\uf7d2'
+    '\uf7d3'
+    '\uf7d4'
+    '\uf7d5'
+    '\uf7d6'
+    '\uf7d7'
+    '\uf7d8'
+    '\uf7d9'
+    '\uf7da'
+    '\uf7db'
+    '\uf7dc'
+    '\uf7dd'
+    '\uf7de'
+    '\uf7df'
+    '\uf7e0'
+    '\uf7e1'
+    '\uf7e2'
+    '\uf7e3'
+    '\uf7e4'
+    '\uf7e5'
+    '\uf7e6'
+    '\uf7e7'
+    '\uf7e8'
+    '\uf7e9'
+    '\uf7ea'
+    '\uf7eb'
+    '\uf7ec'
+    '\uf7ed'
+    '\uf7ee'
+    '\uf7ef'
+    '\uf7f0'
+    '\uf7f1'
+    '\uf7f2'
+    '\uf7f3'
+    '\uf7f4'
+    '\uf7f5'
+    '\uf7f6'
+    '\uf7f7'
+    '\uf7f8'
+    '\uf7f9'
+    '\uf7fa'
+    '\uf7fb'
+    '\uf7fc'
+    '\uf7fd'
+    '\uf7fe'
+    '\uf7ff'
+)
+
+### Encoding table
+encoding_table = codecs.charmap_build(decoding_table)
diff --git a/tools/test.py b/tools/test.py
new file mode 100644
index 0000000..0fe0276
--- /dev/null
+++ b/tools/test.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+import re
+import sys
+# import modules from local path
+# (insert needed in order to skip system packages)
+sys.path.insert(0, "../html_cleaner")
+import bleach
+
+
+html  = u"""
+<div><b>EXAMPLE 1</b></div>
+<!--StartFragment--><span style='font-size:15.0pt;mso-bidi-font-size:12.0pt;font-family:""Times New Roman"",""serif"";color:black;mso-ansi-language:EN;mso-fareast-language:EN;mso-bidi-language: SW-AS'>LOREM DOLOR <b style=""mso-bidi-font-weight:normal"">SIT</b> <u>AMET</u></span><!--EndFragment-->
+<div><b>EXAMPLE 2</b></div>
+<span style="color: rgb(51, 51, 51);font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif;font-size: 18px">Lorem ipsum doler sit amet&nbsp;</span><a href="http://www.spiegel.de/" title="Lorem" class="text-link-int lp-text-link-int" style="color: rgb(153, 0, 0); font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif; font-size: 18px;">Lorem ipsum</a><span style="color: rgb(51, 51, 51); font-family: SpiegelSansWeb, Calibri, Candara, Arial, Helvetica, sans-serif; font-size: 18px;">&nbsp;ipsum dolore sit amet.</span>
+"""
+
+# Html tags to preserve
+keep_tags = ['blockquote', 'a', 'img', 'em', 'b', 'p', 'strong',
+        'h1', 'h2', 'h3', 'h4', 'h5', 'ul', 'ol', 'li', 'sub', 'sup',
+        'abbr', 'acronym', 'dl', 'dt', 'dd', 'cite',
+        'dft', 'br', 'table', 'tr', 'td', 'th', 'thead',
+        'tbody', 'tfoot', 'div', 'span']
+
+# Tag attributes to preserve
+keep_attrs = ['rev', 'prompt', 'color', 'colspan', 'accesskey', 
+        'usemap', 'cols', 'accept', 'datetime', 'char', 
+        'accept-charset', 'shape', 'href', 'hreflang', 
+        'selected', 'frame', 'type', 'alt', 'nowrap', 
+        'border', 'axis', 'compact', 'rows', 'checked',
+        'for', 'start', 'hspace', 'charset', 'ismap', 'label',
+        'target', 'method', 'readonly', 'rel', 'valign', 'scope',
+        'size', 'cellspacing', 'cite', 'media', 'multiple', 'src',
+        'rules', 'nohref', 'action', 'rowspan', 'abbr', 'span', 'height',
+        'enctype', 'lang', 'disabled', 'name', 'charoff', 'clear', 'summary',
+        'value', 'longdesc', 'headers', 'vspace', 'noshade', 'coords', 'width',
+        'maxlength', 'cellpadding', 'title', 'align', 'dir', 'tabindex', 'style']
+
+# Styles to preserve in the style attribute
+keep_styles = ["color", "background", "font-weight", "font-family",
+                "font-style", "font-size"]
+
+
+# insert linebreak after regex match
+brtags = (r"(</(div|p|br|li|ul|ol|blockquote|tr|"
+            "table|thead|tfoot|tbody|h[1-9]|)>)([^\n])")
+
+cleaned = bleach.clean(html.replace("\n", ""),
+        tags = keep_tags,
+        attributes = keep_attrs,
+        styles = keep_styles,
+        strip = True
+        )
+cleaned = cleaned.replace(' style=""', '').replace("\n", "")
+cleaned = re.sub(brtags, r"\1\n\3", cleaned)
+print cleaned
\ No newline at end of file