[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+ # Match any character set and encoding
+ (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+ |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+ # Assume the rest is data
+ ,.*
+ $
+ ''',
+ re.VERBOSE)
+
+
+class Filter(base.Filter):
+ """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
+ def __init__(self,
+ source,
+ allowed_elements=allowed_elements,
+ allowed_attributes=allowed_attributes,
+ allowed_css_properties=allowed_css_properties,
+ allowed_css_keywords=allowed_css_keywords,
+ allowed_svg_properties=allowed_svg_properties,
+ allowed_protocols=allowed_protocols,
+ allowed_content_types=allowed_content_types,
+ attr_val_is_uri=attr_val_is_uri,
+ svg_attr_val_allows_ref=svg_attr_val_allows_ref,
+ svg_allow_local_href=svg_allow_local_href):
+ super(Filter, self).__init__(source)
+ self.allowed_elements = allowed_elements
+ self.allowed_attributes = allowed_attributes
+ self.allowed_css_properties = allowed_css_properties
+ self.allowed_css_keywords = allowed_css_keywords
+ self.allowed_svg_properties = allowed_svg_properties
+ self.allowed_protocols = allowed_protocols
+ self.allowed_content_types = allowed_content_types
+ self.attr_val_is_uri = attr_val_is_uri
+ self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+ self.svg_allow_local_href = svg_allow_local_href
+
+ def __iter__(self):
+ for token in base.Filter.__iter__(self):
+ token = self.sanitize_token(token)
+ if token:
+ yield token
+
+ # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
+ # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
+ # attributes are parsed, and a restricted set, # specified by
+ # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
+ # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
+ # in ALLOWED_PROTOCOLS are allowed.
+ #
+ # sanitize_html('')
+ # => <script> do_nasty_stuff() </script>
+ # sanitize_html('Click here for $100')
+ # => Click here for $100
+ def sanitize_token(self, token):
+
+ # accommodate filters which use token_type differently
+ token_type = token["type"]
+ if token_type in ("StartTag", "EndTag", "EmptyTag"):
+ name = token["name"]
+ namespace = token["namespace"]
+ if ((namespace, name) in self.allowed_elements or
+ (namespace is None and
+ (namespaces["html"], name) in self.allowed_elements)):
+ return self.allowed_token(token)
+ else:
+ return self.disallowed_token(token)
+ elif token_type == "Comment":
+ pass
+ else:
+ return token
+
+ def allowed_token(self, token):
+ if "data" in token:
+ attrs = token["data"]
+ attr_names = set(attrs.keys())
+
+ # Remove forbidden attributes
+ for to_remove in (attr_names - self.allowed_attributes):
+ del token["data"][to_remove]
+ attr_names.remove(to_remove)
+
+ # Remove attributes with disallowed URL values
+ for attr in (attr_names & self.attr_val_is_uri):
+ assert attr in attrs
+ # I don't have a clue where this regexp comes from or why it matches those
+ # characters, nor why we call unescape. I just know it's always been here.
+ # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
+ # this will do is remove *more* than it otherwise would.
+ val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
+ unescape(attrs[attr])).lower()
+ # remove replacement characters from unescaped characters
+ val_unescaped = val_unescaped.replace("\ufffd", "")
+ try:
+ uri = urlparse.urlparse(val_unescaped)
+ except ValueError:
+ uri = None
+ del attrs[attr]
+ if uri and uri.scheme:
+ if uri.scheme not in self.allowed_protocols:
+ del attrs[attr]
+ if uri.scheme == 'data':
+ m = data_content_type.match(uri.path)
+ if not m:
+ del attrs[attr]
+ elif m.group('content_type') not in self.allowed_content_types:
+ del attrs[attr]
+
+ for attr in self.svg_attr_val_allows_ref:
+ if attr in attrs:
+ attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+ ' ',
+ unescape(attrs[attr]))
+ if (token["name"] in self.svg_allow_local_href and
+ (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
+ attrs[(namespaces['xlink'], 'href')])):
+ del attrs[(namespaces['xlink'], 'href')]
+ if (None, 'style') in attrs:
+ attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
+ token["data"] = attrs
+ return token
+
+ def disallowed_token(self, token):
+ token_type = token["type"]
+ if token_type == "EndTag":
+ token["data"] = "%s>" % token["name"]
+ elif token["data"]:
+ assert token_type in ("StartTag", "EmptyTag")
+ attrs = []
+ for (ns, name), v in token["data"].items():
+ attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
+ token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
+ else:
+ token["data"] = "<%s>" % token["name"]
+ if token.get("selfClosing"):
+ token["data"] = token["data"][:-1] + "/>"
+
+ token["type"] = "Characters"
+
+ del token["name"]
+ return token
+
+ def sanitize_css(self, style):
+ # disallow urls
+ style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
+
+ # gauntlet
+ if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
+ return ''
+ if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
+ return ''
+
+ clean = []
+ for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
+ if not value:
+ continue
+ if prop.lower() in self.allowed_css_properties:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
+ 'padding']:
+ for keyword in value.split():
+ if keyword not in self.allowed_css_keywords and \
+ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
+ break
+ else:
+ clean.append(prop + ': ' + value + ';')
+ elif prop.lower() in self.allowed_svg_properties:
+ clean.append(prop + ': ' + value + ';')
+
+ return ' '.join(clean)
diff --git a/html_cleaner/html5lib/filters/whitespace.py b/html_cleaner/html5lib/filters/whitespace.py
new file mode 100644
index 0000000..8921052
--- /dev/null
+++ b/html_cleaner/html5lib/filters/whitespace.py
@@ -0,0 +1,38 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import re
+
+from . import base
+from ..constants import rcdataElements, spaceCharacters
+spaceCharacters = "".join(spaceCharacters)
+
+SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
+
+
+class Filter(base.Filter):
+
+ spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
+
+ def __iter__(self):
+ preserve = 0
+ for token in base.Filter.__iter__(self):
+ type = token["type"]
+ if type == "StartTag" \
+ and (preserve or token["name"] in self.spacePreserveElements):
+ preserve += 1
+
+ elif type == "EndTag" and preserve:
+ preserve -= 1
+
+ elif not preserve and type == "SpaceCharacters" and token["data"]:
+ # Test on token["data"] above to not introduce spaces where there were not
+ token["data"] = " "
+
+ elif not preserve and type == "Characters":
+ token["data"] = collapse_spaces(token["data"])
+
+ yield token
+
+
+def collapse_spaces(text):
+ return SPACES_REGEX.sub(' ', text)
diff --git a/html_cleaner/html5lib/html5parser.py b/html_cleaner/html5lib/html5parser.py
new file mode 100644
index 0000000..2abd63e
--- /dev/null
+++ b/html_cleaner/html5lib/html5parser.py
@@ -0,0 +1,2733 @@
+from __future__ import absolute_import, division, unicode_literals
+from six import with_metaclass, viewkeys, PY3
+
+import types
+
+try:
+ from collections import OrderedDict
+except ImportError:
+ from ordereddict import OrderedDict
+
+from . import _inputstream
+from . import _tokenizer
+
+from . import treebuilders
+from .treebuilders.base import Marker
+
+from . import _utils
+from .constants import (
+ spaceCharacters, asciiUpper2Lower,
+ specialElements, headingElements, cdataElements, rcdataElements,
+ tokenTypes, tagTokenTypes,
+ namespaces,
+ htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
+ adjustForeignAttributes as adjustForeignAttributesMap,
+ adjustMathMLAttributes, adjustSVGAttributes,
+ E,
+ ReparseException
+)
+
+
+def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
+ """Parse a string or file-like object into a tree"""
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parse(doc, **kwargs)
+
+
+def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
+ tb = treebuilders.getTreeBuilder(treebuilder)
+ p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
+ return p.parseFragment(doc, container=container, **kwargs)
+
+
+def method_decorator_metaclass(function):
+ class Decorated(type):
+ def __new__(meta, classname, bases, classDict):
+ for attributeName, attribute in classDict.items():
+ if isinstance(attribute, types.FunctionType):
+ attribute = function(attribute)
+
+ classDict[attributeName] = attribute
+ return type.__new__(meta, classname, bases, classDict)
+ return Decorated
+
+
+class HTMLParser(object):
+ """HTML parser. Generates a tree structure from a stream of (possibly
+ malformed) HTML"""
+
+ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
+ """
+ strict - raise an exception when a parse error is encountered
+
+ tree - a treebuilder class controlling the type of tree that will be
+ returned. Built in treebuilders can be accessed through
+ html5lib.treebuilders.getTreeBuilder(treeType)
+ """
+
+ # Raise an exception on the first error encountered
+ self.strict = strict
+
+ if tree is None:
+ tree = treebuilders.getTreeBuilder("etree")
+ self.tree = tree(namespaceHTMLElements)
+ self.errors = []
+
+ self.phases = dict([(name, cls(self, self.tree)) for name, cls in
+ getPhases(debug).items()])
+
+ def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
+
+ self.innerHTMLMode = innerHTML
+ self.container = container
+ self.scripting = scripting
+ self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
+ self.reset()
+
+ try:
+ self.mainLoop()
+ except ReparseException:
+ self.reset()
+ self.mainLoop()
+
+ def reset(self):
+ self.tree.reset()
+ self.firstStartTag = False
+ self.errors = []
+ self.log = [] # only used with debug mode
+ # "quirks" / "limited quirks" / "no quirks"
+ self.compatMode = "no quirks"
+
+ if self.innerHTMLMode:
+ self.innerHTML = self.container.lower()
+
+ if self.innerHTML in cdataElements:
+ self.tokenizer.state = self.tokenizer.rcdataState
+ elif self.innerHTML in rcdataElements:
+ self.tokenizer.state = self.tokenizer.rawtextState
+ elif self.innerHTML == 'plaintext':
+ self.tokenizer.state = self.tokenizer.plaintextState
+ else:
+ # state already is data state
+ # self.tokenizer.state = self.tokenizer.dataState
+ pass
+ self.phase = self.phases["beforeHtml"]
+ self.phase.insertHtmlElement()
+ self.resetInsertionMode()
+ else:
+ self.innerHTML = False # pylint:disable=redefined-variable-type
+ self.phase = self.phases["initial"]
+
+ self.lastPhase = None
+
+ self.beforeRCDataPhase = None
+
+ self.framesetOK = True
+
+ @property
+ def documentEncoding(self):
+ """The name of the character encoding
+ that was used to decode the input stream,
+ or :obj:`None` if that is not determined yet.
+
+ """
+ if not hasattr(self, 'tokenizer'):
+ return None
+ return self.tokenizer.stream.charEncoding[0].name
+
+ def isHTMLIntegrationPoint(self, element):
+ if (element.name == "annotation-xml" and
+ element.namespace == namespaces["mathml"]):
+ return ("encoding" in element.attributes and
+ element.attributes["encoding"].translate(
+ asciiUpper2Lower) in
+ ("text/html", "application/xhtml+xml"))
+ else:
+ return (element.namespace, element.name) in htmlIntegrationPointElements
+
+ def isMathMLTextIntegrationPoint(self, element):
+ return (element.namespace, element.name) in mathmlTextIntegrationPointElements
+
+ def mainLoop(self):
+ CharactersToken = tokenTypes["Characters"]
+ SpaceCharactersToken = tokenTypes["SpaceCharacters"]
+ StartTagToken = tokenTypes["StartTag"]
+ EndTagToken = tokenTypes["EndTag"]
+ CommentToken = tokenTypes["Comment"]
+ DoctypeToken = tokenTypes["Doctype"]
+ ParseErrorToken = tokenTypes["ParseError"]
+
+ for token in self.normalizedTokens():
+ prev_token = None
+ new_token = token
+ while new_token is not None:
+ prev_token = new_token
+ currentNode = self.tree.openElements[-1] if self.tree.openElements else None
+ currentNodeNamespace = currentNode.namespace if currentNode else None
+ currentNodeName = currentNode.name if currentNode else None
+
+ type = new_token["type"]
+
+ if type == ParseErrorToken:
+ self.parseError(new_token["data"], new_token.get("datavars", {}))
+ new_token = None
+ else:
+ if (len(self.tree.openElements) == 0 or
+ currentNodeNamespace == self.tree.defaultNamespace or
+ (self.isMathMLTextIntegrationPoint(currentNode) and
+ ((type == StartTagToken and
+ token["name"] not in frozenset(["mglyph", "malignmark"])) or
+ type in (CharactersToken, SpaceCharactersToken))) or
+ (currentNodeNamespace == namespaces["mathml"] and
+ currentNodeName == "annotation-xml" and
+ type == StartTagToken and
+ token["name"] == "svg") or
+ (self.isHTMLIntegrationPoint(currentNode) and
+ type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
+ phase = self.phase
+ else:
+ phase = self.phases["inForeignContent"]
+
+ if type == CharactersToken:
+ new_token = phase.processCharacters(new_token)
+ elif type == SpaceCharactersToken:
+ new_token = phase.processSpaceCharacters(new_token)
+ elif type == StartTagToken:
+ new_token = phase.processStartTag(new_token)
+ elif type == EndTagToken:
+ new_token = phase.processEndTag(new_token)
+ elif type == CommentToken:
+ new_token = phase.processComment(new_token)
+ elif type == DoctypeToken:
+ new_token = phase.processDoctype(new_token)
+
+ if (type == StartTagToken and prev_token["selfClosing"] and
+ not prev_token["selfClosingAcknowledged"]):
+ self.parseError("non-void-element-with-trailing-solidus",
+ {"name": prev_token["name"]})
+
+ # When the loop finishes it's EOF
+ reprocess = True
+ phases = []
+ while reprocess:
+ phases.append(self.phase)
+ reprocess = self.phase.processEOF()
+ if reprocess:
+ assert self.phase not in phases
+
+ def normalizedTokens(self):
+ for token in self.tokenizer:
+ yield self.normalizeToken(token)
+
+ def parse(self, stream, *args, **kwargs):
+ """Parse a HTML document into a well-formed tree
+
+ stream - a filelike object or string containing the HTML to be parsed
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+
+ scripting - treat noscript elements as if javascript was turned on
+ """
+ self._parse(stream, False, None, *args, **kwargs)
+ return self.tree.getDocument()
+
+ def parseFragment(self, stream, *args, **kwargs):
+ """Parse a HTML fragment into a well-formed tree fragment
+
+ container - name of the element we're setting the innerHTML property
+ if set to None, default to 'div'
+
+ stream - a filelike object or string containing the HTML to be parsed
+
+ The optional encoding parameter must be a string that indicates
+ the encoding. If specified, that encoding will be used,
+ regardless of any BOM or later declaration (such as in a meta
+ element)
+
+ scripting - treat noscript elements as if javascript was turned on
+ """
+ self._parse(stream, True, *args, **kwargs)
+ return self.tree.getFragment()
+
+ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
+ # XXX The idea is to make errorcode mandatory.
+ if datavars is None:
+ datavars = {}
+ self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
+ if self.strict:
+ raise ParseError(E[errorcode] % datavars)
+
+ def normalizeToken(self, token):
+ """ HTML5 specific normalizations to the token stream """
+
+ if token["type"] == tokenTypes["StartTag"]:
+ raw = token["data"]
+ token["data"] = OrderedDict(raw)
+ if len(raw) > len(token["data"]):
+ # we had some duplicated attribute, fix so first wins
+ token["data"].update(raw[::-1])
+
+ return token
+
+ def adjustMathMLAttributes(self, token):
+ adjust_attributes(token, adjustMathMLAttributes)
+
+ def adjustSVGAttributes(self, token):
+ adjust_attributes(token, adjustSVGAttributes)
+
+ def adjustForeignAttributes(self, token):
+ adjust_attributes(token, adjustForeignAttributesMap)
+
+ def reparseTokenNormal(self, token):
+ # pylint:disable=unused-argument
+ self.parser.phase()
+
+ def resetInsertionMode(self):
+ # The name of this method is mostly historical. (It's also used in the
+ # specification.)
+ last = False
+ newModes = {
+ "select": "inSelect",
+ "td": "inCell",
+ "th": "inCell",
+ "tr": "inRow",
+ "tbody": "inTableBody",
+ "thead": "inTableBody",
+ "tfoot": "inTableBody",
+ "caption": "inCaption",
+ "colgroup": "inColumnGroup",
+ "table": "inTable",
+ "head": "inBody",
+ "body": "inBody",
+ "frameset": "inFrameset",
+ "html": "beforeHead"
+ }
+ for node in self.tree.openElements[::-1]:
+ nodeName = node.name
+ new_phase = None
+ if node == self.tree.openElements[0]:
+ assert self.innerHTML
+ last = True
+ nodeName = self.innerHTML
+ # Check for conditions that should only happen in the innerHTML
+ # case
+ if nodeName in ("select", "colgroup", "head", "html"):
+ assert self.innerHTML
+
+ if not last and node.namespace != self.tree.defaultNamespace:
+ continue
+
+ if nodeName in newModes:
+ new_phase = self.phases[newModes[nodeName]]
+ break
+ elif last:
+ new_phase = self.phases["inBody"]
+ break
+
+ self.phase = new_phase
+
+ def parseRCDataRawtext(self, token, contentType):
+ """Generic RCDATA/RAWTEXT Parsing algorithm
+ contentType - RCDATA or RAWTEXT
+ """
+ assert contentType in ("RAWTEXT", "RCDATA")
+
+ self.tree.insertElement(token)
+
+ if contentType == "RAWTEXT":
+ self.tokenizer.state = self.tokenizer.rawtextState
+ else:
+ self.tokenizer.state = self.tokenizer.rcdataState
+
+ self.originalPhase = self.phase
+
+ self.phase = self.phases["text"]
+
+
+@_utils.memoize
+def getPhases(debug):
+ def log(function):
+ """Logger that records which phase processes each token"""
+ type_names = dict((value, key) for key, value in
+ tokenTypes.items())
+
+ def wrapped(self, *args, **kwargs):
+ if function.__name__.startswith("process") and len(args) > 0:
+ token = args[0]
+ try:
+ info = {"type": type_names[token['type']]}
+ except:
+ raise
+ if token['type'] in tagTokenTypes:
+ info["name"] = token['name']
+
+ self.parser.log.append((self.parser.tokenizer.state.__name__,
+ self.parser.phase.__class__.__name__,
+ self.__class__.__name__,
+ function.__name__,
+ info))
+ return function(self, *args, **kwargs)
+ else:
+ return function(self, *args, **kwargs)
+ return wrapped
+
+ def getMetaclass(use_metaclass, metaclass_func):
+ if use_metaclass:
+ return method_decorator_metaclass(metaclass_func)
+ else:
+ return type
+
+ # pylint:disable=unused-argument
+ class Phase(with_metaclass(getMetaclass(debug, log))):
+ """Base class for helper object that implements each phase of processing
+ """
+
+ def __init__(self, parser, tree):
+ self.parser = parser
+ self.tree = tree
+
+ def processEOF(self):
+ raise NotImplementedError
+
+ def processComment(self, token):
+ # For most phases the following is correct. Where it's not it will be
+ # overridden.
+ self.tree.insertComment(token, self.tree.openElements[-1])
+
+ def processDoctype(self, token):
+ self.parser.parseError("unexpected-doctype")
+
+ def processCharacters(self, token):
+ self.tree.insertText(token["data"])
+
+ def processSpaceCharacters(self, token):
+ self.tree.insertText(token["data"])
+
+ def processStartTag(self, token):
+ return self.startTagHandler[token["name"]](token)
+
+ def startTagHtml(self, token):
+ if not self.parser.firstStartTag and token["name"] == "html":
+ self.parser.parseError("non-html-root")
+ # XXX Need a check here to see if the first start tag token emitted is
+ # this token... If it's not, invoke self.parser.parseError().
+ for attr, value in token["data"].items():
+ if attr not in self.tree.openElements[0].attributes:
+ self.tree.openElements[0].attributes[attr] = value
+ self.parser.firstStartTag = False
+
+ def processEndTag(self, token):
+ return self.endTagHandler[token["name"]](token)
+
+ class InitialPhase(Phase):
+ def processSpaceCharacters(self, token):
+ pass
+
+ def processComment(self, token):
+ self.tree.insertComment(token, self.tree.document)
+
+ def processDoctype(self, token):
+ name = token["name"]
+ publicId = token["publicId"]
+ systemId = token["systemId"]
+ correct = token["correct"]
+
+ if (name != "html" or publicId is not None or
+ systemId is not None and systemId != "about:legacy-compat"):
+ self.parser.parseError("unknown-doctype")
+
+ if publicId is None:
+ publicId = ""
+
+ self.tree.insertDoctype(token)
+
+ if publicId != "":
+ publicId = publicId.translate(asciiUpper2Lower)
+
+ if (not correct or token["name"] != "html" or
+ publicId.startswith(
+ ("+//silmaril//dtd html pro v0r11 19970101//",
+ "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
+ "-//as//dtd html 3.0 aswedit + extensions//",
+ "-//ietf//dtd html 2.0 level 1//",
+ "-//ietf//dtd html 2.0 level 2//",
+ "-//ietf//dtd html 2.0 strict level 1//",
+ "-//ietf//dtd html 2.0 strict level 2//",
+ "-//ietf//dtd html 2.0 strict//",
+ "-//ietf//dtd html 2.0//",
+ "-//ietf//dtd html 2.1e//",
+ "-//ietf//dtd html 3.0//",
+ "-//ietf//dtd html 3.2 final//",
+ "-//ietf//dtd html 3.2//",
+ "-//ietf//dtd html 3//",
+ "-//ietf//dtd html level 0//",
+ "-//ietf//dtd html level 1//",
+ "-//ietf//dtd html level 2//",
+ "-//ietf//dtd html level 3//",
+ "-//ietf//dtd html strict level 0//",
+ "-//ietf//dtd html strict level 1//",
+ "-//ietf//dtd html strict level 2//",
+ "-//ietf//dtd html strict level 3//",
+ "-//ietf//dtd html strict//",
+ "-//ietf//dtd html//",
+ "-//metrius//dtd metrius presentational//",
+ "-//microsoft//dtd internet explorer 2.0 html strict//",
+ "-//microsoft//dtd internet explorer 2.0 html//",
+ "-//microsoft//dtd internet explorer 2.0 tables//",
+ "-//microsoft//dtd internet explorer 3.0 html strict//",
+ "-//microsoft//dtd internet explorer 3.0 html//",
+ "-//microsoft//dtd internet explorer 3.0 tables//",
+ "-//netscape comm. corp.//dtd html//",
+ "-//netscape comm. corp.//dtd strict html//",
+ "-//o'reilly and associates//dtd html 2.0//",
+ "-//o'reilly and associates//dtd html extended 1.0//",
+ "-//o'reilly and associates//dtd html extended relaxed 1.0//",
+ "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
+ "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
+ "-//spyglass//dtd html 2.0 extended//",
+ "-//sq//dtd html 2.0 hotmetal + extensions//",
+ "-//sun microsystems corp.//dtd hotjava html//",
+ "-//sun microsystems corp.//dtd hotjava strict html//",
+ "-//w3c//dtd html 3 1995-03-24//",
+ "-//w3c//dtd html 3.2 draft//",
+ "-//w3c//dtd html 3.2 final//",
+ "-//w3c//dtd html 3.2//",
+ "-//w3c//dtd html 3.2s draft//",
+ "-//w3c//dtd html 4.0 frameset//",
+ "-//w3c//dtd html 4.0 transitional//",
+ "-//w3c//dtd html experimental 19960712//",
+ "-//w3c//dtd html experimental 970421//",
+ "-//w3c//dtd w3 html//",
+ "-//w3o//dtd w3 html 3.0//",
+ "-//webtechs//dtd mozilla html 2.0//",
+ "-//webtechs//dtd mozilla html//")) or
+ publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
+ "-/w3c/dtd html 4.0 transitional/en",
+ "html") or
+ publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId is None or
+ systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
+ self.parser.compatMode = "quirks"
+ elif (publicId.startswith(
+ ("-//w3c//dtd xhtml 1.0 frameset//",
+ "-//w3c//dtd xhtml 1.0 transitional//")) or
+ publicId.startswith(
+ ("-//w3c//dtd html 4.01 frameset//",
+ "-//w3c//dtd html 4.01 transitional//")) and
+ systemId is not None):
+ self.parser.compatMode = "limited quirks"
+
+ self.parser.phase = self.parser.phases["beforeHtml"]
+
+ def anythingElse(self):
+ self.parser.compatMode = "quirks"
+ self.parser.phase = self.parser.phases["beforeHtml"]
+
+ def processCharacters(self, token):
+ self.parser.parseError("expected-doctype-but-got-chars")
+ self.anythingElse()
+ return token
+
+ def processStartTag(self, token):
+ self.parser.parseError("expected-doctype-but-got-start-tag",
+ {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def processEndTag(self, token):
+ self.parser.parseError("expected-doctype-but-got-end-tag",
+ {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def processEOF(self):
+ self.parser.parseError("expected-doctype-but-got-eof")
+ self.anythingElse()
+ return True
+
+ class BeforeHtmlPhase(Phase):
+ # helper methods
+ def insertHtmlElement(self):
+ self.tree.insertRoot(impliedTagToken("html", "StartTag"))
+ self.parser.phase = self.parser.phases["beforeHead"]
+
+ # other
+ def processEOF(self):
+ self.insertHtmlElement()
+ return True
+
+ def processComment(self, token):
+ self.tree.insertComment(token, self.tree.document)
+
+ def processSpaceCharacters(self, token):
+ pass
+
+ def processCharacters(self, token):
+ self.insertHtmlElement()
+ return token
+
+ def processStartTag(self, token):
+ if token["name"] == "html":
+ self.parser.firstStartTag = True
+ self.insertHtmlElement()
+ return token
+
+ def processEndTag(self, token):
+ if token["name"] not in ("head", "body", "html", "br"):
+ self.parser.parseError("unexpected-end-tag-before-html",
+ {"name": token["name"]})
+ else:
+ self.insertHtmlElement()
+ return token
+
+ class BeforeHeadPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = _utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("head", self.startTagHead)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = _utils.MethodDispatcher([
+ (("head", "body", "html", "br"), self.endTagImplyHead)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return True
+
+ def processSpaceCharacters(self, token):
+ pass
+
+ def processCharacters(self, token):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagHead(self, token):
+ self.tree.insertElement(token)
+ self.tree.headPointer = self.tree.openElements[-1]
+ self.parser.phase = self.parser.phases["inHead"]
+
+ def startTagOther(self, token):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return token
+
+ def endTagImplyHead(self, token):
+ self.startTagHead(impliedTagToken("head", "StartTag"))
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("end-tag-after-implied-root",
+ {"name": token["name"]})
+
+ class InHeadPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = _utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("title", self.startTagTitle),
+ (("noframes", "style"), self.startTagNoFramesStyle),
+ ("noscript", self.startTagNoscript),
+ ("script", self.startTagScript),
+ (("base", "basefont", "bgsound", "command", "link"),
+ self.startTagBaseLinkCommand),
+ ("meta", self.startTagMeta),
+ ("head", self.startTagHead)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = _utils.MethodDispatcher([
+ ("head", self.endTagHead),
+ (("br", "html", "body"), self.endTagHtmlBodyBr)
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ # the real thing
+ def processEOF(self):
+ self.anythingElse()
+ return True
+
+ def processCharacters(self, token):
+ self.anythingElse()
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagHead(self, token):
+ self.parser.parseError("two-heads-are-not-better-than-one")
+
+ def startTagBaseLinkCommand(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ def startTagMeta(self, token):
+ self.tree.insertElement(token)
+ self.tree.openElements.pop()
+ token["selfClosingAcknowledged"] = True
+
+ attributes = token["data"]
+ if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
+ if "charset" in attributes:
+ self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
+ elif ("content" in attributes and
+ "http-equiv" in attributes and
+ attributes["http-equiv"].lower() == "content-type"):
+ # Encoding it as UTF-8 here is a hack, as really we should pass
+ # the abstract Unicode string, and just use the
+ # ContentAttrParser on that, but using UTF-8 allows all chars
+ # to be encoded and as a ASCII-superset works.
+ data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
+ parser = _inputstream.ContentAttrParser(data)
+ codec = parser.parse()
+ self.parser.tokenizer.stream.changeEncoding(codec)
+
+ def startTagTitle(self, token):
+ self.parser.parseRCDataRawtext(token, "RCDATA")
+
+ def startTagNoFramesStyle(self, token):
+ # Need to decide whether to implement the scripting-disabled case
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
+
+ def startTagNoscript(self, token):
+ if self.parser.scripting:
+ self.parser.parseRCDataRawtext(token, "RAWTEXT")
+ else:
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inHeadNoscript"]
+
+ def startTagScript(self, token):
+ self.tree.insertElement(token)
+ self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
+ self.parser.originalPhase = self.parser.phase
+ self.parser.phase = self.parser.phases["text"]
+
+ def startTagOther(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagHead(self, token):
+ node = self.parser.tree.openElements.pop()
+ assert node.name == "head", "Expected head got %s" % node.name
+ self.parser.phase = self.parser.phases["afterHead"]
+
+ def endTagHtmlBodyBr(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def anythingElse(self):
+ self.endTagHead(impliedTagToken("head"))
+
+ class InHeadNoscriptPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = _utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
+ (("head", "noscript"), self.startTagHeadNoscript),
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = _utils.MethodDispatcher([
+ ("noscript", self.endTagNoscript),
+ ("br", self.endTagBr),
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.parser.parseError("eof-in-head-noscript")
+ self.anythingElse()
+ return True
+
+ def processComment(self, token):
+ return self.parser.phases["inHead"].processComment(token)
+
+ def processCharacters(self, token):
+ self.parser.parseError("char-in-head-noscript")
+ self.anythingElse()
+ return token
+
+ def processSpaceCharacters(self, token):
+ return self.parser.phases["inHead"].processSpaceCharacters(token)
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagBaseLinkCommand(self, token):
+ return self.parser.phases["inHead"].processStartTag(token)
+
+ def startTagHeadNoscript(self, token):
+ self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+
+ def startTagOther(self, token):
+ self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def endTagNoscript(self, token):
+ node = self.parser.tree.openElements.pop()
+ assert node.name == "noscript", "Expected noscript got %s" % node.name
+ self.parser.phase = self.parser.phases["inHead"]
+
+ def endTagBr(self, token):
+ self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
+ self.anythingElse()
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def anythingElse(self):
+ # Caller must raise parse error first!
+ self.endTagNoscript(impliedTagToken("noscript"))
+
+ class AfterHeadPhase(Phase):
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ self.startTagHandler = _utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ ("body", self.startTagBody),
+ ("frameset", self.startTagFrameset),
+ (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
+ "style", "title"),
+ self.startTagFromHead),
+ ("head", self.startTagHead)
+ ])
+ self.startTagHandler.default = self.startTagOther
+ self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
+ self.endTagHtmlBodyBr)])
+ self.endTagHandler.default = self.endTagOther
+
+ def processEOF(self):
+ self.anythingElse()
+ return True
+
+ def processCharacters(self, token):
+ self.anythingElse()
+ return token
+
+ def startTagHtml(self, token):
+ return self.parser.phases["inBody"].processStartTag(token)
+
+ def startTagBody(self, token):
+ self.parser.framesetOK = False
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inBody"]
+
+ def startTagFrameset(self, token):
+ self.tree.insertElement(token)
+ self.parser.phase = self.parser.phases["inFrameset"]
+
+ def startTagFromHead(self, token):
+ self.parser.parseError("unexpected-start-tag-out-of-my-head",
+ {"name": token["name"]})
+ self.tree.openElements.append(self.tree.headPointer)
+ self.parser.phases["inHead"].processStartTag(token)
+ for node in self.tree.openElements[::-1]:
+ if node.name == "head":
+ self.tree.openElements.remove(node)
+ break
+
+ def startTagHead(self, token):
+ self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+
+ def startTagOther(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagHtmlBodyBr(self, token):
+ self.anythingElse()
+ return token
+
+ def endTagOther(self, token):
+ self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
+
+ def anythingElse(self):
+ self.tree.insertElement(impliedTagToken("body", "StartTag"))
+ self.parser.phase = self.parser.phases["inBody"]
+ self.parser.framesetOK = True
+
+ class InBodyPhase(Phase):
+ # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
+ # the really-really-really-very crazy mode
+ def __init__(self, parser, tree):
+ Phase.__init__(self, parser, tree)
+
+ # Set this to the default handler
+ self.processSpaceCharacters = self.processSpaceCharactersNonPre
+
+ self.startTagHandler = _utils.MethodDispatcher([
+ ("html", self.startTagHtml),
+ (("base", "basefont", "bgsound", "command", "link", "meta",
+ "script", "style", "title"),
+ self.startTagProcessInHead),
+ ("body", self.startTagBody),
+ ("frameset", self.startTagFrameset),
+ (("address", "article", "aside", "blockquote", "center", "details",
+ "dir", "div", "dl", "fieldset", "figcaption", "figure",
+ "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
+ "section", "summary", "ul"),
+ self.startTagCloseP),
+ (headingElements, self.startTagHeading),
+ (("pre", "listing"), self.startTagPreListing),
+ ("form", self.startTagForm),
+ (("li", "dd", "dt"), self.startTagListItem),
+ ("plaintext", self.startTagPlaintext),
+ ("a", self.startTagA),
+ (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
+ "strong", "tt", "u"), self.startTagFormatting),
+ ("nobr", self.startTagNobr),
+ ("button", self.startTagButton),
+ (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
+ ("xmp", self.startTagXmp),
+ ("table", self.startTagTable),
+ (("area", "br", "embed", "img", "keygen", "wbr"),
+ self.startTagVoidFormatting),
+ (("param", "source", "track"), self.startTagParamSource),
+ ("input", self.startTagInput),
+ ("hr", self.startTagHr),
+ ("image", self.startTagImage),
+ ("isindex", self.startTagIsIndex),
+ ("textarea", self.startTagTextarea),
+ ("iframe", self.startTagIFrame),
+ ("noscript", self.startTagNoscript),
+ (("noembed", "noframes"), self.startTagRawtext),
+ ("select", self.startTagSelect),
+ (("rp", "rt"), self.startTagRpRt),
+ (("option", "optgroup"), self.startTagOpt),
+ (("math"), self.startTagMath),
+ (("svg"), self.startTagSvg),
+ (("caption", "col", "colgroup", "frame", "head",
+ "tbody", "td", "tfoot", "th", "thead",
+ "tr"), self.startTagMisplaced)
+ ])
+ self.startTagHandler.default = self.startTagOther
+
+ self.endTagHandler = _utils.MethodDispatcher([
+ ("body", self.endTagBody),
+ ("html", self.endTagHtml),
+ (("address", "article", "aside", "blockquote", "button", "center",
+ "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
+ "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
+ "section", "summary", "ul"), self.endTagBlock),
+ ("form", self.endTagForm),
+ ("p", self.endTagP),
+ (("dd", "dt", "li"), self.endTagListItem),
+ (headingElements, self.endTagHeading),
+ (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
+ "strike", "strong", "tt", "u"), self.endTagFormatting),
+ (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
+ ("br", self.endTagBr),
+ ])
+ self.endTagHandler.default = self.endTagOther
+
+ def isMatchingFormattingElement(self, node1, node2):
+ return (node1.name == node2.name and
+ node1.namespace == node2.namespace and
+ node1.attributes == node2.attributes)
+
+ # helper
+ def addFormattingElement(self, token):
+ self.tree.insertElement(token)
+ element = self.tree.openElements[-1]
+
+ matchingElements = []
+ for node in self.tree.activeFormattingElements[::-1]:
+ if node is Marker:
+ break
+ elif self.isMatchingFormattingElement(node, element):
+ matchingElements.append(node)
+
+ assert len(matchingElements) <= 3
+ if len(matchingElements) == 3:
+ self.tree.activeFormattingElements.remove(matchingElements[-1])
+ self.tree.activeFormattingElements.append(element)
+
+ # the real deal
+ def processEOF(self):
+ allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
+ "tfoot", "th", "thead", "tr", "body",
+ "html"))
+ for node in self.tree.openElements[::-1]:
+ if node.name not in allowed_elements:
+ self.parser.parseError("expected-closing-tag-but-got-eof")
+ break
+ # Stop parsing
+
+ def processSpaceCharactersDropNewline(self, token):
+ # Sometimes (start of , , and