Initial commit

glutanimate · May 23, 2017 · a58f771 · a58f771
commit a58f771
Show file tree

Hide file tree

Showing 59 changed files with 16,141 additions and 0 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,10 @@
+# Exclude files from deployment:
+.gitattributes export-ignore
+.gitignore export-ignore
+docs export-ignore
+/screenshots export-ignore
+/tools export-ignore
+html_cleaner/test.py export-ignore
+obsolete export-ignore
+# Adjust GitHub linguist settings:
+ANKIWEB.md linguist-documentation
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,12 @@
+*.pyo
+*.pyc
+*.sublime-project
+*.sublime-workspace
+.hidden
+.directory
+obsolete
+html_cleaner/forms
+docs/todo.md
+docs/description.html
+html-cleaner-*.zip
+.gitold
diff --git a/HTML Cleaner.py b/HTML Cleaner.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+"""
+Anki Add-on: HTML Cleaner
+
+Entry point for the add-on into Anki
+
+Please don't edit this if you don't know what you're doing.
+
+Copyright: (c) Glutanimate 2017
+License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
+"""
+
+import html_cleaner.main
diff --git a/Makefile b/Makefile
@@ -0,0 +1,33 @@
+# builds zip file for AnkiWeb (among other things)
+
+# VERSION is deliberately left as unexpanded backtick text: the shell runs
+# `git describe` only when a recipe that references $(VERSION) executes, so
+# targets that never use it do not need git or an existing tag.
+VERSION = `git describe HEAD --tags --abbrev=0`
+# Base name of the generated zip archives.
+ADDON = "html-cleaner"
+# Directory holding the add-on's Python package.
+ADDONDIR = "html_cleaner"
+
+# None of these targets produce a file of the same name; declaring them
+# phony keeps a stray file or directory called e.g. "zip" from disabling them.
+.PHONY: all clean ui zip release
+
+all: ui zip
+
+# Remove the staging directory and every generated archive. -f keeps the
+# target from failing when no archive has been built yet.
+clean:
+	rm -rf dist
+	rm -f $(ADDON)-*.zip
+
+# Rebuild the generated UI forms. They live inside the package directory
+# ($(ADDONDIR)), not under the archive base name ($(ADDON)).
+ui:
+	rm -rf $(ADDONDIR)/forms
+	./tools/build_ui.sh
+
+# Package the working tree (top-level *.py files plus $(ADDONDIR)) as
+# $(ADDON)-current.zip, staging the files in a throwaway dist/ directory.
+# The previous archive is deleted first: zip updates an existing archive in
+# place, so entries for files that no longer exist would otherwise linger.
+zip:
+	rm -rf dist
+	mkdir -p dist
+	find . -name '*.pyc' -delete
+	cp *.py dist/
+	cp -r $(ADDONDIR) dist/
+	rm -f $(ADDON)-current.zip
+	cd dist && zip -r ../$(ADDON)-current.zip *
+	rm -rf dist
+
+# Package the tag named by $(VERSION) as $(ADDON)-release-<tag>.zip, using a
+# `git archive` export unpacked into a throwaway dist/ directory.
+# Any archive already built for that tag is deleted first: zip updates an
+# existing archive in place, so stale entries would otherwise survive.
+release:
+	rm -rf dist
+	mkdir -p dist
+	find . -name '*.pyc' -delete
+	git archive --format tar $(VERSION) | tar -x -C dist/
+	rm -f $(ADDON)-release-$(VERSION).zip
+	cd dist &&  \
+		zip -r ../$(ADDON)-release-$(VERSION).zip $(ADDONDIR) *.py
+	rm -rf dist
diff --git a/README.md b/README.md
@@ -0,0 +1,28 @@
+## HTML Cleaner Add-on for Anki
+
+Cleans and minifies HTML content of the current field, removing extraneous tags and attributes copied over from apps like Word, Chrome, etc.
+
+## Usage
+
+The add-on comes with a button and two hotkeys:
+
+- Clicking on the *cH* button in the editor will clean the HTML code of the active field. The same functionality can also be invoked via <kbd>Alt</kbd> + <kbd>H</kbd>
+- Shift-clicking the button or combining the aforementioned hotkey with Shift will undo the changes to the current field. (Anki's inbuilt undo functionality does not work with the add-on. This is a limitation that can't be solved trivially, I'm afraid.)
+- <kbd>Alt</kbd> + <kbd>V</kbd> will clean the clipboard selection and paste the processed text into the current field
+
+## Configuration
+
+The add-on's HTML processing is highly configurable. All options can be accessed by editing the configuration section of `html_cleaner/main.py`.
+
+## License and Credits
+
+*HTML Cleaner* is *Copyright © 2016-2017 [Aristotelis P.](https://github.com/Glutanimate)*
+
+Licensed under the [GNU AGPL v3](https://www.gnu.org/licenses/agpl.html).
+
+This add-on would not have been possible without the following open-source libraries:
+
+- [Bleach](https://github.com/mozilla/bleach) 2.0.0. Copyright (c) 2014-2017 Mozilla Foundation. Licensed under the Apache License 2.0
+- [html5lib](https://github.com/html5lib/) 0.999999999. Copyright (c) 2006-2013 James Graham and other contributors. Licensed under the MIT license.
+- [webencodings](https://github.com/gsnedders/python-webencodings) 0.5.1. Copyright (c) 2012-2017 Geoffrey Sneddon. Licensed under the BSD license.
+- [six](https://github.com/benjaminp/six) 1.10.0. Copyright (c) 2010-2015 Benjamin Peterson. Licensed under the MIT license
diff --git a/docs/description.md b/docs/description.md
diff --git a/html_cleaner/LICENSES/LICENSE_BLEACH b/html_cleaner/LICENSES/LICENSE_BLEACH
@@ -0,0 +1,13 @@
+Copyright (c) 2014-2017, Mozilla Foundation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/html_cleaner/LICENSES/LICENSE_HTML5LIB b/html_cleaner/LICENSES/LICENSE_HTML5LIB
@@ -0,0 +1,20 @@
+Copyright (c) 2006-2013 James Graham and other contributors
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/html_cleaner/LICENSES/LICENSE_SIX b/html_cleaner/LICENSES/LICENSE_SIX
@@ -0,0 +1,18 @@
+Copyright (c) 2010-2015 Benjamin Peterson
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/html_cleaner/LICENSES/LICENSE_WEBENCODINGS b/html_cleaner/LICENSES/LICENSE_WEBENCODINGS
@@ -0,0 +1,18 @@
+Copyright (c) 2012-2017 Geoffrey Sneddon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/html_cleaner/__init__.py b/html_cleaner/__init__.py
diff --git a/html_cleaner/bleach/__init__.py b/html_cleaner/bleach/__init__.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from bleach.linkifier import (
+    DEFAULT_CALLBACKS,
+    Linker,
+    LinkifyFilter,
+)
+from bleach.sanitizer import (
+    ALLOWED_ATTRIBUTES,
+    ALLOWED_PROTOCOLS,
+    ALLOWED_STYLES,
+    ALLOWED_TAGS,
+    BleachSanitizerFilter,
+    Cleaner,
+)
+from bleach.version import __version__, VERSION # flake8: noqa
+
+__all__ = ['clean', 'linkify']
+
+
+def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
+          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
+          strip_comments=True):
+    """Clean an HTML fragment of malicious content and return it
+
+    This function is a security-focused function whose sole purpose is to
+    remove malicious content from a string such that it can be displayed as
+    content in a web page.
+
+    This function is not designed to use to transform content to be used in
+    non-web-page contexts.
+
+    Example::
+
+        import bleach
+
+        better_text = bleach.clean(yucky_text)
+
+
+    .. Note::
+
+       If you're cleaning a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.sanitizer.Cleaner` instance.
+
+    :arg str text: the text to clean
+
+    :arg list tags: allowed list of tags; defaults to
+        ``bleach.sanitizer.ALLOWED_TAGS``
+
+    :arg dict attributes: allowed attributes; can be a callable, list or dict;
+        defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
+
+    :arg list styles: allowed list of css styles; defaults to
+        ``bleach.sanitizer.ALLOWED_STYLES``
+
+    :arg list protocols: allowed list of protocols for links; defaults
+        to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
+
+    :arg bool strip: whether or not to strip disallowed elements
+
+    :arg bool strip_comments: whether or not to strip HTML comments
+
+    :returns: cleaned text as unicode
+
+    """
+    cleaner = Cleaner(
+        tags=tags,
+        attributes=attributes,
+        styles=styles,
+        protocols=protocols,
+        strip=strip,
+        strip_comments=strip_comments,
+    )
+    return cleaner.clean(text)
+
+
+def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    .. Note::
+
+       If you're linking a lot of text and passing the same argument values or
+       you want more configurability, consider using a
+       :py:class:`bleach.linkifier.Linker` instance.
+
+    .. Note::
+
+       If you have text that you want to clean and then linkify, consider using
+       the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
+       pass. That way you're not parsing the HTML twice.
+
+    :arg str text: the text to linkify
+
+    :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+        defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+    :arg list skip_tags: list of tags that you don't want to linkify the
+        contents of; for example, you could set this to ``['pre']`` to skip
+        linkifying contents of ``pre`` tags
+
+    :arg bool parse_email: whether or not to linkify email addresses
+
+    :returns: linkified text as unicode
+
+    """
+    linker = Linker(
+        callbacks=callbacks,
+        skip_tags=skip_tags,
+        parse_email=parse_email
+    )
+    return linker.linkify(text)
diff --git a/html_cleaner/bleach/callbacks.py b/html_cleaner/bleach/callbacks.py
@@ -0,0 +1,25 @@
+"""A set of basic callbacks for bleach.linkify."""
+from __future__ import unicode_literals
+
+
+def nofollow(attrs, new=False):
+    href_key = (None, u'href')
+    if href_key not in attrs or attrs[href_key].startswith(u'mailto:'):
+        return attrs
+
+    rel_key = (None, u'rel')
+    rel_values = [val for val in attrs.get(rel_key, u'').split(u' ') if val]
+    if u'nofollow' not in [rel_val.lower() for rel_val in rel_values]:
+        rel_values.append(u'nofollow')
+    attrs[rel_key] = u' '.join(rel_values)
+
+    return attrs
+
+
+def target_blank(attrs, new=False):
+    href_key = (None, u'href')
+    if attrs[href_key].startswith(u'mailto:'):
+        return attrs
+
+    attrs[(None, u'target')] = u'_blank'
+    return attrs
diff --git a/html_cleaner/bleach/encoding.py b/html_cleaner/bleach/encoding.py
@@ -0,0 +1,62 @@
+import datetime
+from decimal import Decimal
+import types
+import six
+
+
+def is_protected_type(obj):
+    """Determine if the object instance is of a protected type.
+
+    Objects of protected types are preserved as-is when passed to
+    force_unicode(strings_only=True).
+    """
+    return isinstance(obj, (
+        six.integer_types +
+        (types.NoneType,
+         datetime.datetime, datetime.date, datetime.time,
+         float, Decimal))
+    )
+
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Coerce ``s`` to a unicode string (``six.text_type``).
+
+    Similar to smart_text, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+
+    :arg s: the object to convert
+    :arg str encoding: codec used when decoding byte strings
+    :arg bool strings_only: when True, objects accepted by
+        ``is_protected_type`` are returned unchanged
+    :arg str errors: error handling scheme passed to the decoder
+    :returns: ``s`` as ``six.text_type``, or ``s`` itself when it is a
+        protected type and ``strings_only`` is True
+    :raises UnicodeDecodeError: if decoding fails and ``s`` is not an
+        ``Exception`` instance
+    """
+    # Handle the common case first, saves 30-40% when s is an instance of
+    # six.text_type. This function gets called often in that setting.
+    if isinstance(s, six.text_type):
+        return s
+    if strings_only and is_protected_type(s):
+        return s
+    try:
+        if not isinstance(s, six.string_types):
+            # Not a string type: prefer the object's own __unicode__ hook.
+            if hasattr(s, '__unicode__'):
+                s = s.__unicode__()
+            else:
+                if six.PY3:
+                    # Python 3: only bytes are decoded; everything else is
+                    # converted with six.text_type().
+                    if isinstance(s, bytes):
+                        s = six.text_type(s, encoding, errors)
+                    else:
+                        s = six.text_type(s)
+                else:
+                    # Python 2: render to a byte string, then decode it.
+                    s = six.text_type(bytes(s), encoding, errors)
+        else:
+            # Note: We use .decode() here, instead of six.text_type(s,
+            # encoding, errors), so that if s is a SafeBytes, it ends up being
+            # a SafeText at the end.
+            s = s.decode(encoding, errors)
+    except UnicodeDecodeError as e:
+        if not isinstance(s, Exception):
+            # Ordinary input: propagate the decoding failure to the caller.
+            raise UnicodeDecodeError(*e.args)
+        else:
+            # If we get to here, the caller has passed in an Exception
+            # subclass populated with non-ASCII bytestring data without a
+            # working unicode method. Try to handle this without raising a
+            # further exception by individually forcing the exception args
+            # to unicode.
+            # NOTE(review): this iterates over the exception object itself,
+            # which presumably only works where exceptions are iterable
+            # (Python 2) -- TODO confirm behaviour on Python 3.
+            s = ' '.join([force_unicode(arg, encoding, strings_only,
+                          errors) for arg in s])
+    return s