-
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit a58f771
Showing
59 changed files
with
16,141 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Exclude files from deployment: | ||
.gitattributes export-ignore | ||
.gitignore export-ignore | ||
docs export-ignore | ||
/screenshots export-ignore | ||
/tools export-ignore | ||
html_cleaner/test.py export-ignore | ||
obsolete export-ignore | ||
# Adjust GitHub linguist settings: | ||
ANKIWEB.md linguist-documentation |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
*.pyo | ||
*.pyc | ||
*.sublime-project | ||
*.sublime-workspace | ||
.hidden | ||
.directory | ||
obsolete | ||
html_cleaner/forms | ||
docs/todo.md | ||
docs/description.html | ||
html-cleaner-*.zip | ||
.gitold |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
Anki Add-on: HTML Cleaner | ||
Entry point for the add-on into Anki | ||
Please don't edit this if you don't know what you're doing. | ||
Copyright: (c) Glutanimate 2017 | ||
License: GNU AGPL, version 3 or later; https://www.gnu.org/licenses/agpl.html | ||
""" | ||
|
||
import html_cleaner.main |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# builds zip file for AnkiWeb (among other things) | ||
|
||
VERSION = `git describe HEAD --tags --abbrev=0` | ||
ADDON = "html-cleaner" | ||
ADDONDIR = "html_cleaner" | ||
|
||
all: ui zip | ||
|
||
# Remove the temporary build directory and any previously built zip archives.
# "rm -f" keeps the target from failing when no archive has been built yet.
clean:
	rm -rf dist
	rm -f $(ADDON)-*.zip
|
||
# Rebuild the generated UI forms.
# The forms live inside the package directory ($(ADDONDIR), i.e. html_cleaner),
# not under the archive name prefix $(ADDON) (html-cleaner).
ui:
	rm -rf $(ADDONDIR)/forms
	./tools/build_ui.sh
|
||
# Build $(ADDON)-current.zip from the current working tree:
# stage the top-level *.py files and the $(ADDONDIR) package in a scratch
# "dist" directory (after deleting stale *.pyc files), zip its contents,
# then remove the scratch directory again.
zip:
	rm -rf dist
	mkdir -p dist
	find . -name '*.pyc' -delete
	cp *.py dist/
	cp -r $(ADDONDIR) dist/
	cd dist && zip -r ../$(ADDON)-current.zip *
	rm -rf dist
|
||
# Build $(ADDON)-release-$(VERSION).zip from the tagged revision $(VERSION)
# rather than from the working tree: "git archive" exports that revision into
# a scratch "dist" directory, from which only the $(ADDONDIR) package and the
# top-level *.py files are zipped. The scratch directory is removed afterwards.
release:
	rm -rf dist
	mkdir -p dist
	find . -name '*.pyc' -delete
	git archive --format tar $(VERSION) | tar -x -C dist/
	cd dist && \
	zip -r ../$(ADDON)-release-$(VERSION).zip $(ADDONDIR) *.py
	rm -rf dist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
## HTML Cleaner Add-on for Anki | ||
|
||
Cleans and minifies HTML content of the current field, removing extraneous tags and attributes copied over from apps like Word, Chrome, etc. | ||
|
||
## Usage | ||
|
||
The add-on comes with a button and two hotkeys: | ||
|
||
- Clicking on the *cH* button in the editor will clean the HTML code of the active field. The same functionality can also be invoked via <kbd>Alt</kbd> + <kbd>H</kbd> | ||
- Shift-clicking the button or combining the aforementioned hotkey with Shift will undo the changes to the current field. (Anki's inbuilt undo functionality does not work with the add-on. This is a limitation that can't be solved trivially, I'm afraid.) | ||
- <kbd>Alt</kbd> + <kbd>V</kbd> will clean the clipboard selection and paste the processed text into the current field | ||
|
||
## Configuration | ||
|
||
The add-on's HTML processing is highly configurable. All options can be accessed by editing the configuration section of `html_cleaner/main.py`. | ||
|
||
## License and Credits | ||
|
||
*HTML Cleaner* is *Copyright © 2016-2017 [Aristotelis P.](https://github.com/Glutanimate)* | ||
|
||
Licensed under the [GNU AGPL v3](https://www.gnu.org/licenses/agpl.html). | ||
|
||
This add-on would not have been possible without the following open-source libraries: | ||
|
||
- [Bleach](https://github.com/mozilla/bleach) 2.0.0. Copyright (c) 2014-2017 Mozilla Foundation. Licensed under the Apache License 2.0 | ||
- [html5lib](https://github.com/html5lib/) 0.999999999. Copyright (c) 2006-2013 James Graham and other contributors. Licensed under the MIT license. | ||
- [webencodings](https://github.com/gsnedders/python-webencodings) 0.5.1. Copyright (c) 2012-2017 Geoffrey Sneddon. Licensed under the BSD license. | ||
- [six](https://github.com/benjaminp/six) 1.10.0. Copyright (c) 2010-2015 Benjamin Peterson. Licensed under the MIT license |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Copyright (c) 2014-2017, Mozilla Foundation | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
Copyright (c) 2006-2013 James Graham and other contributors | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining | ||
a copy of this software and associated documentation files (the | ||
"Software"), to deal in the Software without restriction, including | ||
without limitation the rights to use, copy, modify, merge, publish, | ||
distribute, sublicense, and/or sell copies of the Software, and to | ||
permit persons to whom the Software is furnished to do so, subject to | ||
the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be | ||
included in all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | ||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | ||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | ||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Copyright (c) 2010-2015 Benjamin Peterson | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy of | ||
this software and associated documentation files (the "Software"), to deal in | ||
the Software without restriction, including without limitation the rights to | ||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||
the Software, and to permit persons to whom the Software is furnished to do so, | ||
subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Copyright (c) 2012-2017 Geoffrey Sneddon | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy of | ||
this software and associated documentation files (the "Software"), to deal in | ||
the Software without restriction, including without limitation the rights to | ||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||
the Software, and to permit persons to whom the Software is furnished to do so, | ||
subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import unicode_literals | ||
|
||
from bleach.linkifier import ( | ||
DEFAULT_CALLBACKS, | ||
Linker, | ||
LinkifyFilter, | ||
) | ||
from bleach.sanitizer import ( | ||
ALLOWED_ATTRIBUTES, | ||
ALLOWED_PROTOCOLS, | ||
ALLOWED_STYLES, | ||
ALLOWED_TAGS, | ||
BleachSanitizerFilter, | ||
Cleaner, | ||
) | ||
from bleach.version import __version__, VERSION # flake8: noqa | ||
|
||
__all__ = ['clean', 'linkify'] | ||
|
||
|
||
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Sanitize an HTML fragment and return the cleaned text.

    Convenience wrapper that builds a one-off
    :py:class:`bleach.sanitizer.Cleaner` from the given options and runs it
    on ``text``. It is meant for removing malicious content from a string
    so that it can be displayed as content in a web page; it is not meant
    for transforming content for non-web-page contexts.

    Example::

        import bleach

        better_text = bleach.clean(yucky_text)

    .. Note::

       When cleaning a lot of text with the same argument values, or when
       more configurability is needed, use a
       :py:class:`bleach.sanitizer.Cleaner` instance directly.

    :arg str text: the text to clean

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or
        dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults to
        ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not to strip disallowed elements

    :arg bool strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    options = {
        'tags': tags,
        'attributes': attributes,
        'styles': styles,
        'protocols': protocols,
        'strip': strip,
        'strip_comments': strip_comments,
    }
    return Cleaner(**options).clean(text)
|
||
|
||
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None, parse_email=False):
    """Turn URL-like strings in an HTML fragment into links.

    Convenience wrapper that builds a one-off
    :py:class:`bleach.linkifier.Linker` from the given options and runs it
    on ``text``. Strings that look like URLs, domain names and email
    addresses are converted to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    This is a best-effort operation that tries to recover from malformed
    input.

    .. Note::

       When linkifying a lot of text with the same argument values, or when
       more configurability is needed, use a
       :py:class:`bleach.linkifier.Linker` instance directly.

    .. Note::

       To clean and then linkify text, consider using
       :py:class:`bleach.linkifier.LinkifyFilter` as a filter in the clean
       pass so the HTML is not parsed twice.

    :arg str text: the text to linkify

    :arg list callbacks: list of callbacks to run when adjusting tag
        attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: list of tags whose contents should not be
        linkified; for example ``['pre']`` skips the contents of ``pre``
        tags

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    options = {
        'callbacks': callbacks,
        'skip_tags': skip_tags,
        'parse_email': parse_email,
    }
    return Linker(**options).linkify(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
"""A set of basic callbacks for bleach.linkify.""" | ||
from __future__ import unicode_literals | ||
|
||
|
||
def nofollow(attrs, new=False):
    """Linkify callback that makes sure a link carries ``rel="nofollow"``.

    :arg dict attrs: link attributes keyed by ``(namespace, name)`` tuples;
        modified in place
    :arg bool new: accepted as part of the callback signature; not used here

    :returns: the same ``attrs`` mapping. It is returned untouched when there
        is no ``href`` or the ``href`` is a ``mailto:`` link; otherwise its
        ``rel`` value is rewritten as a single-space-separated token list
        containing ``nofollow``.
    """
    link_key = (None, u'href')
    if link_key not in attrs:
        return attrs
    if attrs[link_key].startswith(u'mailto:'):
        return attrs

    rel_key = (None, u'rel')
    tokens = []
    already_marked = False
    for token in attrs.get(rel_key, u'').split(u' '):
        if not token:
            # Runs of spaces produce empty pieces; drop them.
            continue
        tokens.append(token)
        if token.lower() == u'nofollow':
            already_marked = True

    if not already_marked:
        tokens.append(u'nofollow')

    # Always written back, so existing values get whitespace-normalised too.
    attrs[rel_key] = u' '.join(tokens)
    return attrs
|
||
|
||
def target_blank(attrs, new=False):
    """Linkify callback that makes a link open in a new window/tab.

    :arg dict attrs: link attributes keyed by ``(namespace, name)`` tuples;
        modified in place
    :arg bool new: accepted as part of the callback signature; not used here

    :returns: the same ``attrs`` mapping. It is returned untouched when there
        is no ``href`` or the ``href`` is a ``mailto:`` link; otherwise
        ``target`` is set to ``_blank``.
    """
    href_key = (None, u'href')

    # Anchors without an href (e.g. <a name="...">) carry no link target;
    # guard the lookup the same way nofollow() does instead of raising
    # KeyError.
    if href_key not in attrs:
        return attrs

    if attrs[href_key].startswith(u'mailto:'):
        return attrs

    attrs[(None, u'target')] = u'_blank'
    return attrs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import datetime | ||
from decimal import Decimal | ||
import types | ||
import six | ||
|
||
|
||
def is_protected_type(obj):
    """Determine if the object instance is of a protected type.

    Objects of protected types are preserved as-is when passed to
    force_unicode(strings_only=True).

    :arg obj: any object

    :returns: True if ``obj`` is an integer, ``None``, a
        ``datetime.datetime``/``date``/``time``, a ``float`` or a
        ``Decimal``; False otherwise
    """
    # type(None) is used instead of types.NoneType: the latter is missing on
    # Python 3.0-3.9 and would raise AttributeError there, while type(None)
    # is available on every Python version.
    protected_types = six.integer_types + (
        type(None),
        datetime.datetime, datetime.date, datetime.time,
        float, Decimal,
    )
    return isinstance(obj, protected_types)
|
||
|
||
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
    """
    Coerce ``s`` to a text (unicode) string.

    Similar to smart_text, except that lazy instances are resolved to
    strings, rather than kept as lazy objects.

    If strings_only is True, don't convert (some) non-string-like objects.

    :arg s: the object to convert
    :arg str encoding: encoding used to decode byte strings
    :arg bool strings_only: when True, objects for which
        ``is_protected_type`` is true are returned unchanged
    :arg str errors: error handling scheme passed on to the decoder

    :returns: ``s`` as text, or ``s`` itself when it is already text or is
        protected by ``strings_only``

    :raises UnicodeDecodeError: if decoding fails and ``s`` is not an
        exception instance
    """
    # Handle the common case first, saves 30-40% when s is an instance of
    # six.text_type. This function gets called often in that setting.
    if isinstance(s, six.text_type):
        return s
    if strings_only and is_protected_type(s):
        return s
    try:
        if not isinstance(s, six.string_types):
            if hasattr(s, '__unicode__'):
                s = s.__unicode__()
            else:
                if six.PY3:
                    if isinstance(s, bytes):
                        s = six.text_type(s, encoding, errors)
                    else:
                        s = six.text_type(s)
                else:
                    s = six.text_type(bytes(s), encoding, errors)
        else:
            # Note: We use .decode() here, instead of six.text_type(s,
            # encoding, errors), so that if s is a SafeBytes, it ends up being
            # a SafeText at the end.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError as e:
        if not isinstance(s, Exception):
            raise UnicodeDecodeError(*e.args)
        else:
            # If we get to here, the caller has passed in an Exception
            # subclass populated with non-ASCII bytestring data without a
            # working unicode method. Try to handle this without raising a
            # further exception by individually forcing the exception args
            # to unicode.
            # Iterate s.args explicitly: exception objects themselves are
            # not iterable on Python 3, and on Python 2 iterating the
            # exception yields exactly its args.
            s = ' '.join([force_unicode(arg, encoding, strings_only,
                                        errors) for arg in s.args])
    return s
Oops, something went wrong.