Skip to content

Commit

Permalink
Add lexer autodetection
Browse files Browse the repository at this point in the history
  • Loading branch information
mweinelt committed May 30, 2020
1 parent f157cc1 commit 895d3dc
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 1 deletion.
4 changes: 4 additions & 0 deletions pinnwand/handler/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ def post(self) -> None: # type: ignore
raise error.ValidationError()

for (lexer, raw, filename) in zip(lexers, raws, filenames):
if lexer == 'AUTO':
lexer = utility.guess_language(raw, filename)
log.debug(f"CreateAction.post: guessed language {lexer}")

if lexer not in utility.list_languages():
log.info("CreateAction.post: a file had an invalid lexer")
raise error.ValidationError()
Expand Down
1 change: 1 addition & 0 deletions pinnwand/template/part/lexer-select.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<select name="lexer">
<option value="AUTO">Autodetect</option>
{% if handler.application.configuration.preferred_lexers %}
{% for key in handler.application.configuration.preferred_lexers %}
<option value="{{ key }}"{% if selected == key %} selected="selected"{% end %}>{{ lexers[key] }}</option>
Expand Down
37 changes: 36 additions & 1 deletion pinnwand/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from base64 import b32encode
from datetime import timedelta

from pygments.lexers import get_all_lexers
from pygments.lexers import get_all_lexers, guess_lexer, guess_lexer_for_filename

from pinnwand import database

Expand All @@ -26,6 +26,41 @@ def list_languages() -> Dict[str, str]:
return dict(sorted(lexers.items(), key=lambda x: x[1]))


# Aliases produced by content-based guessing that should be reported under a
# different name: key is the guessed alias, value is the alias returned by
# guess_language() in its place.
GUESS_LANG_OVERRIDE_MAP = {
    "python2": "python",
}

def guess_language(raw: str, filename: str) -> str:
    """Guess the Pygments lexer alias for a piece of text.

    Detection first combines the filename with the text; when that yields
    nothing (or no filename was given) the text alone is analysed.

    Args:
        raw: The text to analyse. Only its first 1024 characters are used.
        filename: Name of the file the text came from. A falsy value skips
            the filename-based detection step.

    Returns:
        The first alias of the guessed lexer. ``'text'`` is returned when no
        lexer could be determined from the text alone or when the text-only
        guess is one of the excluded lexers. Text-only guesses are also
        passed through ``GUESS_LANG_OVERRIDE_MAP``.
    """
    # Shorten the raw text to 1024 characters, increases chance to get
    # matching right.
    raw = raw[:1024]

    # Guess a lexer based on filename and raw text first.
    if filename:
        try:
            return guess_lexer_for_filename(filename, raw).aliases[0]
        except (ValueError, IndexError):
            # ValueError: no lexer matched. IndexError: the matched lexer
            # declares no aliases, so there is nothing usable to return.
            # Either way fall through to the content-only guess.
            pass

    # If that didn't work guess lexer just by looking at the raw text.
    try:
        language = guess_lexer(raw).aliases[0]
    except (ValueError, IndexError):
        # If no lexer (or no alias for it) was detected, fall back to
        # plain text.
        return 'text'

    # These are odd lexers that match far too often, so exclude them.
    if language in ('mime', 'tsql'):
        return 'text'

    # Finally apply language overrides; unlisted aliases pass through as-is.
    return GUESS_LANG_OVERRIDE_MAP.get(language, language)


expiries = {"1day": timedelta(days=1), "1week": timedelta(days=7)}


Expand Down

0 comments on commit 895d3dc

Please sign in to comment.