Skip to content

Commit

Permalink
Fix syntax detection screwed up by wrong URLs (fixes #28)
Browse files Browse the repository at this point in the history
Before this commit, if you had two underscores in a URL the whole message was
marked as markdown. This caused issues because Telegram clients refuse to mark
links as links if they contain syntax in them.

This commit strips URLs and email addresses from the messages before checking
if they're markdown or not, so this bug is fixed. Also, there are new tests to
prevent this issue from happening in the future.
  • Loading branch information
Pietro Albini committed Feb 23, 2016
1 parent 68c3226 commit 4820507
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 2 deletions.
7 changes: 7 additions & 0 deletions botogram/syntaxes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

import re

from . import utils


_markdown_re = re.compile(r".*("
r"\*(.*)\*|"
Expand All @@ -30,11 +32,16 @@

def is_markdown(message):
    """Tell whether a string looks like markdown"""
    # URLs and email addresses are removed first, so characters inside them
    # (underscores, for example) can't be mistaken for Markdown syntax
    without_links = utils.strip_urls(message)

    if _markdown_re.match(without_links):
        return True
    return False


def is_html(message):
    """Tell whether a string looks like HTML"""
    # URLs are deliberately left in place here: no sane URL contains HTML
    # tags, so stripping them isn't worth the speed penalty for a few cases
    found = _html_re.match(message)
    return found is not None


Expand Down
14 changes: 12 additions & 2 deletions botogram/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@
import logbook
import functools

# An "@" followed by at least five characters among letters, digits and "_"
_username_re = re.compile(r"\@([a-zA-Z0-9_]{5}[a-zA-Z0-9_]*)")
# A whole "/command" token, optionally followed by an "@username" suffix
_command_re = re.compile(r"^\/[a-zA-Z0-9_]+(\@[a-zA-Z0-9_]{5}[a-zA-Z0-9_]*)?$")
_email_re = re.compile(r"[a-zA-Z0-9_\.\+\-]+\@[a-zA-Z0-9_\.\-]+\.[a-zA-Z]+")

# URLs regex based on the one created by http://twitter.com/imme_emosol
# Compared to that regex, hyphens are accepted inside the host, and a query
# string or fragment placed right after the host is part of the match too.
# Without this, "https://my-site.com/a_b" was matched only up to "https://my",
# leaving the rest of the URL in the text after the substitution.
_url_re = re.compile(r"https?://(-\.)?([^\s/?\.#]+\.?)+([/?#][^\s]*)?")

# This small piece of global state will track if logbook was configured
_logger_configured = False
Expand Down Expand Up @@ -139,15 +142,22 @@ def docstring_of(func, bot=None, component_id=None):
return format_docstr(docstring)


def strip_urls(string):
    """Remove every URL and email address from a string"""
    # The order matters only in that it mirrors the original: URLs are
    # dropped first, then whatever email addresses are left
    for pattern in (_url_re, _email_re):
        string = pattern.sub("", string)
    return string


def usernames_in(message):
"""Return all the matched usernames in the message"""
# Don't parse usernames in the commands
if _command_re.match(message.split(" ", 1)[0]):
message = message.split(" ", 1)[1]

# Strip email addresses from the message, in order to avoid matching the
# user's domain. This also happens to match username/passwords in URLs
message = _email_re.sub("", message)
# user's domain. Also strip URLs, in order to avoid usernames in them.
message = strip_urls(message)

results = []
for result in _username_re.finditer(message):
Expand Down
11 changes: 11 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,17 @@ botogram 0.2

No changes yet.

.. _changelog-0.1.2:

botogram 0.1.2
==============

*Bugfix release, not yet released*

* Fix automatic syntax detector recognizing markdown in URLs (`issue 28`_)

.. _issue 28: https://github.com/pietroalbini/botogram/issues/28

.. _changelog-0.1.1:

botogram 0.1.1
Expand Down
3 changes: 3 additions & 0 deletions tests/test_syntaxes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def test_is_markdown():
assert botogram.syntaxes.is_markdown("[a](b)")
assert botogram.syntaxes.is_markdown("![a](b)!")

assert not botogram.syntaxes.is_markdown("hey@this_is_awesome.com")
assert not botogram.syntaxes.is_markdown("https://www.this_is_awesome.com")


def test_is_html():
assert not botogram.syntaxes.is_html("not HTML, sorry!")
Expand Down

0 comments on commit 4820507

Please sign in to comment.