Fix syntax detection screwed up by wrong URLs (fixes #28)

Before this commit, if you had two underscores in a URL, the whole message was marked as markdown. This caused issues because Telegram clients refuse to render links as links if they contain markdown syntax. This commit strips URLs and email addresses from messages before checking whether they're markdown, which fixes the bug. It also adds new tests to prevent this issue from happening again in the future.
python-botogram · Feb 23, 2016 · 4820507 · 4820507
1 parent 68c3226
commit 4820507
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 2 deletions.
diff --git a/botogram/syntaxes.py b/botogram/syntaxes.py
@@ -8,6 +8,8 @@
 
 import re
 
+from . import utils
+
 
 _markdown_re = re.compile(r".*("
                           r"\*(.*)\*|"
@@ -30,11 +32,16 @@
 
 def is_markdown(message):
     """Check if a string is actually markdown"""
+    # Don't mark part of URLs or email addresses as Markdown
+    message = utils.strip_urls(message)
+
     return bool(_markdown_re.match(message))
 
 
 def is_html(message):
     """Check if a string is actually HTML"""
+    # Here URLs are not stripped because no sane URL contains HTML tags in it,
+    # and for the few cases that do, the speed penalty is not worth it
     return bool(_html_re.match(message))
 
 

diff --git a/botogram/utils.py b/botogram/utils.py
@@ -17,9 +17,12 @@
 import logbook
 import functools
 
+# URLs regex created by http://twitter.com/imme_emosol
+
 _username_re = re.compile(r"\@([a-zA-Z0-9_]{5}[a-zA-Z0-9_]*)")
 _command_re = re.compile(r"^\/[a-zA-Z0-9_]+(\@[a-zA-Z0-9_]{5}[a-zA-Z0-9_]*)?$")
 _email_re = re.compile(r"[a-zA-Z0-9_\.\+\-]+\@[a-zA-Z0-9_\.\-]+\.[a-zA-Z]+")
+_url_re = re.compile(r"https?://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?")
 
 # This small piece of global state will track if logbook was configured
 _logger_configured = False
@@ -139,15 +142,22 @@ def docstring_of(func, bot=None, component_id=None):
     return format_docstr(docstring)
 
 
+def strip_urls(string):
+    """Strip URLs and emails from a string"""
+    string = _url_re.sub("", string)
+    string = _email_re.sub("", string)
+    return string
+
+
 def usernames_in(message):
     """Return all the matched usernames in the message"""
     # Don't parse usernames in the commands
     if _command_re.match(message.split(" ", 1)[0]):
         message = message.split(" ", 1)[1]
 
     # Strip email addresses from the message, in order to avoid matching the
-    # user's domain. This also happens to match username/passwords in URLs
-    message = _email_re.sub("", message)
+    # user's domain. Also strip URLs, in order to avoid usernames in them.
+    message = strip_urls(message)
 
     results = []
     for result in _username_re.finditer(message):

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -18,6 +18,17 @@ botogram 0.2
 
 No changes yet.
 
+.. _changelog-0.1.2:
+
+botogram 0.1.2
+==============
+
+*Bugfix release, not yet released*
+
+* Fix automatic syntax detector recognizing markdown in URLs (`issue 28`_)
+
+.. _issue 28: https://github.com/pietroalbini/botogram/issues/28
+
 .. _changelog-0.1.1:
 
 botogram 0.1.1

diff --git a/tests/test_syntaxes.py b/tests/test_syntaxes.py
@@ -21,6 +21,9 @@ def test_is_markdown():
     assert botogram.syntaxes.is_markdown("[a](b)")
     assert botogram.syntaxes.is_markdown("![a](b)!")
 
+    assert not botogram.syntaxes.is_markdown("hey@this_is_awesome.com")
+    assert not botogram.syntaxes.is_markdown("https://www.this_is_awesome.com")
+
 
 def test_is_html():
     assert not botogram.syntaxes.is_html("not HTML, sorry!")