Merge branch 'regex' of https://github.com/julienmalard/lark into julienmalard-regex
lark-parser · Jun 30, 2020 · 1ef0e18 · 1ef0e18
2 parents 985c38e + 9e5108b
commit 1ef0e18
Show file tree

Hide file tree

Showing 6 changed files with 27 additions and 50 deletions.
diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi
@@ -107,6 +107,7 @@ class TraditionalLexer(Lexer):
     user_callbacks: Dict[str, _Callback]
     callback: Dict[str, _Callback]
     mres: List[Tuple[REPattern, Dict[int, str]]]
+    re: ModuleType
 
     def __init__(
         self,

diff --git a/lark/lexer.py b/lark/lexer.py
@@ -247,31 +247,31 @@ def _create_unless(terminals, g_regex_flags, re_):
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
 
     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
 
 
-def _build_mres(terminals, max_size, g_regex_flags, match_whole):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
     postfix = '$' if match_whole else ''
     mres = []
     while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
+            mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
 
         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
 
-def build_mres(terminals, g_regex_flags, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
+def build_mres(terminals, g_regex_flags, re_, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
 
 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -332,7 +332,7 @@ def build(self, g_regex_flags=0):
             else:
                 self.callback[type_] = f
 
-        self.mres = build_mres(terminals, g_regex_flags)
+        self.mres = build_mres(terminals, g_regex_flags, self.re)
 
     def match(self, stream, pos):
         for mre, type_from_index in self.mres:

diff --git a/tests/__main__.py b/tests/__main__.py
@@ -7,7 +7,7 @@
 from .test_tools import TestStandalone
 from .test_cache import TestCache
 from .test_reconstructor import TestReconstructor
-from .test_regex import TestRegex
+
 try:
     from .test_nearley.test_nearley import TestNearley
 except ImportError:

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -1787,6 +1787,23 @@ def test_lexer_detect_newline_tokens(self):
                 self.assertEqual(a.line, 1)
                 self.assertEqual(b.line, 2)
 
+        @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
+        def test_unicode_class(self):
+            "Tests that character classes from the `regex` module work correctly."
+            g = _Lark(r"""?start: NAME
+                           NAME: ID_START ID_CONTINUE*
+                           ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
+                           ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)
+
+            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
+
+        @unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
+        def test_unicode_word(self):
+            "Tests that a persistent bug in the `re` module works when `regex` is enabled."
+            g = _Lark(r"""?start: NAME
+                           NAME: /[\w]+/
+                        """, regex=True)
+            self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
 
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME

diff --git a/tests/test_regex.py b/tests/test_regex.py
diff --git a/tox.ini b/tox.ini
@@ -14,11 +14,7 @@ pypy3 = pypy3
 [testenv]
 whitelist_externals = git
 deps =
-    -rnearley-requirements.txt
-    -rregex-requirements.txt
-
-# For regex testing
-extras = regex
+    -rtest-requirements.txt
 
 # to always force recreation and avoid unexpected side effects
 recreate=True