Skip to content

Commit

Permalink
Merge branch 'regex' of https://github.com/julienmalard/lark into julienmalard-regex
Browse files Browse the repository at this point in the history
  • Loading branch information
erezsh committed Jun 30, 2020
2 parents 985c38e + 9e5108b commit 1ef0e18
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 50 deletions.
1 change: 1 addition & 0 deletions lark-stubs/lexer.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class TraditionalLexer(Lexer):
user_callbacks: Dict[str, _Callback]
callback: Dict[str, _Callback]
mres: List[Tuple[REPattern, Dict[int, str]]]
re: ModuleType

def __init__(
self,
Expand Down
14 changes: 7 additions & 7 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,31 +247,31 @@ def _create_unless(terminals, g_regex_flags, re_):
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))

terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(terminals, max_size, g_regex_flags, match_whole):
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries fewer and fewer groups until it's successful.
postfix = '$' if match_whole else ''
mres = []
while terminals:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)

# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres

def build_mres(terminals, g_regex_flags, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
def build_mres(terminals, g_regex_flags, re_, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)

def _regexp_has_newline(r):
r"""Expressions that may indicate newlines in a regexp:
Expand Down Expand Up @@ -332,7 +332,7 @@ def build(self, g_regex_flags=0):
else:
self.callback[type_] = f

self.mres = build_mres(terminals, g_regex_flags)
self.mres = build_mres(terminals, g_regex_flags, self.re)

def match(self, stream, pos):
for mre, type_from_index in self.mres:
Expand Down
2 changes: 1 addition & 1 deletion tests/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .test_tools import TestStandalone
from .test_cache import TestCache
from .test_reconstructor import TestReconstructor
from .test_regex import TestRegex

try:
from .test_nearley.test_nearley import TestNearley
except ImportError:
Expand Down
17 changes: 17 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1787,6 +1787,23 @@ def test_lexer_detect_newline_tokens(self):
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)

@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
def test_unicode_class(self):
"Tests that character classes from the `regex` module work correctly."
g = _Lark(r"""?start: NAME
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)

self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

@unittest.skipIf(sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
def test_unicode_word(self):
"Tests that a pattern affected by a persistent bug in the `re` module works when `regex` is enabled."
g = _Lark(r"""?start: NAME
NAME: /[\w]+/
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')

_NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
_TestParser.__name__ = _NAME
Expand Down
37 changes: 0 additions & 37 deletions tests/test_regex.py

This file was deleted.

6 changes: 1 addition & 5 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,7 @@ pypy3 = pypy3
[testenv]
whitelist_externals = git
deps =
-rnearley-requirements.txt
-rregex-requirements.txt

# For regex testing
extras = regex
-rtest-requirements.txt

# to always force recreation and avoid unexpected side effects
recreate=True
Expand Down

0 comments on commit 1ef0e18

Please sign in to comment.