Commit
Add custom argument parser for cell magic
Showing 4 changed files with 653 additions and 0 deletions.
google/cloud/bigquery/ipython_magics/line_arg_parser/__init__.py (21 additions & 0 deletions)
@@ -0,0 +1,21 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.ipython_magics.line_arg_parser.exceptions import ParseError
from google.cloud.bigquery.ipython_magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.ipython_magics.line_arg_parser.lexer import TokenType
from google.cloud.bigquery.ipython_magics.line_arg_parser.parser import Parser


__all__ = ("Lexer", "ParseError", "Parser", "TokenType")
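
For orientation, a minimal sketch of how these exports fit together. The Parser constructor and entry point shown here are assumptions, since parser.py is not included in this excerpt:

from google.cloud.bigquery.ipython_magics.line_arg_parser import Lexer, ParseError, Parser

line = "df --max_results 10 --params {'num': 17}"
tokens = list(Lexer(line))    # tokenize the magic's argument line
try:
    tree = Parser(tokens).input_line()  # hypothetical entry point, not shown in this diff
except ParseError as exc:
    print("Bad cell magic arguments: {}".format(exc))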
google/cloud/bigquery/ipython_magics/line_arg_parser/exceptions.py (17 additions & 0 deletions)
@@ -0,0 +1,17 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class ParseError(Exception):
    pass
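
A dedicated exception type lets callers distinguish malformed magic arguments from unrelated failures. A hedged sketch of the intended usage pattern; the raise site is an assumption, since parser.py is not part of this excerpt:

from google.cloud.bigquery.ipython_magics.line_arg_parser import ParseError
from google.cloud.bigquery.ipython_magics.line_arg_parser.lexer import Token

def expect_colon(token):
    # Hypothetical parser-style check: reject anything that is not a ":" token.
    if token.lexeme != ":":
        raise ParseError(
            "Expected ':', got {!r} at position {}".format(token.lexeme, token.pos)
        )

try:
    expect_colon(Token(type_="COMMA", lexeme=",", pos=5))
except ParseError as exc:
    # In the magic, such errors can surface as a readable message in the notebook.
    print("Could not parse the magic arguments: {}".format(exc))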
google/cloud/bigquery/ipython_magics/line_arg_parser/lexer.py (178 additions & 0 deletions)
@@ -0,0 +1,178 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import itertools
import re

import enum


Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))

token_types = OrderedDict(
    state_1=OrderedDict(
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--))",  # double dash - starting the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
    ),
    state_2=OrderedDict(
        GOTO_STATE_3=r"(?P<GOTO_STATE_3>(?=--params(?=\s|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        # NOTE: currently the only valid value for a non "--params" option is
        # either a project ID or an integer (e.g. max_results)
        OPT_VAL=r"(?P<OPT_VAL>\d+|[^_\d\W](?:\w|\.)+)",
    ),
    state_3=OrderedDict(
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'  # single and double quoted strings
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|$))",
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?(?:[eE][+-]?\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        UNKNOWN=r"(?P<UNKNOWN>\S+)",  # anything not a whitespace or matched by something else
    ),
)

# Member values are auto-generated to equal the member names, so members
# compare equal to plain strings thanks to the str mixin.
class AutoStrEnum(str, enum.Enum):
    def _generate_next_value_(name, start, count, last_values):
        return name


TokenType = AutoStrEnum(
    "TokenType",
    [
        name
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_STATE")
    ],
)


class LexerState(AutoStrEnum):
    STATE_1 = enum.auto()  # parsing positional arguments
    STATE_2 = enum.auto()  # parsing options other than "--params"
    STATE_3 = enum.auto()  # parsing the "--params" option
    STATE_END = enum.auto()

class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""

    _GRAND_PATTERNS = {
        LexerState.STATE_1: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_1"].values(), token_types["common"].values(),
                )
            )
        ),
        LexerState.STATE_2: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_2"].values(), token_types["common"].values(),
                )
            )
        ),
        LexerState.STATE_3: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_3"].values(), token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text

    def __iter__(self):
        # Since re.scanner does not seem to support manipulating inner scanner states,
        # we need to implement lexer state transitions manually using special
        # non-capturing lookahead token patterns to signal when a state transition
        # should be made.
        # Since we don't have "nested" states, we don't really need a stack and
        # this simple mechanism is sufficient.
        state = LexerState.STATE_1
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_generator = self._get_state_token_generator(state, offset)

            for maybe_token in token_generator:
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _get_state_token_generator(self, state, current_offset):
        """Return the token generator for the current state starting at ``current_offset``.

        Args:
            state (LexerState): The current lexer state.
            current_offset (int): The offset in the input text, i.e. the number
                of characters already scanned so far.

        Returns:
            A generator yielding ``Token`` and ``StateTransition`` instances.
        """
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.scanner(self._text, pos=current_offset)
        return self._scan_for_tokens(scanner)

    def _scan_for_tokens(self, scanner):
        """Yield tokens produced by the scanner or state transition objects.

        Args:
            scanner (SRE_Scanner): The text tokenizer.

        Yields:
            The next ``Token`` or ``StateTransition`` instance.
        """
        for match in iter(scanner.match, None):
            token_type = match.lastgroup

            if token_type.startswith("GOTO_STATE"):
                yield StateTransition(
                    new_state=getattr(LexerState, token_type[5:]),  # w/o "GOTO_" prefix
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
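
As an illustrative aside, iterating the lexer over a sample argument line shows the state machine at work: the GOTO_* lookahead groups only drive state transitions, whitespace tokens are filtered out in __iter__, and each Token carries the matched group name as a plain string:

from google.cloud.bigquery.ipython_magics.line_arg_parser import Lexer

line = "df --max_results 10 --params {'num': 17}"
for token in Lexer(line):
    print(token)

# Expected output:
# Token(type_='DEST_VAR', lexeme='df', pos=0)
# Token(type_='OPTION_SPEC', lexeme='--max_results', pos=3)
# Token(type_='OPT_VAL', lexeme='10', pos=17)
# Token(type_='PARAMS_OPT_SPEC', lexeme='--params', pos=20)
# Token(type_='LCURL', lexeme='{', pos=29)
# Token(type_='PY_STRING', lexeme="'num'", pos=30)
# Token(type_='COLON', lexeme=':', pos=35)
# Token(type_='PY_NUMBER', lexeme='17', pos=37)
# Token(type_='RCURL', lexeme='}', pos=39)
# Token(type_='EOL', lexeme='', pos=40)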