Support create function statements #296

Merged 6 commits on Nov 1, 2022
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -8,7 +8,10 @@ All notable changes to this project will be documented in this file.

- sqlfmt now supports `delete` statements and the associated keywords `using` and `returning` ([#281](https://github.com/tconbeer/sqlfmt/issues/281))
- sqlfmt now supports `grant` and `revoke` statements and all associated keywords ([#283](https://github.com/tconbeer/sqlfmt/issues/283))
- sqlfmt now supports `create function` statements and all associated keywords ([#282](https://github.com/tconbeer/sqlfmt/issues/282))
- sqlfmt now supports the `explain` keyword ([#280](https://github.com/tconbeer/sqlfmt/issues/280))
- sqlfmt now supports BigQuery typed table and struct definitions and literals, like `table<a int64, b bytes(5), c string>`
- sqlfmt now supports variables like `$foo` as ordinary identifiers

### Features

2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ sqlfmt is not configurable, except for line length. It enforces a single style.

sqlfmt is not a linter. It does not parse your code into an AST; it just lexes it and tracks a small subset of tokens that impact formatting. This lets us "do one thing and do it well:" sqlfmt is very fast, and easier to maintain and extend than linters that need a full SQL grammar.

-For now, sqlfmt only works on `select` statements (which is all you need if you use sqlfmt with a dbt project). In the future, it will be extended to DDL statements, as well.
+For now, sqlfmt only works on `select`, `delete`, `grant`, `revoke`, and `create function` statements (which is all you need if you use sqlfmt with a dbt project). It is being extended to support additional DDL and DML statements. Visit [this tracking issue](https://github.com/tconbeer/sqlfmt/issues/262) for more information.

## Documentation

99 changes: 84 additions & 15 deletions src/sqlfmt/actions.py
@@ -33,7 +33,7 @@ def raise_sqlfmt_bracket_error(
    raw_token = source_string[spos:epos]
    raise SqlfmtBracketError(
        f"Encountered closing bracket '{raw_token}' at position"
-        f" {spos}, before matching opening bracket:"
+        f" {spos}, before matching opening bracket. Context:"
        f" {source_string[spos:spos+50]}"
    )

@@ -71,18 +71,9 @@ def safe_add_node_to_buffer(
    Then create a Node from that token and append it to the Analyzer's buffer
    """
    try:
-        token = Token.from_match(source_string, match, token_type)
-        node = analyzer.node_manager.create_node(
-            token=token, previous_node=analyzer.previous_node
-        )
+        add_node_to_buffer(analyzer, source_string, match, token_type)
    except SqlfmtBracketError:
-        token = Token.from_match(source_string, match, fallback_token_type)
-        node = analyzer.node_manager.create_node(
-            token=token, previous_node=analyzer.previous_node
-        )
-    finally:
-        analyzer.node_buffer.append(node)
-        analyzer.pos = token.epos
+        add_node_to_buffer(analyzer, source_string, match, fallback_token_type)


def add_comment_to_buffer(
@@ -174,6 +165,80 @@ def handle_semicolon(
    )


+def handle_ddl_as(
+    analyzer: "Analyzer",
+    source_string: str,
+    match: re.Match,
+) -> None:
+    """
+    When we hit "as" in a create function or table statement,
+    the following syntax should be parsed using the main (select) rules,
+    unless the next token is a quoted name.
+    """
+    add_node_to_buffer(
+        analyzer=analyzer,
+        source_string=source_string,
+        match=match,
+        token_type=TokenType.UNTERM_KEYWORD,
+    )
+
+    quoted_name_rule = analyzer.get_rule("quoted_name")
+    comment_rule = analyzer.get_rule("comment")
+
+    quoted_name_pattern = rf"({comment_rule.pattern}|\s)*" + quoted_name_rule.pattern
+    quoted_name_match = re.match(
+        quoted_name_pattern, source_string[analyzer.pos :], re.IGNORECASE | re.DOTALL
+    )
+
+    if not quoted_name_match:
+        assert analyzer.rule_stack, (
+            "Internal Error! Open an issue. Could not parse DDL 'as' "
+            f"at pos {analyzer.pos}. Context: "
+            f"{source_string[analyzer.pos :analyzer.pos+50]}"
+        )
+        analyzer.pop_rules()
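
To make the lookahead concrete: handle_ddl_as composes the analyzer's comment and quoted_name patterns into a single anchored regex and peeks at the upcoming source. A minimal sketch of that check, using simplified stand-in patterns (the real rules are considerably more permissive):

import re

# Simplified stand-ins for the analyzer's "comment" and "quoted_name"
# rule patterns; assumed shapes, for illustration only.
comment_pattern = r"--[^\n]*"
quoted_name_pattern = r"'[^']*'"

# The same composition handle_ddl_as builds: any run of comments and
# whitespace, then a quoted name, anchored at the current position.
lookahead = re.compile(
    rf"({comment_pattern}|\s)*" + quoted_name_pattern,
    re.IGNORECASE | re.DOTALL,
)

# A quoted function body follows: keep the current (ddl) ruleset active.
assert lookahead.match("\n-- body follows\n'select 1'") is not None
# A bare select body follows: no match, so we pop back to the main rules.
assert lookahead.match("\nselect 1") is None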


+def handle_closing_angle_bracket(
+    analyzer: "Analyzer",
+    source_string: str,
+    match: re.Match,
+) -> None:
+    """
+    When we hit ">", it could be a closing bracket, the ">" operator,
+    or the first character of another operator, like ">>". We first
+    assume it's a closing bracket; if that raises a lexing error, we
+    match the source again against the operator rule, to get the
+    whole operator token.
+    """
+    try:
+        add_node_to_buffer(
+            analyzer=analyzer,
+            source_string=source_string,
+            match=match,
+            token_type=TokenType.BRACKET_CLOSE,
+        )
+    except SqlfmtBracketError:
+        operator_rule = analyzer.get_rule("operator")
+        operator_pattern = re.compile(
+            r"\s*" + operator_rule.pattern,
+            re.IGNORECASE | re.DOTALL,
+        )
+        operator_match = operator_pattern.match(source_string, analyzer.pos)
+
+        assert operator_match, (
+            "Internal Error! Open an issue. Could not parse closing bracket '>' "
+            f"at pos {analyzer.pos}. Context: "
+            f"{source_string[analyzer.pos :analyzer.pos+10]}"
+        )
+        add_node_to_buffer(
+            analyzer=analyzer,
+            source_string=source_string,
+            match=operator_match,
+            token_type=TokenType.OPERATOR,
+        )
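
Roughly what the fallback re-match recovers, as a sketch (operator_pattern below is a hypothetical, heavily trimmed version of the real "operator" rule, which matches many more tokens):

import re

# Longer alternatives must come first so ">>" wins over a bare ">".
operator_pattern = re.compile(r"\s*(>>|>=|>)", re.DOTALL)

source = "select a >> 2"
pos = 9  # the lexer is sitting on the first ">"

# Treating ">" as BRACKET_CLOSE raises SqlfmtBracketError (no matching
# "<" on the bracket stack), so we re-match with the operator rule and
# consume the whole ">>" in a single OPERATOR token.
m = operator_pattern.match(source, pos)
assert m is not None and m.group(1) == ">>"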


def handle_set_operator(
    analyzer: "Analyzer", source_string: str, match: re.Match
) -> None:
@@ -268,8 +333,7 @@ def lex_ruleset(
"""
Makes a nested call to analyzer.lex, with the new ruleset activated.
"""
rules = sorted(new_ruleset, key=lambda rule: rule.priority)
analyzer.push_rules(rules)
analyzer.push_rules(new_ruleset)
try:
analyzer.lex(source_string)
except stop_exception:
@@ -372,7 +436,12 @@ def simplify_regex(pattern: str) -> str:
    # using the ruleset that was active before jinja
    next_tag_pos = next_tag_match.span(0)[0]
    jinja_rules = analyzer.pop_rules()
-    analyzer.lex(source_string, eof_pos=next_tag_pos)
+    analyzer.stops.append(next_tag_pos)
+    try:
+        analyzer.lex(source_string)
+    except StopIteration:
+        analyzer.stops.pop()

    analyzer.push_rules(jinja_rules)
    # it is possible for the next_tag_match found above to have already been lexed.
    # but if it hasn't, we need to process it
37 changes: 25 additions & 12 deletions src/sqlfmt/analyzer.py
@@ -27,6 +27,7 @@ class Analyzer:
    comment_buffer: List[Comment] = field(default_factory=list)
    line_buffer: List[Line] = field(default_factory=list)
    rule_stack: List[List[Rule]] = field(default_factory=list)
+    stops: List[int] = field(default_factory=list)
    pos: int = 0

    @property
@@ -100,7 +101,7 @@ def parse_query(self, source_string: str) -> Query:

    def push_rules(self, new_rules: List[Rule]) -> None:
        self.rule_stack.append(self.rules.copy())
-        self.rules = new_rules
+        self.rules = sorted(new_rules, key=lambda rule: rule.priority)

    def pop_rules(self) -> List[Rule]:
        old_rules = self.rules
@@ -117,6 +118,28 @@ def get_rule(self, rule_name: str) -> Rule:
        except StopIteration:
            raise ValueError(f"No rule '{rule_name}'")

+    def lex_one(self, source_string: str) -> None:
+        """
+        Try each Rule against the source_string (at self.pos) and
+        apply the action of the first rule that matches.
+
+        Mutates the analyzer's buffers and pos
+        """
+        if self.stops and self.pos >= self.stops[-1]:
+            raise StopIteration
+
+        for rule in self.rules:
+            match = rule.program.match(source_string, self.pos)
+            if match:
+                rule.action(self, source_string, match)
+                return
+        # nothing matched. Either whitespace or an error
+        else:
+            raise SqlfmtParsingError(
+                f"Could not parse SQL at position {self.pos}:"
+                f" '{source_string[self.pos:self.pos+50].strip()}'"
+            )

    def lex(self, source_string: str, eof_pos: int = -1) -> None:
        """
        Repeatedly match Rules to the source_string (until the source_string is
@@ -133,17 +156,7 @@ def lex(self, source_string: str, eof_pos: int = -1) -> None:
        last_loop_pos = -1
        while self.pos < eof_pos and self.pos > last_loop_pos:
            last_loop_pos = self.pos
-            for rule in self.rules:
-                match = rule.program.match(source_string, self.pos)
-                if match:
-                    rule.action(self, source_string, match)
-                    break
-            # nothing matched. Either whitespace or an error
-            else:
-                raise SqlfmtParsingError(
-                    f"Could not parse SQL at position {self.pos}:"
-                    f" '{source_string[self.pos:self.pos+50].strip()}'"
-                )
+            self.lex_one(source_string)

    def search_for_terminating_token(
        self,
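The lex/lex_one split and the new stops list are easier to see in isolation. Below is a minimal, self-contained sketch of that contract (MiniRule and MiniLexer are hypothetical stand-ins; the real Rule also carries a name and a priority, which push_rules now uses for sorting):

import re
from dataclasses import dataclass, field
from typing import Callable, List

@dataclass
class MiniRule:
    program: re.Pattern
    action: Callable[["MiniLexer", str, re.Match], None]

@dataclass
class MiniLexer:
    rules: List[MiniRule]
    pos: int = 0
    stops: List[int] = field(default_factory=list)
    tokens: List[str] = field(default_factory=list)

    def lex_one(self, source: str) -> None:
        # A pushed stop position ends a nested lex early via StopIteration,
        # which is the contract the jinja handler in actions.py relies on.
        if self.stops and self.pos >= self.stops[-1]:
            raise StopIteration
        for rule in self.rules:
            match = rule.program.match(source, self.pos)
            if match:
                rule.action(self, source, match)
                return
        raise ValueError(f"could not lex at position {self.pos}")

def take_word(lexer: MiniLexer, source: str, match: re.Match) -> None:
    lexer.tokens.append(match.group(0).strip())
    lexer.pos = match.end()

source = "one two three"
lexer = MiniLexer(rules=[MiniRule(re.compile(r"\s*\w+"), take_word)])
lexer.stops.append(7)  # stop where " three" begins
try:
    while lexer.pos < len(source):
        lexer.lex_one(source)
except StopIteration:
    lexer.stops.pop()
assert lexer.tokens == ["one", "two"]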
6 changes: 3 additions & 3 deletions src/sqlfmt/line.py
@@ -204,9 +204,9 @@ def starts_with_jinja_statement(self) -> bool:
            return False

    @property
-    def starts_with_square_bracket_operator(self) -> bool:
+    def starts_with_bracket_operator(self) -> bool:
        try:
-            return self.nodes[0].is_square_bracket_operator
+            return self.nodes[0].is_bracket_operator
        except IndexError:
            return False

@@ -237,7 +237,7 @@ def is_standalone_jinja_statement(self) -> bool:
    @property
    def is_standalone_operator(self) -> bool:
        return self._is_standalone_if(
-            self.starts_with_operator and not self.starts_with_square_bracket_operator
+            self.starts_with_operator and not self.starts_with_bracket_operator
        )

    @property
21 changes: 16 additions & 5 deletions src/sqlfmt/node.py
@@ -133,23 +133,34 @@ def is_opening_bracket(self) -> bool:
        )

    @property
-    def is_square_bracket_operator(self) -> bool:
+    def is_bracket_operator(self) -> bool:
        """
        Node is an opening square bracket ("[")
-        that follows a token that could be a name
+        that follows a token that could be a name.
+
+        Alternatively, node is an open paren ("(")
+        that follows a closing angle bracket.
        """
-        if self.token.type != TokenType.BRACKET_OPEN or self.value != "[":
+        if self.token.type != TokenType.BRACKET_OPEN:
            return False

        prev_token, _ = get_previous_token(self.previous_node)
        if not prev_token:
            return False
-        else:
+        elif self.value == "[":
            return prev_token.type in (
                TokenType.NAME,
                TokenType.QUOTED_NAME,
                TokenType.BRACKET_CLOSE,
            )
+        # BQ struct literals have parens that follow closing angle
+        # brackets
+        else:
+            return (
+                self.value == "("
+                and prev_token.type == TokenType.BRACKET_CLOSE
+                and ">" in prev_token.token
+            )

    @property
    def is_closing_bracket(self) -> bool:
@@ -205,7 +216,7 @@ def is_operator(self) -> bool:
                TokenType.SEMICOLON,
            )
            or self.is_multiplication_star
-            or self.is_square_bracket_operator
+            or self.is_bracket_operator
        )

    @property
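For context on the new paren branch: BigQuery typed struct literals put a parenthesized value list immediately after the type's closing angle bracket, as in this illustrative SQL (not taken from this diff):

    select struct<a int64, b string>(1, 'abc')

When the lexer reaches the "(", the previous token is the BRACKET_CLOSE ">" that ends struct<...> and contains ">", so is_bracket_operator returns True; combined with the line.py change above, that keeps the "(" from being treated as a standalone operator when lines are split.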
12 changes: 12 additions & 0 deletions src/sqlfmt/node_manager.py
@@ -96,6 +96,9 @@ def raise_on_mismatched_bracket(self, token: Token, last_bracket: Node) -> None:
"(": ")",
"[": "]",
"case": "end",
"array<": ">",
"table<": ">",
"struct<": ">",
}
if (
last_bracket.token.type
@@ -155,6 +158,14 @@ def whitespace(
            and previous_token.type == TokenType.COLON
        ):
            return NO_SPACE
+        # open brackets that contain `<` are bq type definitions
+        # like `array<` in `array<int64>` and require a space,
+        # unless the preceding token is also an open bracket
+        elif token.type == TokenType.BRACKET_OPEN and "<" in token.token:
+            if previous_token and previous_token.type != TokenType.BRACKET_OPEN:
+                return SPACE
+            else:
+                return NO_SPACE
        # open brackets that follow names are function calls or array indexes.
        # open brackets that follow closing brackets are array indexes.
        # open brackets that follow open brackets are just nested brackets.
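The spacing outcomes this branch encodes, sketched as a tiny predicate with worked examples (token shapes assumed; the real code returns the module's SPACE/NO_SPACE constants):

from typing import Optional

def typed_bracket_needs_leading_space(token_text: str, prev_type: Optional[str]) -> bool:
    # Mirrors the new elif branch: only called for BRACKET_OPEN tokens
    # whose text contains "<", e.g. "array<", "table<", "struct<".
    return prev_type is not None and prev_type != "BRACKET_OPEN"

# "cast(a as array<int64>)": "array<" follows the word operator "as",
# so it takes a leading space.
assert typed_bracket_needs_leading_space("array<", "WORD_OPERATOR")
# "array<struct<a int64>>": the inner "struct<" directly follows the
# open bracket "array<", so it gets no leading space.
assert not typed_bracket_needs_leading_space("struct<", "BRACKET_OPEN")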
@@ -220,6 +231,7 @@ def standardize_value(self, token: Token) -> str:
"""
if token.type in (
TokenType.UNTERM_KEYWORD,
TokenType.BRACKET_OPEN,
TokenType.STATEMENT_START,
TokenType.STATEMENT_END,
TokenType.WORD_OPERATOR,