Support create function statements #296

Merged 6 commits on Nov 1, 2022
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -8,7 +8,10 @@ All notable changes to this project will be documented in this file.

- sqlfmt now supports `delete` statements and the associated keywords `using` and `returning` ([#281](https://github.com/tconbeer/sqlfmt/issues/281))
- sqlfmt now supports `grant` and `revoke` statements and all associated keywords ([#283](https://github.com/tconbeer/sqlfmt/issues/283))
- sqlfmt now supports `create function` statements and all associated keywords ([#282](https://github.com/tconbeer/sqlfmt/issues/282))
- sqlfmt now supports the `explain` keyword ([#280](https://github.com/tconbeer/sqlfmt/issues/280))
- sqlfmt now supports BigQuery typed table and struct definitions and literals, like `table<a int64, b bytes(5), c string>`
- sqlfmt now supports variables like `$foo` as ordinary identifiers

### Features

2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ sqlfmt is not configurable, except for line length. It enforces a single style.

sqlfmt is not a linter. It does not parse your code into an AST; it just lexes it and tracks a small subset of tokens that impact formatting. This lets us "do one thing and do it well:" sqlfmt is very fast, and easier to maintain and extend than linters that need a full SQL grammar.

-For now, sqlfmt only works on `select` statements (which is all you need if you use sqlfmt with a dbt project). In the future, it will be extended to DDL statements, as well.
+For now, sqlfmt only works on `select`, `delete`, `grant`, `revoke`, and `create function` statements (which is all you need if you use sqlfmt with a dbt project). It is being extended to support additional DDL and DML statements. Visit [this tracking issue](https://github.com/tconbeer/sqlfmt/issues/262) for more information.

## Documentation

99 changes: 84 additions & 15 deletions src/sqlfmt/actions.py
@@ -33,7 +33,7 @@ def raise_sqlfmt_bracket_error(
    raw_token = source_string[spos:epos]
    raise SqlfmtBracketError(
        f"Encountered closing bracket '{raw_token}' at position"
-        f" {spos}, before matching opening bracket:"
+        f" {spos}, before matching opening bracket. Context:"
        f" {source_string[spos:spos+50]}"
    )

@@ -71,18 +71,9 @@ def safe_add_node_to_buffer(
    Then create a Node from that token and append it to the Analyzer's buffer
    """
    try:
-        token = Token.from_match(source_string, match, token_type)
-        node = analyzer.node_manager.create_node(
-            token=token, previous_node=analyzer.previous_node
-        )
+        add_node_to_buffer(analyzer, source_string, match, token_type)
    except SqlfmtBracketError:
-        token = Token.from_match(source_string, match, fallback_token_type)
-        node = analyzer.node_manager.create_node(
-            token=token, previous_node=analyzer.previous_node
-        )
-    finally:
-        analyzer.node_buffer.append(node)
-        analyzer.pos = token.epos
+        add_node_to_buffer(analyzer, source_string, match, fallback_token_type)


def add_comment_to_buffer(
@@ -174,6 +165,80 @@ def handle_semicolon(
    )


+def handle_ddl_as(
+    analyzer: "Analyzer",
+    source_string: str,
+    match: re.Match,
+) -> None:
+    """
+    When we hit "as" in a create function or table statement,
+    the following syntax should be parsed using the main (select) rules,
+    unless the next token is a quoted name.
+    """
+    add_node_to_buffer(
+        analyzer=analyzer,
+        source_string=source_string,
+        match=match,
+        token_type=TokenType.UNTERM_KEYWORD,
+    )
+
+    quoted_name_rule = analyzer.get_rule("quoted_name")
+    comment_rule = analyzer.get_rule("comment")
+
+    quoted_name_pattern = rf"({comment_rule.pattern}|\s)*" + quoted_name_rule.pattern
+    quoted_name_match = re.match(
+        quoted_name_pattern, source_string[analyzer.pos :], re.IGNORECASE | re.DOTALL
+    )
+
+    if not quoted_name_match:
+        assert analyzer.rule_stack, (
+            "Internal Error! Open an issue. Could not parse DDL 'as' "
+            f"at pos {analyzer.pos}. Context: "
+            f"{source_string[analyzer.pos :analyzer.pos+50]}"
+        )
+        analyzer.pop_rules()
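
To make the lookahead concrete: handle_ddl_as composes the analyzer's comment and quoted_name patterns into a single anchored regex and peeks at the upcoming source. A minimal sketch of that check, using simplified stand-in patterns (the real rules are considerably more permissive):

import re

# Simplified stand-ins for the analyzer's "comment" and "quoted_name"
# rule patterns; assumed shapes, for illustration only.
comment_pattern = r"--[^\n]*"
quoted_name_pattern = r"'[^']*'"

# The same composition handle_ddl_as builds: any run of comments and
# whitespace, then a quoted name, anchored at the current position.
lookahead = re.compile(
    rf"({comment_pattern}|\s)*" + quoted_name_pattern,
    re.IGNORECASE | re.DOTALL,
)

# A quoted function body follows: keep the current (ddl) ruleset active.
assert lookahead.match("\n-- body follows\n'select 1'") is not None
# A bare select body follows: no match, so we pop back to the main rules.
assert lookahead.match("\nselect 1") is None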


+def handle_closing_angle_bracket(
+    analyzer: "Analyzer",
+    source_string: str,
+    match: re.Match,
+) -> None:
+    """
+    When we hit ">", it could be a closing bracket, the ">" operator,
+    or the first character of another operator, like ">>". We first
+    assume it's a closing bracket; if that raises a lexing error, we
+    match the source again against the operator rule, to get the
+    whole operator token.
+    """
+    try:
+        add_node_to_buffer(
+            analyzer=analyzer,
+            source_string=source_string,
+            match=match,
+            token_type=TokenType.BRACKET_CLOSE,
+        )
+    except SqlfmtBracketError:
+        operator_rule = analyzer.get_rule("operator")
+        operator_pattern = re.compile(
+            r"\s*" + operator_rule.pattern,
+            re.IGNORECASE | re.DOTALL,
+        )
+        operator_match = operator_pattern.match(source_string, analyzer.pos)
+
+        assert operator_match, (
+            "Internal Error! Open an issue. Could not parse closing bracket '>' "
+            f"at pos {analyzer.pos}. Context: "
+            f"{source_string[analyzer.pos :analyzer.pos+10]}"
+        )
+        add_node_to_buffer(
+            analyzer=analyzer,
+            source_string=source_string,
+            match=operator_match,
+            token_type=TokenType.OPERATOR,
+        )
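
Roughly what the fallback re-match recovers, as a sketch (operator_pattern below is a hypothetical, heavily trimmed version of the real "operator" rule, which matches many more tokens):

import re

# Longer alternatives must come first so ">>" wins over a bare ">".
operator_pattern = re.compile(r"\s*(>>|>=|>)", re.DOTALL)

source = "select a >> 2"
pos = 9  # the lexer is sitting on the first ">"

# Treating ">" as BRACKET_CLOSE raises SqlfmtBracketError (no matching
# "<" on the bracket stack), so we re-match with the operator rule and
# consume the whole ">>" in a single OPERATOR token.
m = operator_pattern.match(source, pos)
assert m is not None and m.group(1) == ">>"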


def handle_set_operator(
    analyzer: "Analyzer", source_string: str, match: re.Match
) -> None:
@@ -268,8 +333,7 @@ def lex_ruleset(
"""
Makes a nested call to analyzer.lex, with the new ruleset activated.
"""
rules = sorted(new_ruleset, key=lambda rule: rule.priority)
analyzer.push_rules(rules)
analyzer.push_rules(new_ruleset)
try:
analyzer.lex(source_string)
except stop_exception:
@@ -372,7 +436,12 @@ def simplify_regex(pattern: str) -> str:
    # using the ruleset that was active before jinja
    next_tag_pos = next_tag_match.span(0)[0]
    jinja_rules = analyzer.pop_rules()
-    analyzer.lex(source_string, eof_pos=next_tag_pos)
+    analyzer.stops.append(next_tag_pos)
+    try:
+        analyzer.lex(source_string)
+    except StopIteration:
+        analyzer.stops.pop()

    analyzer.push_rules(jinja_rules)
    # it is possible for the next_tag_match found above to have already been lexed.
    # but if it hasn't, we need to process it
37 changes: 25 additions & 12 deletions src/sqlfmt/analyzer.py
@@ -27,6 +27,7 @@ class Analyzer:
    comment_buffer: List[Comment] = field(default_factory=list)
    line_buffer: List[Line] = field(default_factory=list)
    rule_stack: List[List[Rule]] = field(default_factory=list)
+    stops: List[int] = field(default_factory=list)
    pos: int = 0

    @property
@@ -100,7 +101,7 @@ def parse_query(self, source_string: str) -> Query:

    def push_rules(self, new_rules: List[Rule]) -> None:
        self.rule_stack.append(self.rules.copy())
-        self.rules = new_rules
+        self.rules = sorted(new_rules, key=lambda rule: rule.priority)

    def pop_rules(self) -> List[Rule]:
        old_rules = self.rules
@@ -117,6 +118,28 @@ def get_rule(self, rule_name: str) -> Rule:
        except StopIteration:
            raise ValueError(f"No rule '{rule_name}'")

+    def lex_one(self, source_string: str) -> None:
+        """
+        Try each Rule against the source_string (at self.pos) and
+        apply the action of the first rule that matches.
+
+        Mutates the analyzer's buffers and pos
+        """
+        if self.stops and self.pos >= self.stops[-1]:
+            raise StopIteration
+
+        for rule in self.rules:
+            match = rule.program.match(source_string, self.pos)
+            if match:
+                rule.action(self, source_string, match)
+                return
+        # nothing matched. Either whitespace or an error
+        else:
+            raise SqlfmtParsingError(
+                f"Could not parse SQL at position {self.pos}:"
+                f" '{source_string[self.pos:self.pos+50].strip()}'"
+            )

    def lex(self, source_string: str, eof_pos: int = -1) -> None:
        """
        Repeatedly match Rules to the source_string (until the source_string is
@@ -133,17 +156,7 @@ def lex(self, source_string: str, eof_pos: int = -1) -> None:
        last_loop_pos = -1
        while self.pos < eof_pos and self.pos > last_loop_pos:
            last_loop_pos = self.pos
-            for rule in self.rules:
-                match = rule.program.match(source_string, self.pos)
-                if match:
-                    rule.action(self, source_string, match)
-                    break
-            # nothing matched. Either whitespace or an error
-            else:
-                raise SqlfmtParsingError(
-                    f"Could not parse SQL at position {self.pos}:"
-                    f" '{source_string[self.pos:self.pos+50].strip()}'"
-                )
+            self.lex_one(source_string)

    def search_for_terminating_token(
        self,
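The lex/lex_one split and the new stops list are easier to see in isolation. Below is a minimal, self-contained sketch of that contract (MiniRule and MiniLexer are hypothetical stand-ins; the real Rule also carries a name and a priority, which push_rules now uses for sorting):

import re
from dataclasses import dataclass, field
from typing import Callable, List

@dataclass
class MiniRule:
    program: re.Pattern
    action: Callable[["MiniLexer", str, re.Match], None]

@dataclass
class MiniLexer:
    rules: List[MiniRule]
    pos: int = 0
    stops: List[int] = field(default_factory=list)
    tokens: List[str] = field(default_factory=list)

    def lex_one(self, source: str) -> None:
        # A pushed stop position ends a nested lex early via StopIteration,
        # which is the contract the jinja handler in actions.py relies on.
        if self.stops and self.pos >= self.stops[-1]:
            raise StopIteration
        for rule in self.rules:
            match = rule.program.match(source, self.pos)
            if match:
                rule.action(self, source, match)
                return
        raise ValueError(f"could not lex at position {self.pos}")

def take_word(lexer: MiniLexer, source: str, match: re.Match) -> None:
    lexer.tokens.append(match.group(0).strip())
    lexer.pos = match.end()

source = "one two three"
lexer = MiniLexer(rules=[MiniRule(re.compile(r"\s*\w+"), take_word)])
lexer.stops.append(7)  # stop where " three" begins
try:
    while lexer.pos < len(source):
        lexer.lex_one(source)
except StopIteration:
    lexer.stops.pop()
assert lexer.tokens == ["one", "two"]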
6 changes: 3 additions & 3 deletions src/sqlfmt/line.py
@@ -204,9 +204,9 @@ def starts_with_jinja_statement(self) -> bool:
            return False

    @property
-    def starts_with_square_bracket_operator(self) -> bool:
+    def starts_with_bracket_operator(self) -> bool:
        try:
-            return self.nodes[0].is_square_bracket_operator
+            return self.nodes[0].is_bracket_operator
        except IndexError:
            return False

@@ -237,7 +237,7 @@ def is_standalone_jinja_statement(self) -> bool:
    @property
    def is_standalone_operator(self) -> bool:
        return self._is_standalone_if(
-            self.starts_with_operator and not self.starts_with_square_bracket_operator
+            self.starts_with_operator and not self.starts_with_bracket_operator
        )

    @property
21 changes: 16 additions & 5 deletions src/sqlfmt/node.py
@@ -133,23 +133,34 @@ def is_opening_bracket(self) -> bool:
        )

    @property
-    def is_square_bracket_operator(self) -> bool:
+    def is_bracket_operator(self) -> bool:
        """
        Node is an opening square bracket ("[")
-        that follows a token that could be a name
+        that follows a token that could be a name.
+
+        Alternatively, node is an open paren ("(")
+        that follows a closing angle bracket.
        """
-        if self.token.type != TokenType.BRACKET_OPEN or self.value != "[":
+        if self.token.type != TokenType.BRACKET_OPEN:
            return False

        prev_token, _ = get_previous_token(self.previous_node)
        if not prev_token:
            return False
-        else:
+        elif self.value == "[":
            return prev_token.type in (
                TokenType.NAME,
                TokenType.QUOTED_NAME,
                TokenType.BRACKET_CLOSE,
            )
+        # BQ struct literals have parens that follow closing angle
+        # brackets
+        else:
+            return (
+                self.value == "("
+                and prev_token.type == TokenType.BRACKET_CLOSE
+                and ">" in prev_token.token
+            )

    @property
    def is_closing_bracket(self) -> bool:
@@ -205,7 +216,7 @@ def is_operator(self) -> bool:
                TokenType.SEMICOLON,
            )
            or self.is_multiplication_star
-            or self.is_square_bracket_operator
+            or self.is_bracket_operator
        )

    @property
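For context on the new paren branch: BigQuery typed struct literals put a parenthesized value list immediately after the type's closing angle bracket, as in this illustrative SQL (not taken from this diff):

    select struct<a int64, b string>(1, 'abc')

When the lexer reaches the "(", the previous token is the BRACKET_CLOSE ">" that ends struct<...> and contains ">", so is_bracket_operator returns True; combined with the line.py change above, that keeps the "(" from being treated as a standalone operator when lines are split.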
12 changes: 12 additions & 0 deletions src/sqlfmt/node_manager.py
@@ -96,6 +96,9 @@ def raise_on_mismatched_bracket(self, token: Token, last_bracket: Node) -> None:
"(": ")",
"[": "]",
"case": "end",
"array<": ">",
"table<": ">",
"struct<": ">",
}
if (
last_bracket.token.type
@@ -155,6 +158,14 @@ def whitespace(
            and previous_token.type == TokenType.COLON
        ):
            return NO_SPACE
+        # open brackets that contain `<` are bq type definitions
+        # like `array<` in `array<int64>` and require a space,
+        # unless the preceding token is also an open bracket
+        elif token.type == TokenType.BRACKET_OPEN and "<" in token.token:
+            if previous_token and previous_token.type != TokenType.BRACKET_OPEN:
+                return SPACE
+            else:
+                return NO_SPACE
        # open brackets that follow names are function calls or array indexes.
        # open brackets that follow closing brackets are array indexes.
        # open brackets that follow open brackets are just nested brackets.
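The spacing outcomes this branch encodes, sketched as a tiny predicate with worked examples (token shapes assumed; the real code returns the module's SPACE/NO_SPACE constants):

from typing import Optional

def typed_bracket_needs_leading_space(token_text: str, prev_type: Optional[str]) -> bool:
    # Mirrors the new elif branch: only called for BRACKET_OPEN tokens
    # whose text contains "<", e.g. "array<", "table<", "struct<".
    return prev_type is not None and prev_type != "BRACKET_OPEN"

# "cast(a as array<int64>)": "array<" follows the word operator "as",
# so it takes a leading space.
assert typed_bracket_needs_leading_space("array<", "WORD_OPERATOR")
# "array<struct<a int64>>": the inner "struct<" directly follows the
# open bracket "array<", so it gets no leading space.
assert not typed_bracket_needs_leading_space("struct<", "BRACKET_OPEN")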
@@ -220,6 +231,7 @@ def standardize_value(self, token: Token) -> str:
"""
if token.type in (
TokenType.UNTERM_KEYWORD,
TokenType.BRACKET_OPEN,
TokenType.STATEMENT_START,
TokenType.STATEMENT_END,
TokenType.WORD_OPERATOR,